From e00d24b1835a318e8ff79596d6bf0a07ff225273 Mon Sep 17 00:00:00 2001
From: GNU Libc Maintainers <debian-glibc@lists.debian.org>
Date: Sun, 2 Oct 2022 18:46:25 +0100
Subject: [PATCH] git-updates

GIT update of https://sourceware.org/git/glibc.git/release/2.35/master from glibc-2.35

GIT update of https://sourceware.org/git/glibc.git/release/2.35/master from glibc-2.35


Gbp-Pq: Name git-updates.diff
---
 INSTALL                                       |    6 +
 NEWS                                          |   60 +-
 bits/socket.h                                 |   40 +-
 catgets/open_catalog.c                        |    4 +-
 configure                                     |   79 +-
 configure.ac                                  |   55 +-
 csu/libc-start.c                              |    3 -
 csu/libc-tls.c                                |   11 +-
 debug/tst-fortify.c                           |    5 +
 dlfcn/Makefile                                |    4 +
 dlfcn/dladdr.c                                |    2 +-
 dlfcn/dladdr1.c                               |    2 +-
 dlfcn/dlclose.c                               |    2 +-
 dlfcn/dlerror.c                               |    2 +-
 dlfcn/dlfcn.h                                 |    7 +-
 dlfcn/dlinfo.c                                |   15 +-
 dlfcn/dlmopen.c                               |    2 +-
 dlfcn/dlopen.c                                |    4 +-
 dlfcn/dlopenold.c                             |    2 +-
 dlfcn/dlsym.c                                 |    2 +-
 dlfcn/dlvsym.c                                |    2 +-
 dlfcn/tst-dlinfo-phdr.c                       |  125 ++
 elf/Makefile                                  |  186 +-
 elf/dl-audit.c                                |    3 +-
 elf/dl-deps.c                                 |    2 +
 elf/dl-early_allocate.c                       |   30 +
 elf/dl-find_object.c                          |    5 +-
 elf/dl-hwcaps.c                               |    8 +-
 elf/dl-libc.c                                 |    8 +-
 elf/dl-map-segments.h                         |    3 +
 elf/dl-open.c                                 |   13 +-
 elf/dl-sort-maps.c                            |   48 +-
 elf/dl-support.c                              |   85 +-
 elf/dl-sysdep.c                               |  352 +--
 elf/dso-sort-tests-1.def                      |    7 +
 elf/enbl-secure.c                             |   10 -
 elf/libtracemod1-1.c                          |    1 +
 elf/libtracemod2-1.c                          |    1 +
 elf/libtracemod3-1.c                          |    1 +
 elf/libtracemod4-1.c                          |    1 +
 elf/libtracemod5-1.c                          |    1 +
 elf/rtld.c                                    |   80 +-
 elf/tst-audit26.c                             |   35 +
 elf/tst-auditmod24a.c                         |    4 +-
 elf/tst-auditmod24d.c                         |    4 +-
 elf/tst-auditmod25.c                          |    2 +-
 elf/tst-auditmod26.c                          |  104 +
 elf/tst-dlmopen-twice-mod1.c                  |   37 +
 elf/tst-dlmopen-twice-mod2.c                  |   50 +
 elf/tst-dlmopen-twice.c                       |   34 +
 elf/tst-glibc-hwcaps-cache.script             |    6 +
 elf/tst-glibcelf.py                           |  260 +++
 elf/tst-relro-symbols.py                      |  137 ++
 .../tst-tls-allocation-failure-static.c       |   21 +-
 elf/tst-trace1.exp                            |    4 +
 elf/tst-trace2.exp                            |    6 +
 elf/tst-trace3.exp                            |    6 +
 elf/tst-trace4.exp                            |    6 +
 elf/tst-trace5.exp                            |    6 +
 iconv/gconv_parseconfdir.h                    |   13 +-
 include/arpa/nameser.h                        |  106 +
 include/bits/stdio2-decl.h                    |    1 +
 include/bits/wchar2-decl.h                    |    1 +
 include/libc-internal.h                       |    3 -
 include/register-atfork.h                     |   26 +-
 include/resolv.h                              |    3 +
 include/unistd.h                              |    1 -
 inet/ruserpass.c                              |    4 +-
 io/Makefile                                   |    8 +-
 io/tst-lchmod-time64.c                        |    2 +
 io/tst-lchmod.c                               |   22 +-
 io/tst-stat.c                                 |    4 +
 libio/Makefile                                |    2 +-
 libio/bits/stdio2-decl.h                      |  111 +
 libio/bits/stdio2.h                           |   62 -
 libio/stdio.h                                 |   17 +-
 locale/programs/ld-monetary.c                 |  182 +-
 locale/programs/locarchive.c                  |    2 +-
 localedata/Makefile                           |    4 +-
 localedata/gen-locale.sh                      |   10 +-
 malloc/malloc.c                               |   15 +-
 misc/daemon.c                                 |    5 +-
 misc/getusershell.c                           |    4 +-
 misc/sys/cdefs.h                              |   12 +-
 nptl/allocatestack.c                          |    2 -
 nptl/cancellation.c                           |   50 +-
 nptl/cleanup_defer.c                          |   42 +-
 nptl/descr.h                                  |   41 +-
 nptl/libc-cleanup.c                           |   40 +-
 nptl/pthread_cancel.c                         |  111 +-
 nptl/pthread_join_common.c                    |    7 +-
 nptl/pthread_setcancelstate.c                 |   26 +-
 nptl/pthread_setcanceltype.c                  |   31 +-
 nptl/pthread_testcancel.c                     |    9 +-
 nptl/unwind.c                                 |    2 +-
 nscd/connections.c                            |    3 +-
 nss/Makefile                                  |   26 +-
 nss/XXX-lookup.c                              |    5 +
 nss/nss_database.c                            |   39 +-
 nss/nss_module.c                              |   12 +-
 nss/nss_test_errno.c                          |   58 +
 nss/tst-nss-test_errno.c                      |   61 +
 posix/fork.c                                  |    7 +-
 posix/glob.c                                  |   70 +-
 posix/register-atfork.c                       |  140 +-
 posix/tst-spawn6.c                            |   62 +-
 resolv/Makefile                               |   41 +-
 resolv/README                                 |    3 -
 resolv/mapv4v6addr.h                          |   69 -
 resolv/mapv4v6hostent.h                       |   84 -
 resolv/ns_name_length_uncompressed.c          |   72 +
 resolv/ns_rr_cursor_init.c                    |   62 +
 resolv/ns_rr_cursor_next.c                    |   74 +
 resolv/ns_samebinaryname.c                    |   55 +
 resolv/nss_dns/dns-host.c                     | 1155 ++++------
 resolv/res-name-checking.c                    |   14 +-
 resolv/tst-ns_name_length_uncompressed.c      |  135 ++
 resolv/tst-ns_rr_cursor.c                     |  227 ++
 resolv/tst-ns_samebinaryname.c                |   62 +
 resolv/tst-resolv-aliases.c                   |  254 +++
 resolv/tst-resolv-byaddr.c                    |  326 +++
 resolv/tst-resolv-invalid-cname.c             |  406 ++++
 resolv/tst-resolv-maybe_insert_sig.h          |   32 +
 scripts/dso-ordering-test.py                  |   13 +-
 scripts/glibcelf.py                           | 1141 ++++++++++
 scripts/tst-elf-edit.py                       |   34 +-
 scripts/tst-ld-trace.py                       |  108 +
 shlib-versions                                |    5 -
 socket/Makefile                               |    1 +
 socket/sys/socket.h                           |    2 +-
 socket/tst-cmsghdr-skeleton.c                 |   92 +
 socket/tst-cmsghdr.c                          |   56 +
 stdlib/Makefile                               |    3 +
 stdlib/bits/stdlib.h                          |   14 +-
 stdlib/testmb.c                               |    7 +
 string/bits/string_fortified.h                |    2 +-
 string/test-rawmemchr.c                       |   57 +-
 string/test-strncmp.c                         |   23 +
 sysdeps/generic/ldsodefs.h                    |   11 +-
 sysdeps/generic/libc-lock-arch.h              |   25 +
 sysdeps/generic/startup.h                     |   24 -
 sysdeps/hppa/dl-fptr.c                        |   15 +-
 sysdeps/hppa/dl-lookupcfg.h                   |    9 +-
 sysdeps/hppa/dl-machine.h                     |   60 +-
 sysdeps/hppa/dl-runtime.c                     |    4 +-
 sysdeps/hppa/dl-runtime.h                     |    3 +
 sysdeps/i386/fpu/libm-test-ulps               |    2 +-
 .../i386/i686/fpu/multiarch/libm-test-ulps    |    2 +-
 sysdeps/m68k/dl-machine.h                     |   12 +-
 sysdeps/mach/hurd/bits/socket.h               |   40 +-
 sysdeps/mach/hurd/dl-sysdep.c                 |   30 +-
 sysdeps/mach/hurd/i386/init-first.c           |    4 -
 sysdeps/nios2/dl-machine.h                    |   50 +-
 sysdeps/nptl/dl-tls_init_tp.c                 |    3 -
 sysdeps/nptl/libc-lock.h                      |    8 +-
 sysdeps/nptl/libc-lockP.h                     |    3 +-
 sysdeps/nptl/pthreadP.h                       |    2 +-
 sysdeps/posix/fpathconf.c                     |    4 +-
 sysdeps/posix/isfdtype.c                      |    4 +-
 sysdeps/posix/posix_fallocate.c               |    4 +-
 sysdeps/posix/posix_fallocate64.c             |    4 +-
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S |    4 +-
 sysdeps/pthread/Makefile                      |   40 +-
 sysdeps/pthread/tst-atfork3.c                 |  118 +
 sysdeps/pthread/tst-atfork3mod.c              |   44 +
 sysdeps/pthread/tst-atfork4.c                 |  128 ++
 sysdeps/pthread/tst-atfork4mod.c              |   48 +
 sysdeps/pthread/tst-cancel29.c                |  207 ++
 sysdeps/pthread/tst-cancel30.c                |   82 +
 sysdeps/riscv/rv64/rvd/libm-test-ulps         |    2 +-
 sysdeps/s390/dl-procinfo.c                    |    5 +-
 sysdeps/s390/dl-procinfo.h                    |    2 +-
 sysdeps/s390/s390-64/Makefile                 |   25 +-
 sysdeps/s390/s390-64/configure                |  122 ++
 sysdeps/s390/s390-64/configure.ac             |   92 +
 sysdeps/s390/s390-64/dl-hwcap-check.h         |    6 +-
 sysdeps/s390/s390-64/dl-hwcaps-subdirs.c      |   11 +-
 sysdeps/s390/s390-64/start.S                  |   28 +
 sysdeps/s390/s390-64/tst-glibc-hwcaps.c       |    8 +-
 sysdeps/unix/sysv/linux/Makefile              |    1 +
 .../unix/sysv/linux/aarch64/arch-syscall.h    |    1 +
 sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h  |    2 +
 sysdeps/unix/sysv/linux/alpha/arch-syscall.h  |    1 +
 sysdeps/unix/sysv/linux/alpha/brk_call.h      |   27 +
 sysdeps/unix/sysv/linux/alpha/dl-auxv.h       |   18 +-
 sysdeps/unix/sysv/linux/arc/arch-syscall.h    |    1 +
 sysdeps/unix/sysv/linux/arm/arch-syscall.h    |    1 +
 sysdeps/unix/sysv/linux/bits/socket.h         |   42 +-
 sysdeps/unix/sysv/linux/brk.c                 |    3 +-
 sysdeps/unix/sysv/linux/brk_call.h            |   25 +
 sysdeps/unix/sysv/linux/cmsg_nxthdr.c         |   36 +-
 .../unix/sysv/linux/convert_scm_timestamps.c  |    4 +-
 sysdeps/unix/sysv/linux/csky/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/dl-early_allocate.c   |   82 +
 sysdeps/unix/sysv/linux/dl-parse_auxv.h       |   61 +
 sysdeps/unix/sysv/linux/dl-sysdep.c           |  240 +-
 sysdeps/unix/sysv/linux/faccessat.c           |    4 +-
 sysdeps/unix/sysv/linux/fchmodat.c            |    4 +-
 sysdeps/unix/sysv/linux/getsysstats.c         |   96 +-
 sysdeps/unix/sysv/linux/glob64-time64.c       |    1 +
 sysdeps/unix/sysv/linux/hppa/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/hppa/getcontext.S     |   53 +-
 sysdeps/unix/sysv/linux/hppa/setcontext.S     |    9 +-
 sysdeps/unix/sysv/linux/hppa/swapcontext.S    |   72 +
 sysdeps/unix/sysv/linux/hppa/swapcontext.c    |   41 -
 sysdeps/unix/sysv/linux/i386/Makefile         |    2 +-
 sysdeps/unix/sysv/linux/i386/arch-syscall.h   |    1 +
 .../sysv/linux/i386/libc-do-syscall-int80.S   |   25 +
 .../unix/sysv/linux/i386/libc-do-syscall.S    |    3 -
 sysdeps/unix/sysv/linux/i386/startup.h        |   47 +-
 sysdeps/unix/sysv/linux/i386/sysdep.h         |   13 +-
 sysdeps/unix/sysv/linux/ia64/Makefile         |    6 +
 sysdeps/unix/sysv/linux/ia64/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/ia64/brk.c            |    5 +-
 sysdeps/unix/sysv/linux/ia64/startup.h        |   22 +
 sysdeps/unix/sysv/linux/ia64/sysdep.h         |   23 +-
 sysdeps/unix/sysv/linux/ldsodefs.h            |   12 -
 sysdeps/unix/sysv/linux/m68k/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h |   25 +
 sysdeps/unix/sysv/linux/m68k/sysdep.h         |    4 +-
 .../unix/sysv/linux/microblaze/arch-syscall.h |    1 +
 .../unix/sysv/linux/mips/bits/struct_stat.h   |   38 +-
 .../sysv/linux/mips/mips32/arch-syscall.h     |    1 +
 .../sysv/linux/mips/mips64/n32/arch-syscall.h |    1 +
 .../sysv/linux/mips/mips64/n64/arch-syscall.h |    1 +
 sysdeps/unix/sysv/linux/mmap_call.h           |   22 +
 sysdeps/unix/sysv/linux/mmap_internal.h       |    6 +-
 sysdeps/unix/sysv/linux/mq_timedreceive.c     |    2 +-
 sysdeps/unix/sysv/linux/nios2/arch-syscall.h  |    1 +
 sysdeps/unix/sysv/linux/or1k/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/pathconf.c            |    4 +-
 sysdeps/unix/sysv/linux/powerpc/dl-auxv.h     |   14 +-
 sysdeps/unix/sysv/linux/powerpc/dl-support.c  |    4 +
 .../linux/powerpc/powerpc32/arch-syscall.h    |    1 +
 .../linux/powerpc/powerpc64/arch-syscall.h    |    1 +
 .../unix/sysv/linux/riscv/rv32/arch-syscall.h |    2 +
 .../unix/sysv/linux/riscv/rv64/arch-syscall.h |    2 +
 .../s390/{mmap_internal.h => mmap_call.h}     |   14 +-
 .../sysv/linux/s390/s390-32/arch-syscall.h    |    1 +
 .../sysv/linux/s390/s390-64/arch-syscall.h    |    1 +
 sysdeps/unix/sysv/linux/sh/arch-syscall.h     |    1 +
 sysdeps/unix/sysv/linux/sparc/brk.c           |   58 -
 .../linux/{alpha/brk.c => sparc/brk_call.h}   |   35 +-
 .../sysv/linux/sparc/sparc32/arch-syscall.h   |    1 +
 .../sysv/linux/sparc/sparc64/arch-syscall.h   |    1 +
 sysdeps/unix/sysv/linux/spawni.c              |    2 +-
 sysdeps/unix/sysv/linux/startup.h             |   39 +
 sysdeps/unix/sysv/linux/syscall-names.list    |    5 +-
 sysdeps/unix/sysv/linux/tst-getauxval.c       |   74 +
 sysdeps/unix/sysv/linux/tst-mman-consts.py    |    2 +-
 .../sysv/linux/tst-socket-timestamp-compat.c  |   25 +-
 .../unix/sysv/linux/x86_64/64/arch-syscall.h  |    1 +
 .../unix/sysv/linux/x86_64/x32/arch-syscall.h |    1 +
 sysdeps/x86/Makefile                          |    7 +-
 sysdeps/x86/dl-cacheinfo.h                    |   32 +-
 sysdeps/x86/isa-level.c                       |    3 +-
 sysdeps/x86/sysdep.h                          |    3 +-
 sysdeps/x86/tst-strncmp-rtm.c                 |   58 +-
 sysdeps/x86/tst-wcsncmp-rtm.c                 |   21 +
 sysdeps/x86_64/bzero.S                        |    1 -
 sysdeps/x86_64/dl-machine.h                   |    6 +-
 sysdeps/x86_64/memcmp.S                       |  884 +++++---
 sysdeps/x86_64/memcmpeq.S                     |    2 +-
 sysdeps/x86_64/memrchr.S                      |  613 +++---
 sysdeps/x86_64/memset.S                       |   16 +-
 sysdeps/x86_64/multiarch/Makefile             |  300 ++-
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h       |   35 +
 sysdeps/x86_64/multiarch/avx-vecs.h           |   47 +
 sysdeps/x86_64/multiarch/bcopy.S              |    7 -
 sysdeps/x86_64/multiarch/evex-vecs-common.h   |   39 +
 sysdeps/x86_64/multiarch/evex256-vecs.h       |   35 +
 sysdeps/x86_64/multiarch/evex512-vecs.h       |   35 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |   86 +-
 sysdeps/x86_64/multiarch/ifunc-memcmp.h       |    4 -
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |   19 +-
 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S    |    1 +
 sysdeps/x86_64/multiarch/memchr-avx2.S        |  109 +-
 sysdeps/x86_64/multiarch/memchr-evex.S        |   46 +-
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S  |   98 +-
 sysdeps/x86_64/multiarch/memcmp-sse2.S        |    4 +-
 sysdeps/x86_64/multiarch/memcmp-sse4.S        |  803 -------
 sysdeps/x86_64/multiarch/memcmpeq-sse2.S      |    6 +-
 sysdeps/x86_64/multiarch/memmove-erms.S       |   72 +
 .../multiarch/memmove-vec-unaligned-erms.S    |   79 +-
 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S   |    1 +
 sysdeps/x86_64/multiarch/memrchr-avx2.S       |  534 +++--
 sysdeps/x86_64/multiarch/memrchr-evex.S       |  539 +++--
 .../multiarch/memset-avx2-unaligned-erms.S    |   18 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |   18 +-
 sysdeps/x86_64/multiarch/memset-erms.S        |   44 +
 .../multiarch/memset-evex-unaligned-erms.S    |   18 +-
 .../multiarch/memset-sse2-unaligned-erms.S    |    4 +-
 .../multiarch/memset-vec-unaligned-erms.S     |  223 +-
 sysdeps/x86_64/multiarch/sse2-vecs.h          |   47 +
 .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S  |   15 +
 ...strcasecmp_l-avx.S => strcasecmp_l-avx2.S} |    9 +-
 .../{strncase_l-avx.S => strcasecmp_l-evex.S} |   11 +-
 sysdeps/x86_64/multiarch/strchr-avx2.S        |  204 +-
 sysdeps/x86_64/multiarch/strchr-evex.S        |  146 +-
 sysdeps/x86_64/multiarch/strcmp-avx2.S        | 1762 +++++++++------
 sysdeps/x86_64/multiarch/strcmp-evex.S        | 1945 ++++++++++-------
 sysdeps/x86_64/multiarch/strcmp-sse42.S       |  307 ++-
 sysdeps/x86_64/multiarch/strcmp.c             |    5 +
 sysdeps/x86_64/multiarch/strcspn-c.c          |   83 +-
 .../{strspn-sse2.S => strcspn-sse2.c}         |    8 +-
 sysdeps/x86_64/multiarch/strlen-evex-base.S   |  302 +++
 sysdeps/x86_64/multiarch/strlen-evex512.S     |    7 +
 sysdeps/x86_64/multiarch/strlen-vec.S         |    6 +-
 .../x86_64/multiarch/strncase_l-avx2-rtm.S    |   16 +
 sysdeps/x86_64/multiarch/strncase_l-avx2.S    |   27 +
 sysdeps/x86_64/multiarch/strncase_l-evex.S    |   25 +
 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S   |    1 +
 sysdeps/x86_64/multiarch/strncmp-avx2.S       |    1 +
 sysdeps/x86_64/multiarch/strncmp-sse4_2.S     |    8 +-
 sysdeps/x86_64/multiarch/strnlen-evex512.S    |    4 +
 .../{strcspn-sse2.S => strpbrk-sse2.c}        |    8 +-
 sysdeps/x86_64/multiarch/strrchr-avx2.S       |  426 ++--
 sysdeps/x86_64/multiarch/strrchr-evex.S       |  471 ++--
 sysdeps/x86_64/multiarch/strrchr-sse2.S       |    2 +-
 sysdeps/x86_64/multiarch/strspn-c.c           |   86 +-
 .../{strpbrk-sse2.S => strspn-sse2.c}         |    9 +-
 sysdeps/x86_64/multiarch/strstr-avx512.c      |  218 ++
 sysdeps/x86_64/multiarch/strstr.c             |   24 +-
 sysdeps/x86_64/multiarch/varshift.c           |    5 +-
 sysdeps/x86_64/multiarch/varshift.h           |    3 +-
 sysdeps/x86_64/multiarch/vec-macros.h         |   90 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S     |    4 +
 sysdeps/x86_64/multiarch/wcslen-sse4_1.S      |    1 +
 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S   |    2 +-
 sysdeps/x86_64/multiarch/wcsncmp-avx2.S       |    2 +-
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S    |    5 +
 sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S     |    1 +
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S       |    3 +-
 sysdeps/x86_64/multiarch/wmemcmp-c.c          |    9 -
 sysdeps/x86_64/multiarch/wmemcmp-sse2.S       |   21 +
 sysdeps/x86_64/strcmp.S                       |   64 +-
 sysdeps/x86_64/strcspn.S                      |  119 -
 sysdeps/x86_64/strpbrk.S                      |    3 -
 sysdeps/x86_64/strrchr.S                      |  510 +++--
 sysdeps/x86_64/strspn.S                       |  112 -
 sysdeps/x86_64/sysdep.h                       |   24 +-
 sysdeps/x86_64/wcslen.S                       |   86 +-
 sysdeps/x86_64/wcsrchr.S                      |  268 +--
 sysdeps/x86_64/wmemcmp.S                      |   23 +
 wcsmbs/Makefile                               |    5 +-
 wcsmbs/bits/wchar2-decl.h                     |  124 ++
 wcsmbs/bits/wchar2.h                          |   72 -
 wcsmbs/wchar.h                                |   11 +-
 348 files changed, 15336 insertions(+), 8053 deletions(-)
 create mode 100644 dlfcn/tst-dlinfo-phdr.c
 create mode 100644 elf/dl-early_allocate.c
 create mode 100644 elf/libtracemod1-1.c
 create mode 100644 elf/libtracemod2-1.c
 create mode 100644 elf/libtracemod3-1.c
 create mode 100644 elf/libtracemod4-1.c
 create mode 100644 elf/libtracemod5-1.c
 create mode 100644 elf/tst-audit26.c
 create mode 100644 elf/tst-auditmod26.c
 create mode 100644 elf/tst-dlmopen-twice-mod1.c
 create mode 100644 elf/tst-dlmopen-twice-mod2.c
 create mode 100644 elf/tst-dlmopen-twice.c
 create mode 100644 elf/tst-glibcelf.py
 create mode 100644 elf/tst-relro-symbols.py
 rename sysdeps/mach/hurd/enbl-secure.c => elf/tst-tls-allocation-failure-static.c (57%)
 create mode 100644 elf/tst-trace1.exp
 create mode 100644 elf/tst-trace2.exp
 create mode 100644 elf/tst-trace3.exp
 create mode 100644 elf/tst-trace4.exp
 create mode 100644 elf/tst-trace5.exp
 create mode 100644 include/bits/stdio2-decl.h
 create mode 100644 include/bits/wchar2-decl.h
 create mode 100644 io/tst-lchmod-time64.c
 create mode 100644 libio/bits/stdio2-decl.h
 create mode 100644 nss/nss_test_errno.c
 create mode 100644 nss/tst-nss-test_errno.c
 delete mode 100644 resolv/mapv4v6addr.h
 delete mode 100644 resolv/mapv4v6hostent.h
 create mode 100644 resolv/ns_name_length_uncompressed.c
 create mode 100644 resolv/ns_rr_cursor_init.c
 create mode 100644 resolv/ns_rr_cursor_next.c
 create mode 100644 resolv/ns_samebinaryname.c
 create mode 100644 resolv/tst-ns_name_length_uncompressed.c
 create mode 100644 resolv/tst-ns_rr_cursor.c
 create mode 100644 resolv/tst-ns_samebinaryname.c
 create mode 100644 resolv/tst-resolv-aliases.c
 create mode 100644 resolv/tst-resolv-byaddr.c
 create mode 100644 resolv/tst-resolv-invalid-cname.c
 create mode 100644 resolv/tst-resolv-maybe_insert_sig.h
 create mode 100644 scripts/glibcelf.py
 create mode 100755 scripts/tst-ld-trace.py
 create mode 100644 socket/tst-cmsghdr-skeleton.c
 create mode 100644 socket/tst-cmsghdr.c
 create mode 100644 sysdeps/generic/libc-lock-arch.h
 create mode 100644 sysdeps/pthread/tst-atfork3.c
 create mode 100644 sysdeps/pthread/tst-atfork3mod.c
 create mode 100644 sysdeps/pthread/tst-atfork4.c
 create mode 100644 sysdeps/pthread/tst-atfork4mod.c
 create mode 100644 sysdeps/pthread/tst-cancel29.c
 create mode 100644 sysdeps/pthread/tst-cancel30.c
 create mode 100644 sysdeps/s390/s390-64/configure
 create mode 100644 sysdeps/s390/s390-64/configure.ac
 create mode 100644 sysdeps/unix/sysv/linux/alpha/brk_call.h
 create mode 100644 sysdeps/unix/sysv/linux/brk_call.h
 create mode 100644 sysdeps/unix/sysv/linux/dl-early_allocate.c
 create mode 100644 sysdeps/unix/sysv/linux/dl-parse_auxv.h
 create mode 100644 sysdeps/unix/sysv/linux/hppa/swapcontext.S
 delete mode 100644 sysdeps/unix/sysv/linux/hppa/swapcontext.c
 create mode 100644 sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S
 create mode 100644 sysdeps/unix/sysv/linux/ia64/startup.h
 create mode 100644 sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h
 create mode 100644 sysdeps/unix/sysv/linux/mmap_call.h
 create mode 100644 sysdeps/unix/sysv/linux/powerpc/dl-support.c
 rename sysdeps/unix/sysv/linux/s390/{mmap_internal.h => mmap_call.h} (78%)
 delete mode 100644 sysdeps/unix/sysv/linux/sparc/brk.c
 rename sysdeps/unix/sysv/linux/{alpha/brk.c => sparc/brk_call.h} (61%)
 create mode 100644 sysdeps/unix/sysv/linux/startup.h
 create mode 100644 sysdeps/unix/sysv/linux/tst-getauxval.c
 create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c
 delete mode 100644 sysdeps/x86_64/bzero.S
 create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S
 create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h
 create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memset-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
 rename sysdeps/x86_64/multiarch/{strcasecmp_l-avx.S => strcasecmp_l-avx2.S} (87%)
 rename sysdeps/x86_64/multiarch/{strncase_l-avx.S => strcasecmp_l-evex.S} (83%)
 rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strcspn-sse2.c} (86%)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strpbrk-sse2.c} (85%)
 rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strspn-sse2.c} (84%)
 create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
 create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
 delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S
 delete mode 100644 sysdeps/x86_64/strcspn.S
 delete mode 100644 sysdeps/x86_64/strpbrk.S
 delete mode 100644 sysdeps/x86_64/strspn.S
 create mode 100644 sysdeps/x86_64/wmemcmp.S
 create mode 100644 wcsmbs/bits/wchar2-decl.h

diff --git a/INSTALL b/INSTALL
index 63c022d6b..237a2f948 100644
--- a/INSTALL
+++ b/INSTALL
@@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization.  For example:
      library will still be usable, but functionality may be lost--for
      example, you can't build a shared libc with old binutils.
 
+'--with-default-link'
+     With '--with-default-link', the build system does not use a custom
+     linker script for linking shared objects.  The default is
+     '--without-default-link', because the custom linker script is
+     needed for full RELRO protection.
+
 '--with-nonshared-cflags=CFLAGS'
      Use additional compiler flags CFLAGS to build the parts of the
      library which are always statically linked into applications and
diff --git a/NEWS b/NEWS
index faa7ec187..ff8201b0e 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,58 @@ See the end for copying conditions.
 
 Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
 using `glibc' in the "product" field.
+
+Version 2.35.1
+
+The following bugs are resolved with this release:
+
+  [12154] Do not fail DNS resolution for CNAMEs which are not host names
+  [25812] Libio vtable protection is sometimes only partially enforced
+  [28838] FAIL: elf/tst-p_align3
+  [28846] CMSG_NXTHDR may trigger -Wstrict-overflow warning
+  [28850] linux: __get_nprocs_sched reads uninitialized memory from the stack
+  [28853] libc: tst-spawn6 changes current foreground process group
+    (breaks test isolation)
+  [28857] libc: FAIL: elf/tst-audit24a
+  [28860] build: --enable-kernel=5.1.0 build fails because of missing
+    __convert_scm_timestamps
+  [28865] linux: _SC_NPROCESSORS_CONF and _SC_NPROCESSORS_ONLN are inaccurate
+    without /sys and /proc
+  [28868] dynamic-link: Dynamic loader DFS algorithm segfaults on
+    missing libraries
+  [28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm
+    variants when avoiding overflow
+  [28937] New DSO dependency sorter does not put new map first if in a cycle
+  [28953] nss: Protect against errno changes in function lookup
+  [29029] nptl: poll() spuriously returns EINTR during thread
+    cancellation and with cancellation disabled
+  [29062] elf: Fix memory leak in _dl_find_object_update
+  [29078] <dlfcn.h> functions unusable during early auditing
+  [29097] time: fchmodat does not handle 64 bit time_t for
+    AT_SYMLINK_NOFOLLOW
+  [29109] libc: posix_spawn() always returns 1 (EPERM) on clone()
+    failure
+  [29165] libc: [Regression] broken argv adjustment
+  [29187] dynamic-link: [regression] broken argv adjustment for nios2
+  [29203] libc: daemon is not y2038 aware
+  [29204] libc: getusershell is not 2038 aware
+  [29207] libc: posix_fallocate fallback implementation is not y2038
+  [29208] libc: fpathconf(_PC_ASYNC_IO) is not y2038 aware
+  [29209] libc: isfdtype is not y2038 aware
+  [29210] network: ruserpass is not y2038 aware
+  [29211] libc: __open_catalog is not y2038 aware
+  [29213] libc: gconv_parseconfdir is not y2038 aware
+  [29214] nptl: pthread_setcanceltype fails to set type
+  [29225] network: Mistyped define statement in socket/sys/socket.h in
+    line 184
+  [29305] Conserve NSS buffer space during DNS packet parsing
+  [29415] nscd: Fix netlink cache invalidation if epoll is used
+  [29446] _dlopen now ignores dl_caller argument in static mode
+  [29490] alpha: New __brk_call implementation is broken
+  [29528] elf: Call __libc_early_init for reused namespaces
+  [29537] libc: [2.34 regression]: Alignment issue on m68k when using
+  [29583] Use 64-bit interfaces in gconv_parseconfdir
+
 
 Version 2.35
 
@@ -139,6 +191,9 @@ Major new features:
   fortification balanced against additional runtime cost (checking non-constant
   bounds).
 
+* The audit libraries will avoid unnecessary slowdown if it is not required
+  PLT tracking (by not implementing the la_pltenter or la_pltexit callbacks).
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * On x86-64, the LD_PREFER_MAP_32BIT_EXEC environment variable support
@@ -335,6 +390,8 @@ The following bugs are resolved with this release:
   [28837] libc: FAIL: socket/tst-socket-timestamp-compat
   [28847] locale: Empty mon_decimal_point in LC_MONETARY results in non-
     empty mon_decimal_point_wc
+  [29069] libc: fstatat64_time64_statx wrapper broken on MIPS N32 with
+    -D_FILE_OFFSET_BITS=64 and -D_TIME_BITS=64
 
 
 Version 2.34
@@ -431,9 +488,6 @@ Major new features:
   execute programs that do not have any dynamic dependency (that is,
   they are statically linked).  This feature is Linux-specific.
 
-* The audit libraries will avoid unnecessary slowdown if it is not required
-  PLT tracking (by not implementing the la_pltenter or la_pltexit callbacks).
-
 Deprecated and removed features, and other changes affecting compatibility:
 
 * The function pthread_mutex_consistent_np has been deprecated; programs
diff --git a/bits/socket.h b/bits/socket.h
index 2b99dea33..aac8c49b0 100644
--- a/bits/socket.h
+++ b/bits/socket.h
@@ -245,6 +245,12 @@ struct cmsghdr
 			 + CMSG_ALIGN (sizeof (struct cmsghdr)))
 #define CMSG_LEN(len)   (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
 
+/* Given a length, return the additional padding necessary such that
+   len + __CMSG_PADDING(len) == CMSG_ALIGN (len).  */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+                              - ((len) & (sizeof (size_t) - 1))) \
+                             & (sizeof (size_t) - 1))
+
 extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 				      struct cmsghdr *__cmsg) __THROW;
 #ifdef __USE_EXTERN_INLINES
@@ -254,18 +260,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 _EXTERN_INLINE struct cmsghdr *
 __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
 {
+  /* We may safely assume that __cmsg lies between __mhdr->msg_control and
+     __mhdr->msg_controllen because the user is required to obtain the first
+     cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+     via CMSG_NXTHDR, setting lengths along the way.  However, we don't yet
+     trust the value of __cmsg->cmsg_len and therefore do not use it in any
+     pointer arithmetic until we check its value.  */
+
+  unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+  unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+  size_t __size_needed = sizeof (struct cmsghdr)
+                         + __CMSG_PADDING (__cmsg->cmsg_len);
+
+  /* The current header is malformed, too small to be a full header.  */
   if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
-    /* The kernel header does this so there may be a reason.  */
     return (struct cmsghdr *) 0;
 
+  /* There isn't enough space between __cmsg and the end of the buffer to
+  hold the current cmsg *and* the next one.  */
+  if (((size_t)
+         (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+       < __size_needed)
+      || ((size_t)
+            (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+             - __size_needed)
+          < __cmsg->cmsg_len))
+
+    return (struct cmsghdr *) 0;
+
+  /* Now, we trust cmsg_len and can use it to find the next header.  */
   __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
 			       + CMSG_ALIGN (__cmsg->cmsg_len));
-  if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
-					+ __mhdr->msg_controllen)
-      || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
-	  > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
-    /* No more entries.  */
-    return (struct cmsghdr *) 0;
   return __cmsg;
 }
 #endif	/* Use `extern inline'.  */
diff --git a/catgets/open_catalog.c b/catgets/open_catalog.c
index 48c2a4b98..cb1d123cd 100644
--- a/catgets/open_catalog.c
+++ b/catgets/open_catalog.c
@@ -39,7 +39,7 @@ __open_catalog (const char *cat_name, const char *nlspath, const char *env_var,
 		__nl_catd catalog)
 {
   int fd = -1;
-  struct stat64 st;
+  struct __stat64_t64 st;
   int swapping;
   size_t cnt;
   size_t max_offset;
@@ -193,7 +193,7 @@ __open_catalog (const char *cat_name, const char *nlspath, const char *env_var,
       return -1;
     }
 
-  if (__builtin_expect (__fstat64 (fd, &st), 0) < 0)
+  if (__glibc_unlikely (__fstat64_time64 (fd, &st) < 0))
     goto close_unlock_return;
 
   if (__builtin_expect (!S_ISREG (st.st_mode), 0)
diff --git a/configure b/configure
index 00dc63838..443cf6ef9 100755
--- a/configure
+++ b/configure
@@ -730,7 +730,6 @@ infodir
 docdir
 oldincludedir
 includedir
-runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -845,7 +844,6 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
-runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1098,15 +1096,6 @@ do
   | -silent | --silent | --silen | --sile | --sil)
     silent=yes ;;
 
-  -runstatedir | --runstatedir | --runstatedi | --runstated \
-  | --runstate | --runstat | --runsta | --runst | --runs \
-  | --run | --ru | --r)
-    ac_prev=runstatedir ;;
-  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
-  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
-  | --run=* | --ru=* | --r=*)
-    runstatedir=$ac_optarg ;;
-
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
     ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1244,7 +1233,7 @@ fi
 for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
 		datadir sysconfdir sharedstatedir localstatedir includedir \
 		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-		libdir localedir mandir runstatedir
+		libdir localedir mandir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1397,7 +1386,6 @@ Fine tuning of the installation directories:
   --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
-  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIR            object code libraries [EPREFIX/lib]
   --includedir=DIR        C header files [PREFIX/include]
   --oldincludedir=DIR     C header files for non-gcc [/usr/include]
@@ -3388,7 +3376,7 @@ fi
 if test "${with_default_link+set}" = set; then :
   withval=$with_default_link; use_default_link=$withval
 else
-  use_default_link=default
+  use_default_link=no
 fi
 
 
@@ -6235,69 +6223,6 @@ fi
 $as_echo "$libc_cv_hashstyle" >&6; }
 
 
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
-$as_echo_n "checking for sufficient default -shared layout... " >&6; }
-if ${libc_cv_use_default_link+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-    libc_cv_use_default_link=no
-  cat > conftest.s <<\EOF
-	  .section .note.a,"a",%note
-	  .balign 4
-	  .long 4,4,9
-	  .string "GNU"
-	  .string "foo"
-	  .section .note.b,"a",%note
-	  .balign 4
-	  .long 4,4,9
-	  .string "GNU"
-	  .string "bar"
-EOF
-  if { ac_try='  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; } &&
-       ac_try=`$READELF -S conftest.so | sed -n \
-	 '${x;p;}
-	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
-	  t a
-	  b
-	  : a
-	  H'`
-  then
-    libc_seen_a=no libc_seen_b=no
-    set -- $ac_try
-    while test $# -ge 2 -a "$1" = NOTE; do
-      case "$2" in
-      .note.a) libc_seen_a=yes ;;
-      .note.b) libc_seen_b=yes ;;
-      esac
-      shift 2
-    done
-    case "$libc_seen_a$libc_seen_b" in
-    yesyes)
-      libc_cv_use_default_link=yes
-      ;;
-    *)
-      echo >&5 "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
-      ;;
-    esac
-  fi
-  rm -f conftest*
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
-$as_echo "$libc_cv_use_default_link" >&6; }
-  use_default_link=$libc_cv_use_default_link
-fi
-
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
 $as_echo_n "checking for GLOB_DAT reloc... " >&6; }
 if ${libc_cv_has_glob_dat+:} false; then :
diff --git a/configure.ac b/configure.ac
index 87f67d25e..228261a49 100644
--- a/configure.ac
+++ b/configure.ac
@@ -153,7 +153,7 @@ AC_ARG_WITH([default-link],
 	    AS_HELP_STRING([--with-default-link],
 			   [do not use explicit linker scripts]),
 	    [use_default_link=$withval],
-	    [use_default_link=default])
+	    [use_default_link=no])
 
 dnl Additional build flags injection.
 AC_ARG_WITH([nonshared-cflags],
@@ -1402,59 +1402,6 @@ fi
 rm -f conftest*])
 AC_SUBST(libc_cv_hashstyle)
 
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
-  AC_CACHE_CHECK([for sufficient default -shared layout],
-		  libc_cv_use_default_link, [dnl
-  libc_cv_use_default_link=no
-  cat > conftest.s <<\EOF
-	  .section .note.a,"a",%note
-	  .balign 4
-	  .long 4,4,9
-	  .string "GNU"
-	  .string "foo"
-	  .section .note.b,"a",%note
-	  .balign 4
-	  .long 4,4,9
-	  .string "GNU"
-	  .string "bar"
-EOF
-  if AC_TRY_COMMAND([dnl
-  ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
-       ac_try=`$READELF -S conftest.so | sed -n \
-	 ['${x;p;}
-	  s/^ *\[ *[1-9][0-9]*\]  *\([^ ][^ ]*\)  *\([^ ][^ ]*\) .*$/\2 \1/
-	  t a
-	  b
-	  : a
-	  H']`
-  then
-    libc_seen_a=no libc_seen_b=no
-    set -- $ac_try
-    while test $# -ge 2 -a "$1" = NOTE; do
-      case "$2" in
-      .note.a) libc_seen_a=yes ;;
-      .note.b) libc_seen_b=yes ;;
-      esac
-      shift 2
-    done
-    case "$libc_seen_a$libc_seen_b" in
-    yesyes)
-      libc_cv_use_default_link=yes
-      ;;
-    *)
-      echo >&AS_MESSAGE_LOG_FD "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
-      ;;
-    esac
-  fi
-  rm -f conftest*])
-  use_default_link=$libc_cv_use_default_link
-fi
-
 AC_CACHE_CHECK(for GLOB_DAT reloc,
 	       libc_cv_has_glob_dat, [dnl
 cat > conftest.c <<EOF
diff --git a/csu/libc-start.c b/csu/libc-start.c
index e91f99642..b34bb6dbb 100644
--- a/csu/libc-start.c
+++ b/csu/libc-start.c
@@ -285,9 +285,6 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
         }
     }
 
-  /* Initialize very early so that tunables can use it.  */
-  __libc_init_secure ();
-
   __tunables_init (__environ);
 
   ARCH_INIT_CPU_FEATURES ();
diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index bef92a756..0a216c550 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -145,11 +145,16 @@ __libc_setup_tls (void)
      _dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign
      and dl_tls_static_align.  */
   tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align);
-  tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+  tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+  if (tlsblock == NULL)
+    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
 #elif TLS_DTV_AT_TP
   tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1);
-  tlsblock = __sbrk (tcb_offset + memsz + max_align
-		     + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus));
+  tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align
+				 + TLS_PRE_TCB_SIZE
+				 + GLRO(dl_tls_static_surplus));
+  if (tlsblock == NULL)
+    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
   tlsblock += TLS_PRE_TCB_SIZE;
 #else
   /* In case a model with a different layout for the TCB and DTV
diff --git a/debug/tst-fortify.c b/debug/tst-fortify.c
index d65a2fe6e..03c986771 100644
--- a/debug/tst-fortify.c
+++ b/debug/tst-fortify.c
@@ -1504,6 +1504,11 @@ do_test (void)
       CHK_FAIL_END
 #endif
 
+      /* Bug 29030 regresion check */
+      cp = "HelloWorld";
+      if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
+        FAIL ();
+
       cp = "A";
       if (mbstowcs (wenough, cp, 10) != 1
 	  || wcscmp (wenough, L"A") != 0)
diff --git a/dlfcn/Makefile b/dlfcn/Makefile
index 6e0a014d9..3255ba02c 100644
--- a/dlfcn/Makefile
+++ b/dlfcn/Makefile
@@ -73,6 +73,10 @@ tststatic3-ENV = $(tststatic-ENV)
 tststatic4-ENV = $(tststatic-ENV)
 tststatic5-ENV = $(tststatic-ENV)
 
+tests-internal += \
+  tst-dlinfo-phdr \
+  # tests-internal
+
 ifneq (,$(CXX))
 modules-names += bug-atexit3-lib
 else
diff --git a/dlfcn/dladdr.c b/dlfcn/dladdr.c
index ead117326..d188d0e28 100644
--- a/dlfcn/dladdr.c
+++ b/dlfcn/dladdr.c
@@ -24,7 +24,7 @@ int
 __dladdr (const void *address, Dl_info *info)
 {
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dladdr (address, info);
 #endif
   return _dl_addr (address, info, NULL, NULL);
diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c
index 5dadfd122..e0c9526c9 100644
--- a/dlfcn/dladdr1.c
+++ b/dlfcn/dladdr1.c
@@ -24,7 +24,7 @@ int
 __dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
 {
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
 #endif
 
diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c
index a9921c316..aab88c47f 100644
--- a/dlfcn/dlclose.c
+++ b/dlfcn/dlclose.c
@@ -24,7 +24,7 @@ int
 __dlclose (void *handle)
 {
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlclose (handle);
 #endif
 
diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
index 3bf6049e3..b899d252a 100644
--- a/dlfcn/dlerror.c
+++ b/dlfcn/dlerror.c
@@ -32,7 +32,7 @@ char *
 __dlerror (void)
 {
 # ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlerror ();
 # endif
 
diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h
index b5cd5c523..a3af6051d 100644
--- a/dlfcn/dlfcn.h
+++ b/dlfcn/dlfcn.h
@@ -164,7 +164,12 @@ enum
        segment, or if the calling thread has not allocated a block for it.  */
     RTLD_DI_TLS_DATA = 10,
 
-    RTLD_DI_MAX = 10
+    /* Treat ARG as const ElfW(Phdr) **, and store the address of the
+       program header array at that location.  The dlinfo call returns
+       the number of program headers in the array.  */
+    RTLD_DI_PHDR = 11,
+
+    RTLD_DI_MAX = 11
   };
 
 
diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c
index fc63c0268..0fbe670d6 100644
--- a/dlfcn/dlinfo.c
+++ b/dlfcn/dlinfo.c
@@ -28,6 +28,10 @@ struct dlinfo_args
   void *handle;
   int request;
   void *arg;
+
+  /* This is the value that is returned from dlinfo if no error is
+     signaled.  */
+  int result;
 };
 
 static void
@@ -40,6 +44,7 @@ dlinfo_doit (void *argsblock)
     {
     case RTLD_DI_CONFIGADDR:
     default:
+      args->result = -1;
       _dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request"));
       break;
 
@@ -75,6 +80,11 @@ dlinfo_doit (void *argsblock)
 	*(void **) args->arg = data;
 	break;
       }
+
+    case RTLD_DI_PHDR:
+      *(const ElfW(Phdr) **) args->arg = l->l_phdr;
+      args->result = l->l_phnum;
+      break;
     }
 }
 
@@ -82,14 +92,15 @@ static int
 dlinfo_implementation (void *handle, int request, void *arg)
 {
   struct dlinfo_args args = { handle, request, arg };
-  return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0;
+  _dlerror_run (&dlinfo_doit, &args);
+  return args.result;
 }
 
 #ifdef SHARED
 int
 ___dlinfo (void *handle, int request, void *arg)
 {
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
   else
     return dlinfo_implementation (handle, request, arg);
diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c
index 2437f5ce2..b41778f16 100644
--- a/dlfcn/dlmopen.c
+++ b/dlfcn/dlmopen.c
@@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode,
 void *
 ___dlmopen (Lmid_t nsid, const char *file, int mode)
 {
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
   else
     return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c
index 846ca3833..9b07b4e13 100644
--- a/dlfcn/dlopen.c
+++ b/dlfcn/dlopen.c
@@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller)
 void *
 ___dlopen (const char *file, int mode)
 {
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
   else
     return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
@@ -90,7 +90,7 @@ compat_symbol (libdl, ___dlopen, dlopen, GLIBC_2_1);
 void *
 __dlopen (const char *file, int mode, void *dl_caller)
 {
-  return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
+  return dlopen_implementation (file, mode, dl_caller);
 }
 
 void *
diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c
index 67601434d..5c21a0049 100644
--- a/dlfcn/dlopenold.c
+++ b/dlfcn/dlopenold.c
@@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode)
     mode |= RTLD_LAZY;
   args.mode = mode;
 
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
 
   return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c
index a71f8ae24..2e9ff98e7 100644
--- a/dlfcn/dlsym.c
+++ b/dlfcn/dlsym.c
@@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller)
 void *
 ___dlsym (void *handle, const char *name)
 {
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
   else
     return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c
index 72219d6da..caa46ba1e 100644
--- a/dlfcn/dlvsym.c
+++ b/dlfcn/dlvsym.c
@@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version,
 void *
 ___dlvsym (void *handle, const char *name, const char *version)
 {
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
 					 RETURN_ADDRESS (0));
   else
diff --git a/dlfcn/tst-dlinfo-phdr.c b/dlfcn/tst-dlinfo-phdr.c
new file mode 100644
index 000000000..a15a7d48e
--- /dev/null
+++ b/dlfcn/tst-dlinfo-phdr.c
@@ -0,0 +1,125 @@
+/* Test for dlinfo (RTLD_DI_PHDR).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/auxv.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+/* Used to verify that the program header array appears as expected
+   among the dl_iterate_phdr callback invocations.  */
+
+struct dlip_callback_args
+{
+  struct link_map *l;           /* l->l_addr is used to find the object.  */
+  const ElfW(Phdr) *phdr;       /* Expected program header pointed.  */
+  int phnum;                    /* Expected program header count.  */
+  bool found;                   /* True if l->l_addr has been found.  */
+};
+
+static int
+dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure)
+{
+  TEST_COMPARE (sizeof (*dlpi), size);
+  struct dlip_callback_args *args = closure;
+
+  if (dlpi->dlpi_addr == args->l->l_addr)
+    {
+      TEST_VERIFY (!args->found);
+      args->found = true;
+      TEST_VERIFY (args->phdr == dlpi->dlpi_phdr);
+      TEST_COMPARE (args->phnum, dlpi->dlpi_phnum);
+    }
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  /* Avoid a copy relocation.  */
+  struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug");
+  struct link_map *l = (struct link_map *) debug->r_map;
+  TEST_VERIFY_EXIT (l != NULL);
+
+  do
+    {
+      printf ("info: checking link map %p (%p) for \"%s\"\n",
+              l, l->l_phdr, l->l_name);
+
+      /* Cause dlerror () to return an error message.  */
+      dlsym (RTLD_DEFAULT, "does-not-exist");
+
+      /* Use the extension that link maps are valid dlopen handles.  */
+      const ElfW(Phdr) *phdr;
+      int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr);
+      TEST_VERIFY (phnum >= 0);
+      /* Verify that the error message has been cleared.  */
+      TEST_COMPARE_STRING (dlerror (), NULL);
+
+      TEST_VERIFY (phdr == l->l_phdr);
+      TEST_COMPARE (phnum, l->l_phnum);
+
+      /* Check that we can find PT_DYNAMIC among the array.  */
+      {
+        bool dynamic_found = false;
+        for (int i = 0; i < phnum; ++i)
+          if (phdr[i].p_type == PT_DYNAMIC)
+            {
+              dynamic_found = true;
+              TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr);
+            }
+        TEST_VERIFY (dynamic_found);
+      }
+
+      /* Check that dl_iterate_phdr finds the link map with the same
+         program headers.  */
+      {
+        struct dlip_callback_args args =
+          {
+            .l =  l,
+            .phdr = phdr,
+            .phnum = phnum,
+            .found = false,
+          };
+        TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0);
+        TEST_VERIFY (args.found);
+      }
+
+      if (l->l_prev == NULL)
+        {
+          /* This is the executable, so the information is also
+             available via getauxval.  */
+          TEST_COMPARE_STRING (l->l_name, "");
+          TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR));
+          TEST_COMPARE (phnum, getauxval (AT_PHNUM));
+        }
+
+      l = l->l_next;
+    }
+  while (l != NULL);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/Makefile b/elf/Makefile
index 5bdf0a383..bf2550472 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -33,6 +33,7 @@ routines = \
   $(all-dl-routines) \
   dl-addr \
   dl-addr-obj \
+  dl-early_allocate \
   dl-error \
   dl-iteratephdr \
   dl-libc \
@@ -108,6 +109,7 @@ all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
 # But they are absent from the shared libc, because that code is in ld.so.
 elide-routines.os = \
   $(all-dl-routines) \
+  dl-early_allocate \
   dl-exception \
   dl-origin \
   dl-reloc-static-pie \
@@ -161,6 +163,11 @@ ifeq (yes,$(have-loop-to-function))
 CFLAGS-rtld.c += -fno-tree-loop-distribute-patterns
 endif
 
+ifeq (yes,$(have-loop-to-function))
+# Likewise, during static library startup, memset is not yet available.
+CFLAGS-dl-support.c = -fno-tree-loop-distribute-patterns
+endif
+
 # Compile rtld itself without stack protection.
 # Also compile all routines in the static library that are elided from
 # the shared libc because they are in libc.a in the same way.
@@ -272,6 +279,7 @@ tests-static-normal := \
   tst-linkall-static \
   tst-single_threaded-pthread-static \
   tst-single_threaded-static \
+  tst-tls-allocation-failure-static \
   tst-tlsalign-extern-static \
   tst-tlsalign-static \
   # tests-static-normal
@@ -366,6 +374,8 @@ tests += \
   tst-align \
   tst-align2 \
   tst-align3 \
+  tst-audit-tlsdesc \
+  tst-audit-tlsdesc-dlopen \
   tst-audit1 \
   tst-audit2 \
   tst-audit8 \
@@ -386,6 +396,7 @@ tests += \
   tst-audit24d \
   tst-audit25a \
   tst-audit25b \
+  tst-audit26 \
   tst-auditmany \
   tst-auxobj \
   tst-auxobj-dlopen \
@@ -399,6 +410,7 @@ tests += \
   tst-dlmopen4 \
   tst-dlmopen-dlerror \
   tst-dlmopen-gethostbyname \
+  tst-dlmopen-twice \
   tst-dlopenfail \
   tst-dlopenfail-2 \
   tst-dlopenrpath \
@@ -539,6 +551,39 @@ endif
 endif
 endif
 
+tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
+$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+  $(objpfx)ld.so
+	$(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
+	  --required=_rtld_global_ro \
+	  > $@ 2>&1; $(evaluate-test)
+# The optional symbols are present in libc only if the architecture has
+# the GLIBC_2.0 symbol set in libc.
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+  $(common-objpfx)libc.so
+	$(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
+	    --required=_IO_cookie_jumps \
+	    --required=_IO_file_jumps \
+	    --required=_IO_file_jumps_maybe_mmap \
+	    --required=_IO_file_jumps_mmap \
+	    --required=_IO_helper_jumps \
+	    --required=_IO_mem_jumps \
+	    --required=_IO_obstack_jumps \
+	    --required=_IO_proc_jumps \
+	    --required=_IO_str_chk_jumps \
+	    --required=_IO_str_jumps \
+	    --required=_IO_strn_jumps \
+	    --required=_IO_wfile_jumps \
+	    --required=_IO_wfile_jumps_maybe_mmap \
+	    --required=_IO_wfile_jumps_mmap \
+	    --required=_IO_wmem_jumps \
+	    --required=_IO_wstr_jumps \
+	    --required=_IO_wstrn_jumps \
+	    --optional=_IO_old_cookie_jumps \
+	    --optional=_IO_old_file_jumps \
+	    --optional=_IO_old_proc_jumps \
+	  > $@ 2>&1; $(evaluate-test)
+
 ifeq ($(run-built-tests),yes)
 tests-special += $(objpfx)tst-valgrind-smoke.out
 endif
@@ -613,6 +658,16 @@ modules-names = \
   libmarkermod4-2 \
   libmarkermod4-3 \
   libmarkermod4-4 \
+  libmarkermod5-1 \
+  libmarkermod5-2 \
+  libmarkermod5-3 \
+  libmarkermod5-4 \
+  libmarkermod5-5 \
+  libtracemod1-1 \
+  libtracemod2-1 \
+  libtracemod3-1 \
+  libtracemod4-1 \
+  libtracemod5-1 \
   ltglobmod1 \
   ltglobmod2 \
   neededobj1 \
@@ -674,6 +729,8 @@ modules-names = \
   tst-alignmod3 \
   tst-array2dep \
   tst-array5dep \
+  tst-audit-tlsdesc-mod1 \
+  tst-audit-tlsdesc-mod2 \
   tst-audit11mod1 \
   tst-audit11mod2 \
   tst-audit12mod1 \
@@ -707,6 +764,7 @@ modules-names = \
   tst-auditmanymod7 \
   tst-auditmanymod8 \
   tst-auditmanymod9 \
+  tst-auditmod-tlsdesc  \
   tst-auditmod1 \
   tst-auditmod9a \
   tst-auditmod9b \
@@ -725,6 +783,7 @@ modules-names = \
   tst-auditmod24c \
   tst-auditmod24d \
   tst-auditmod25 \
+  tst-auditmod26 \
   tst-auxvalmod \
   tst-big-note-lib \
   tst-deep1mod1 \
@@ -742,6 +801,8 @@ modules-names = \
   tst-dlmopen1mod \
   tst-dlmopen-dlerror-mod \
   tst-dlmopen-gethostbyname-mod \
+  tst-dlmopen-twice-mod1 \
+  tst-dlmopen-twice-mod2 \
   tst-dlopenfaillinkmod \
   tst-dlopenfailmod1 \
   tst-dlopenfailmod2 \
@@ -895,23 +956,8 @@ modules-names += tst-gnu2-tls1mod
 $(objpfx)tst-gnu2-tls1: $(objpfx)tst-gnu2-tls1mod.so
 tst-gnu2-tls1mod.so-no-z-defs = yes
 CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=gnu2
+endif # $(have-mtls-dialect-gnu2)
 
-tests += tst-audit-tlsdesc tst-audit-tlsdesc-dlopen
-modules-names += tst-audit-tlsdesc-mod1 tst-audit-tlsdesc-mod2 tst-auditmod-tlsdesc
-$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
-			    $(objpfx)tst-audit-tlsdesc-mod2.so \
-			    $(shared-thread-library)
-CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2
-CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2
-$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library)
-$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \
-				       $(objpfx)tst-audit-tlsdesc-mod2.so
-$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so
-$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
-tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
-$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
-tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
-endif
 ifeq (yes,$(have-protected-data))
 modules-names += tst-protected1moda tst-protected1modb
 tests += tst-protected1a tst-protected1b
@@ -943,7 +989,7 @@ extra-test-objs += $(addsuffix .os,$(strip $(modules-names)))
 # filtmod1.so, tst-big-note-lib.so, tst-ro-dynamic-mod.so have special
 # rules.
 modules-names-nobuild := filtmod1 tst-big-note-lib tst-ro-dynamic-mod \
-			 tst-audit24bmod1 tst-audit24bmod2.so
+			 tst-audit24bmod1 tst-audit24bmod2
 
 tests += $(tests-static)
 
@@ -1072,6 +1118,11 @@ tests-special += \
   $(objpfx)tst-initorder2-cmp.out \
   $(objpfx)tst-unused-dep-cmp.out \
   $(objpfx)tst-unused-dep.out \
+  $(objpfx)tst-trace1.out \
+  $(objpfx)tst-trace2.out \
+  $(objpfx)tst-trace3.out \
+  $(objpfx)tst-trace4.out \
+  $(objpfx)tst-trace5.out \
   # tests-special
 endif
 
@@ -1111,6 +1162,17 @@ CFLAGS-tst-prelink.c += -fno-pie
 tst-prelink-no-pie = yes
 endif
 
+tests-special += $(objpfx)tst-glibcelf.out
+$(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \
+  $(..)/scripts/glibcextract.py
+	PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcelf.py \
+          --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \
+	  < /dev/null > $@ 2>&1; $(evaluate-test)
+
+ifeq ($(run-built-tests),yes)
+tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out
+endif
+
 # The test requires shared _and_ PIE because the executable
 # unit test driver must be able to link with the shared object
 # that is going to eventually go into an installed DSO.
@@ -1234,8 +1296,7 @@ $(objpfx)ld.so: $(objpfx)librtld.os $(ld-map)
 	$(LINK.o) -nostdlib -nostartfiles -shared -o $@.new		\
 		  $(LDFLAGS-rtld) -Wl,-z,defs $(z-now-$(bind-now))	\
 		  $(filter-out $(map-file),$^) $(load-map-file)		\
-		  -Wl,-soname=$(rtld-installed-name)			\
-		  -Wl,-defsym=_begin=0
+		  -Wl,-soname=$(rtld-installed-name)
 	$(call after-link,$@.new)
 	$(READELF) -s $@.new \
 	  | $(AWK) '($$7 ~ /^UND(|EF)$$/ && $$1 != "0:" && $$4 != "REGISTER") { print; p=1 } END { exit p != 0 }'
@@ -2210,7 +2271,7 @@ $(objpfx)tst-audit24c.out: $(objpfx)tst-auditmod24c.so
 $(objpfx)tst-audit24c: $(objpfx)tst-audit24amod1.so \
 		       $(objpfx)tst-audit24amod2.so
 tst-audit24c-ENV = LD_BIND_NOW=1 LD_AUDIT=$(objpfx)tst-auditmod24c.so
-LDFLAGS-tst-audit24b = -Wl,-z,lazy
+LDFLAGS-tst-audit24c = -Wl,-z,lazy
 
 $(objpfx)tst-audit24d.out: $(objpfx)tst-auditmod24d.so
 $(objpfx)tst-audit24d: $(objpfx)tst-audit24dmod1.so \
@@ -2242,6 +2303,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \
 LDFLAGS-tst-audit25b = -Wl,-z,now
 tst-audit25b-ARGS = -- $(host-test-program-cmd)
 
+$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
+$(objpfx)tst-auditmod26.so: $(libsupport)
+tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
+
 # tst-sonamemove links against an older implementation of the library.
 LDFLAGS-tst-sonamemove-linkmod1.so = \
   -Wl,--version-script=tst-sonamemove-linkmod1.map \
@@ -2509,6 +2574,7 @@ LDFLAGS-libmarkermod1-1.so += -Wl,-soname,libmarkermod1.so
 LDFLAGS-libmarkermod2-1.so += -Wl,-soname,libmarkermod2.so
 LDFLAGS-libmarkermod3-1.so += -Wl,-soname,libmarkermod3.so
 LDFLAGS-libmarkermod4-1.so += -Wl,-soname,libmarkermod4.so
+LDFLAGS-libmarkermod5-1.so += -Wl,-soname,libmarkermod5.so
 $(objpfx)libmarkermod%.os : markermodMARKER-VALUE.c
 	$(compile-command.c) \
 	  -DMARKER=marker$(firstword $(subst -, ,$*)) \
@@ -2521,6 +2587,8 @@ $(objpfx)libmarkermod3.so: $(objpfx)libmarkermod3-1.so
 	cp $< $@
 $(objpfx)libmarkermod4.so: $(objpfx)libmarkermod4-1.so
 	cp $< $@
+$(objpfx)libmarkermod5.so: $(objpfx)libmarkermod5-1.so
+	cp $< $@
 
 # tst-glibc-hwcaps-prepend checks that --glibc-hwcaps-prepend is
 # preferred over auto-detected subdirectories.
@@ -2733,3 +2801,81 @@ $(objpfx)tst-p_align3: $(objpfx)tst-p_alignmod3.so
 $(objpfx)tst-p_align3.out: tst-p_align3.sh $(objpfx)tst-p_align3
 	$(SHELL) $< $(common-objpfx) '$(test-program-prefix)'; \
 	$(evaluate-test)
+
+LDFLAGS-libtracemod1-1.so += -Wl,-soname,libtracemod1.so
+LDFLAGS-libtracemod2-1.so += -Wl,-soname,libtracemod2.so
+LDFLAGS-libtracemod3-1.so += -Wl,-soname,libtracemod3.so
+LDFLAGS-libtracemod4-1.so += -Wl,-soname,libtracemod4.so
+LDFLAGS-libtracemod5-1.so += -Wl,-soname,libtracemod5.so
+
+$(objpfx)libtracemod1-1.so: $(objpfx)libtracemod2-1.so \
+			    $(objpfx)libtracemod3-1.so
+$(objpfx)libtracemod2-1.so: $(objpfx)libtracemod4-1.so \
+			    $(objpfx)libtracemod5-1.so
+
+define libtracemod-x
+$(objpfx)libtracemod$(1)/libtracemod$(1).so: $(objpfx)libtracemod$(1)-1.so
+	$$(make-target-directory)
+	cp $$< $$@
+endef
+libtracemod-suffixes = 1 2 3 4 5
+$(foreach i,$(libtracemod-suffixes), $(eval $(call libtracemod-x,$(i))))
+
+define tst-trace-skeleton
+$(objpfx)tst-trace$(1).out: $(objpfx)libtracemod1/libtracemod1.so \
+			    $(objpfx)libtracemod2/libtracemod2.so \
+			    $(objpfx)libtracemod3/libtracemod3.so \
+			    $(objpfx)libtracemod4/libtracemod4.so \
+			    $(objpfx)libtracemod5/libtracemod5.so \
+			    $(..)scripts/tst-ld-trace.py \
+			    tst-trace$(1).exp
+	${ $(PYTHON) $(..)scripts/tst-ld-trace.py \
+	    "$(test-wrapper-env) $(elf-objpfx)$(rtld-installed-name) \
+	    --library-path $(common-objpfx):$(strip $(2)) \
+	    $(objpfx)libtracemod1/libtracemod1.so" tst-trace$(1).exp \
+	} > $$@; $$(evaluate-test)
+endef
+
+$(eval $(call tst-trace-skeleton,1,))
+$(eval $(call tst-trace-skeleton,2,\
+	$(objpfx)libtracemod2))
+$(eval $(call tst-trace-skeleton,3,\
+	$(objpfx)libtracemod2:$(objpfx)libtracemod3))
+$(eval $(call tst-trace-skeleton,4,\
+	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
+$(eval $(call tst-trace-skeleton,5,\
+	$(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
+
+$(objpfx)tst-tls-allocation-failure-static-patched: \
+  $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py
+	cp $< $@
+	$(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@
+
+$(objpfx)tst-tls-allocation-failure-static-patched.out: \
+  $(objpfx)tst-tls-allocation-failure-static-patched
+	$< > $@ 2>&1; echo "status: $$?" >> $@
+	grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \
+	  && grep -q '^status: 127$$' $@; \
+	  $(evaluate-test)
+
+$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
+			    $(objpfx)tst-audit-tlsdesc-mod2.so \
+			    $(shared-thread-library)
+ifeq (yes,$(have-mtls-dialect-gnu2))
+# The test is valid for all TLS types, but we want to exercise GNU2
+# TLS if possible.
+CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2
+CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2
+endif
+$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library)
+$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \
+				       $(objpfx)tst-audit-tlsdesc-mod2.so
+$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so
+$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
+tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
+tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+
+$(objpfx)tst-dlmopen-twice.out: \
+  $(objpfx)tst-dlmopen-twice-mod1.so \
+  $(objpfx)tst-dlmopen-twice-mod2.so
diff --git a/elf/dl-audit.c b/elf/dl-audit.c
index 794bfd45c..efc049247 100644
--- a/elf/dl-audit.c
+++ b/elf/dl-audit.c
@@ -257,7 +257,8 @@ _dl_audit_symbind (struct link_map *l, struct reloc_result *reloc_result,
       reloc_result->flags = flags;
     }
 
-  DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value);
+  if (flags & LA_SYMB_ALTVALUE)
+    DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value);
 }
 
 void
diff --git a/elf/dl-deps.c b/elf/dl-deps.c
index c8bab5cad..cfe7f0743 100644
--- a/elf/dl-deps.c
+++ b/elf/dl-deps.c
@@ -489,6 +489,8 @@ _dl_map_object_deps (struct link_map *map,
 
   for (nlist = 0, runp = known; runp; runp = runp->next)
     {
+      /* _dl_sort_maps ignores l_faked object, so it is safe to not consider
+	 them for nlist.  */
       if (__builtin_expect (trace_mode, 0) && runp->map->l_faked)
 	/* This can happen when we trace the loading.  */
 	--map->l_searchlist.r_nlist;
diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c
new file mode 100644
index 000000000..61677aaa0
--- /dev/null
+++ b/elf/dl-early_allocate.c
@@ -0,0 +1,30 @@
+/* Early memory allocation for the dynamic loader.  Generic version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <unistd.h>
+
+void *
+_dl_early_allocate (size_t size)
+{
+  void *result = __sbrk (size);
+  if (result == (void *) -1)
+    result = NULL;
+  return result;
+}
diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c
index 2b8df2fd6..4d5831b6f 100644
--- a/elf/dl-find_object.c
+++ b/elf/dl-find_object.c
@@ -788,6 +788,9 @@ _dl_find_object_update (struct link_map *new_map)
   for (struct link_map *l = new_map; l != NULL; l = l->l_next)
     /* Skip proxy maps and already-processed maps.  */
     count += l == l->l_real && !l->l_find_object_processed;
+  if (count == 0)
+    return true;
+
   struct link_map **map_array = malloc (count * sizeof (*map_array));
   if (map_array == NULL)
     return false;
@@ -797,8 +800,6 @@ _dl_find_object_update (struct link_map *new_map)
       if (l == l->l_real && !l->l_find_object_processed)
         map_array[i++] = l;
   }
-  if (count == 0)
-    return true;
 
   _dl_find_object_link_map_sort (map_array, count);
   bool ok = _dl_find_object_update_1 (map_array, count);
diff --git a/elf/dl-hwcaps.c b/elf/dl-hwcaps.c
index 6f161f6ad..92eb53790 100644
--- a/elf/dl-hwcaps.c
+++ b/elf/dl-hwcaps.c
@@ -193,7 +193,7 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend,
   /* Each hwcaps subdirectory has a GLIBC_HWCAPS_PREFIX string prefix
      and a "/" suffix once stored in the result.  */
   hwcaps_counts.maximum_length += strlen (GLIBC_HWCAPS_PREFIX) + 1;
-  size_t total = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1)
+  size_t hwcaps_sz = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1)
 		  + hwcaps_counts.total_length);
 
   /* Count the number of bits set in the masked value.  */
@@ -229,11 +229,12 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend,
   assert (m == cnt);
 
   /* Determine the total size of all strings together.  */
+  size_t total;
   if (cnt == 1)
-    total += temp[0].len + 1;
+    total = temp[0].len + 1;
   else
     {
-      total += temp[0].len + temp[cnt - 1].len + 2;
+      total = temp[0].len + temp[cnt - 1].len + 2;
       if (cnt > 2)
 	{
 	  total <<= 1;
@@ -255,6 +256,7 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend,
   /* This is the overall result, including both glibc-hwcaps
      subdirectories and the legacy hwcaps subdirectories using the
      power set construction.  */
+  total += hwcaps_sz;
   struct r_strlenpair *overall_result
     = malloc (*sz * sizeof (*result) + total);
   if (overall_result == NULL)
diff --git a/elf/dl-libc.c b/elf/dl-libc.c
index a7180d0af..266e068da 100644
--- a/elf/dl-libc.c
+++ b/elf/dl-libc.c
@@ -156,7 +156,7 @@ __libc_dlopen_mode (const char *name, int mode)
   args.caller_dlopen = RETURN_ADDRESS (0);
 
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
 #endif
   return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
@@ -184,7 +184,7 @@ __libc_dlsym (void *map, const char *name)
   args.name = name;
 
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
 #endif
   return (dlerror_run (do_dlsym, &args) ? NULL
@@ -198,7 +198,7 @@ void *
 __libc_dlvsym (void *map, const char *name, const char *version)
 {
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
 #endif
 
@@ -221,7 +221,7 @@ int
 __libc_dlclose (void *map)
 {
 #ifdef SHARED
-  if (!rtld_active ())
+  if (GLRO (dl_dlfcn_hook) != NULL)
     return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
 #endif
   return dlerror_run (do_dlclose, map);
diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h
index 172692b12..fd24cf5d0 100644
--- a/elf/dl-map-segments.h
+++ b/elf/dl-map-segments.h
@@ -113,6 +113,9 @@ _dl_map_segments (struct link_map *l, int fd,
              unallocated.  Then jump into the normal segment-mapping loop to
              handle the portion of the segment past the end of the file
              mapping.  */
+	  if (__glibc_unlikely (loadcmds[nloadcmds - 1].mapstart <
+				c->mapend))
+	    return N_("ELF load command address/offset not page-aligned");
           if (__glibc_unlikely
               (__mprotect ((caddr_t) (l->l_addr + c->mapend),
                            loadcmds[nloadcmds - 1].mapstart - c->mapend,
diff --git a/elf/dl-open.c b/elf/dl-open.c
index a23e65926..46e8066fd 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -844,11 +844,14 @@ _dl_open (const char *file, int mode, const void *caller_dlopen, Lmid_t nsid,
 	  _dl_signal_error (EINVAL, file, NULL, N_("\
 no more namespaces available for dlmopen()"));
 	}
-      else if (nsid == GL(dl_nns))
-	{
-	  __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock);
-	  ++GL(dl_nns);
-	}
+
+      if (nsid == GL(dl_nns))
+	++GL(dl_nns);
+
+      /* Initialize the new namespace.  Most members are
+	 zero-initialized, only the lock needs special treatment.  */
+      memset (&GL(dl_ns)[nsid], 0, sizeof (GL(dl_ns)[nsid]));
+      __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock);
 
       _dl_debug_update (nsid)->r_state = RT_CONSISTENT;
     }
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
index 9e9d53ec4..3e2a6a584 100644
--- a/elf/dl-sort-maps.c
+++ b/elf/dl-sort-maps.c
@@ -27,12 +27,12 @@
    If FOR_FINI is true, this is called for finishing an object.  */
 static void
 _dl_sort_maps_original (struct link_map **maps, unsigned int nmaps,
-			unsigned int skip, bool for_fini)
+			bool force_first, bool for_fini)
 {
   /* Allows caller to do the common optimization of skipping the first map,
      usually the main binary.  */
-  maps += skip;
-  nmaps -= skip;
+  maps += force_first;
+  nmaps -= force_first;
 
   /* A list of one element need not be sorted.  */
   if (nmaps <= 1)
@@ -140,7 +140,9 @@ static void
 dfs_traversal (struct link_map ***rpo, struct link_map *map,
 	       bool *do_reldeps)
 {
-  if (map->l_visited)
+  /* _dl_map_object_deps ignores l_faked objects when calculating the
+     number of maps before calling _dl_sort_maps, ignore them as well.  */
+  if (map->l_visited || map->l_faked)
     return;
 
   map->l_visited = 1;
@@ -180,8 +182,9 @@ dfs_traversal (struct link_map ***rpo, struct link_map *map,
 
 static void
 _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
-		   unsigned int skip __attribute__ ((unused)), bool for_fini)
+		   bool force_first, bool for_fini)
 {
+  struct link_map *first_map = maps[0];
   for (int i = nmaps - 1; i >= 0; i--)
     maps[i]->l_visited = 0;
 
@@ -206,14 +209,6 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
      Adjusting the order so that maps[0] is last traversed naturally avoids
      this problem.
 
-     Further, the old "optimization" of skipping the main object at maps[0]
-     from the call-site (i.e. _dl_sort_maps(maps+1,nmaps-1)) is in general
-     no longer valid, since traversing along object dependency-links
-     may "find" the main object even when it is not included in the initial
-     order (e.g. a dlopen()'ed shared object can have circular dependencies
-     linked back to itself). In such a case, traversing N-1 objects will
-     create a N-object result, and raise problems.
-
      To summarize, just passing in the full list, and iterating from back
      to front makes things much more straightforward.  */
 
@@ -272,6 +267,27 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
     }
 
   memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+
+  /* Skipping the first object at maps[0] is not valid in general,
+     since traversing along object dependency-links may "find" that
+     first object even when it is not included in the initial order
+     (e.g., a dlopen'ed shared object can have circular dependencies
+     linked back to itself).  In such a case, traversing N-1 objects
+     will create a N-object result, and raise problems.  Instead,
+     force the object back into first place after sorting.  This naive
+     approach may introduce further dependency ordering violations
+     compared to rotating the cycle until the first map is again in
+     the first position, but as there is a cycle, at least one
+     violation is already present.  */
+  if (force_first && maps[0] != first_map)
+    {
+      int i;
+      for (i = 0; maps[i] != first_map; ++i)
+	;
+      assert (i < nmaps);
+      memmove (&maps[1], maps, i * sizeof (maps[0]));
+      maps[0] = first_map;
+    }
 }
 
 void
@@ -284,7 +300,7 @@ _dl_sort_maps_init (void)
 
 void
 _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
-	       unsigned int skip, bool for_fini)
+	       bool force_first, bool for_fini)
 {
   /* It can be tempting to use a static function pointer to store and call
      the current selected sorting algorithm routine, but experimentation
@@ -294,9 +310,9 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
      input cases. A simple if-case with direct function calls appears to
      be the fastest.  */
   if (__glibc_likely (GLRO(dl_dso_sort_algo) == dso_sort_algorithm_original))
-    _dl_sort_maps_original (maps, nmaps, skip, for_fini);
+    _dl_sort_maps_original (maps, nmaps, force_first, for_fini);
   else
-    _dl_sort_maps_dfs (maps, nmaps, skip, for_fini);
+    _dl_sort_maps_dfs (maps, nmaps, force_first, for_fini);
 }
 
 #endif /* HAVE_TUNABLES.  */
diff --git a/elf/dl-support.c b/elf/dl-support.c
index fb6476553..09079c124 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -44,6 +44,7 @@
 #include <dl-vdso-setup.h>
 #include <dl-auxv.h>
 #include <dl-find_object.h>
+#include <array_length.h>
 
 extern char *__progname;
 char **_dl_argv = &__progname;	/* This is checked for some error messages.  */
@@ -241,93 +242,25 @@ __rtld_lock_define_initialized_recursive (, _dl_load_tls_lock)
 
 
 #ifdef HAVE_AUX_VECTOR
+#include <dl-parse_auxv.h>
+
 int _dl_clktck;
 
 void
 _dl_aux_init (ElfW(auxv_t) *av)
 {
-  int seen = 0;
-  uid_t uid = 0;
-  gid_t gid = 0;
-
 #ifdef NEED_DL_SYSINFO
   /* NB: Avoid RELATIVE relocation in static PIE.  */
   GL(dl_sysinfo) = DL_SYSINFO_DEFAULT;
 #endif
 
   _dl_auxv = av;
-  for (; av->a_type != AT_NULL; ++av)
-    switch (av->a_type)
-      {
-      case AT_PAGESZ:
-	if (av->a_un.a_val != 0)
-	  GLRO(dl_pagesize) = av->a_un.a_val;
-	break;
-      case AT_CLKTCK:
-	GLRO(dl_clktck) = av->a_un.a_val;
-	break;
-      case AT_PHDR:
-	GL(dl_phdr) = (const void *) av->a_un.a_val;
-	break;
-      case AT_PHNUM:
-	GL(dl_phnum) = av->a_un.a_val;
-	break;
-      case AT_PLATFORM:
-	GLRO(dl_platform) = (void *) av->a_un.a_val;
-	break;
-      case AT_HWCAP:
-	GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val;
-	break;
-      case AT_HWCAP2:
-	GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val;
-	break;
-      case AT_FPUCW:
-	GLRO(dl_fpu_control) = av->a_un.a_val;
-	break;
-#ifdef NEED_DL_SYSINFO
-      case AT_SYSINFO:
-	GL(dl_sysinfo) = av->a_un.a_val;
-	break;
-#endif
-#ifdef NEED_DL_SYSINFO_DSO
-      case AT_SYSINFO_EHDR:
-	GL(dl_sysinfo_dso) = (void *) av->a_un.a_val;
-	break;
-#endif
-      case AT_UID:
-	uid ^= av->a_un.a_val;
-	seen |= 1;
-	break;
-      case AT_EUID:
-	uid ^= av->a_un.a_val;
-	seen |= 2;
-	break;
-      case AT_GID:
-	gid ^= av->a_un.a_val;
-	seen |= 4;
-	break;
-      case AT_EGID:
-	gid ^= av->a_un.a_val;
-	seen |= 8;
-	break;
-      case AT_SECURE:
-	seen = -1;
-	__libc_enable_secure = av->a_un.a_val;
-	__libc_enable_secure_decided = 1;
-	break;
-      case AT_RANDOM:
-	_dl_random = (void *) av->a_un.a_val;
-	break;
-      case AT_MINSIGSTKSZ:
-	_dl_minsigstacksize = av->a_un.a_val;
-	break;
-      DL_PLATFORM_AUXV
-      }
-  if (seen == 0xf)
-    {
-      __libc_enable_secure = uid != 0 || gid != 0;
-      __libc_enable_secure_decided = 1;
-    }
+  dl_parse_auxv_t auxv_values;
+  /* Use an explicit initialization loop here because memset may not
+     be available yet.  */
+  for (int i = 0; i < array_length (auxv_values); ++i)
+    auxv_values[i] = 0;
+  _dl_parse_auxv (av, auxv_values);
 }
 #endif
 
diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c
index f1dba8ef2..7aa90ad6e 100644
--- a/elf/dl-sysdep.c
+++ b/elf/dl-sysdep.c
@@ -1,4 +1,4 @@
-/* Operating system support for run-time dynamic linker.  Generic Unix version.
+/* Operating system support for run-time dynamic linker.  Stub version.
    Copyright (C) 1995-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,352 +16,4 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* We conditionalize the whole of this file rather than simply eliding it
-   from the static build, because other sysdeps/ versions of this file
-   might define things needed by a static build.  */
-
-#ifdef SHARED
-
-#include <assert.h>
-#include <elf.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libintl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <ldsodefs.h>
-#include <_itoa.h>
-#include <fpu_control.h>
-
-#include <entry.h>
-#include <dl-machine.h>
-#include <dl-procinfo.h>
-#include <dl-osinfo.h>
-#include <libc-internal.h>
-#include <tls.h>
-
-#include <dl-tunables.h>
-#include <dl-auxv.h>
-#include <dl-hwcap-check.h>
-
-extern char **_environ attribute_hidden;
-extern char _end[] attribute_hidden;
-
-/* Protect SUID program against misuse of file descriptors.  */
-extern void __libc_check_standard_fds (void);
-
-int __libc_enable_secure attribute_relro = 0;
-rtld_hidden_data_def (__libc_enable_secure)
-/* This variable contains the lowest stack address ever used.  */
-void *__libc_stack_end attribute_relro = NULL;
-rtld_hidden_data_def(__libc_stack_end)
-void *_dl_random attribute_relro = NULL;
-
-#ifndef DL_FIND_ARG_COMPONENTS
-# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp)	\
-  do {									      \
-    void **_tmp;							      \
-    (argc) = *(long int *) cookie;					      \
-    (argv) = (char **) ((long int *) cookie + 1);			      \
-    (envp) = (argv) + (argc) + 1;					      \
-    for (_tmp = (void **) (envp); *_tmp; ++_tmp)			      \
-      continue;								      \
-    (auxp) = (void *) ++_tmp;						      \
-  } while (0)
-#endif
-
-#ifndef DL_STACK_END
-# define DL_STACK_END(cookie) ((void *) (cookie))
-#endif
-
-ElfW(Addr)
-_dl_sysdep_start (void **start_argptr,
-		  void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum,
-				   ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv))
-{
-  const ElfW(Phdr) *phdr = NULL;
-  ElfW(Word) phnum = 0;
-  ElfW(Addr) user_entry;
-  ElfW(auxv_t) *av;
-#ifdef HAVE_AUX_SECURE
-# define set_seen(tag) (tag)	/* Evaluate for the side effects.  */
-# define set_seen_secure() ((void) 0)
-#else
-  uid_t uid = 0;
-  gid_t gid = 0;
-  unsigned int seen = 0;
-# define set_seen_secure() (seen = -1)
-# ifdef HAVE_AUX_XID
-#  define set_seen(tag) (tag)	/* Evaluate for the side effects.  */
-# else
-#  define M(type) (1 << (type))
-#  define set_seen(tag) seen |= M ((tag)->a_type)
-# endif
-#endif
-#ifdef NEED_DL_SYSINFO
-  uintptr_t new_sysinfo = 0;
-#endif
-
-  __libc_stack_end = DL_STACK_END (start_argptr);
-  DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ,
-			  GLRO(dl_auxv));
-
-  user_entry = (ElfW(Addr)) ENTRY_POINT;
-  GLRO(dl_platform) = NULL; /* Default to nothing known about the platform.  */
-
-  /* NB: Default to a constant CONSTANT_MINSIGSTKSZ.  */
-  _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ),
-		  "CONSTANT_MINSIGSTKSZ is constant");
-  GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ;
-
-  for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++))
-    switch (av->a_type)
-      {
-      case AT_PHDR:
-	phdr = (void *) av->a_un.a_val;
-	break;
-      case AT_PHNUM:
-	phnum = av->a_un.a_val;
-	break;
-      case AT_PAGESZ:
-	GLRO(dl_pagesize) = av->a_un.a_val;
-	break;
-      case AT_ENTRY:
-	user_entry = av->a_un.a_val;
-	break;
-#ifndef HAVE_AUX_SECURE
-      case AT_UID:
-      case AT_EUID:
-	uid ^= av->a_un.a_val;
-	break;
-      case AT_GID:
-      case AT_EGID:
-	gid ^= av->a_un.a_val;
-	break;
-#endif
-      case AT_SECURE:
-#ifndef HAVE_AUX_SECURE
-	seen = -1;
-#endif
-	__libc_enable_secure = av->a_un.a_val;
-	break;
-      case AT_PLATFORM:
-	GLRO(dl_platform) = (void *) av->a_un.a_val;
-	break;
-      case AT_HWCAP:
-	GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val;
-	break;
-      case AT_HWCAP2:
-	GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val;
-	break;
-      case AT_CLKTCK:
-	GLRO(dl_clktck) = av->a_un.a_val;
-	break;
-      case AT_FPUCW:
-	GLRO(dl_fpu_control) = av->a_un.a_val;
-	break;
-#ifdef NEED_DL_SYSINFO
-      case AT_SYSINFO:
-	new_sysinfo = av->a_un.a_val;
-	break;
-#endif
-#ifdef NEED_DL_SYSINFO_DSO
-      case AT_SYSINFO_EHDR:
-	GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val;
-	break;
-#endif
-      case AT_RANDOM:
-	_dl_random = (void *) av->a_un.a_val;
-	break;
-      case AT_MINSIGSTKSZ:
-	GLRO(dl_minsigstacksize) = av->a_un.a_val;
-	break;
-      DL_PLATFORM_AUXV
-      }
-
-  dl_hwcap_check ();
-
-#ifndef HAVE_AUX_SECURE
-  if (seen != -1)
-    {
-      /* Fill in the values we have not gotten from the kernel through the
-	 auxiliary vector.  */
-# ifndef HAVE_AUX_XID
-#  define SEE(UID, var, uid) \
-   if ((seen & M (AT_##UID)) == 0) var ^= __get##uid ()
-      SEE (UID, uid, uid);
-      SEE (EUID, uid, euid);
-      SEE (GID, gid, gid);
-      SEE (EGID, gid, egid);
-# endif
-
-      /* If one of the two pairs of IDs does not match this is a setuid
-	 or setgid run.  */
-      __libc_enable_secure = uid | gid;
-    }
-#endif
-
-#ifndef HAVE_AUX_PAGESIZE
-  if (GLRO(dl_pagesize) == 0)
-    GLRO(dl_pagesize) = __getpagesize ();
-#endif
-
-#ifdef NEED_DL_SYSINFO
-  if (new_sysinfo != 0)
-    {
-# ifdef NEED_DL_SYSINFO_DSO
-      /* Only set the sysinfo value if we also have the vsyscall DSO.  */
-      if (GLRO(dl_sysinfo_dso) != 0)
-# endif
-        GLRO(dl_sysinfo) = new_sysinfo;
-    }
-#endif
-
-  __tunables_init (_environ);
-
-  /* Initialize DSO sorting algorithm after tunables.  */
-  _dl_sort_maps_init ();
-
-#ifdef DL_SYSDEP_INIT
-  DL_SYSDEP_INIT;
-#endif
-
-#ifdef DL_PLATFORM_INIT
-  DL_PLATFORM_INIT;
-#endif
-
-  /* Determine the length of the platform name.  */
-  if (GLRO(dl_platform) != NULL)
-    GLRO(dl_platformlen) = strlen (GLRO(dl_platform));
-
-  if (__sbrk (0) == _end)
-    /* The dynamic linker was run as a program, and so the initial break
-       starts just after our bss, at &_end.  The malloc in dl-minimal.c
-       will consume the rest of this page, so tell the kernel to move the
-       break up that far.  When the user program examines its break, it
-       will see this new value and not clobber our data.  */
-    __sbrk (GLRO(dl_pagesize)
-	    - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1)));
-
-  /* If this is a SUID program we make sure that FDs 0, 1, and 2 are
-     allocated.  If necessary we are doing it ourself.  If it is not
-     possible we stop the program.  */
-  if (__builtin_expect (__libc_enable_secure, 0))
-    __libc_check_standard_fds ();
-
-  (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv));
-  return user_entry;
-}
-
-void
-_dl_sysdep_start_cleanup (void)
-{
-}
-
-void
-_dl_show_auxv (void)
-{
-  char buf[64];
-  ElfW(auxv_t) *av;
-
-  /* Terminate string.  */
-  buf[63] = '\0';
-
-  /* The following code assumes that the AT_* values are encoded
-     starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values
-     close by (otherwise the array will be too large).  In case we have
-     to support a platform where these requirements are not fulfilled
-     some alternative implementation has to be used.  */
-  for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av)
-    {
-      static const struct
-      {
-	const char label[22];
-	enum { unknown = 0, dec, hex, str, ignore } form : 8;
-      } auxvars[] =
-	{
-	  [AT_EXECFD - 2] =		{ "EXECFD:            ", dec },
-	  [AT_EXECFN - 2] =		{ "EXECFN:            ", str },
-	  [AT_PHDR - 2] =		{ "PHDR:              0x", hex },
-	  [AT_PHENT - 2] =		{ "PHENT:             ", dec },
-	  [AT_PHNUM - 2] =		{ "PHNUM:             ", dec },
-	  [AT_PAGESZ - 2] =		{ "PAGESZ:            ", dec },
-	  [AT_BASE - 2] =		{ "BASE:              0x", hex },
-	  [AT_FLAGS - 2] =		{ "FLAGS:             0x", hex },
-	  [AT_ENTRY - 2] =		{ "ENTRY:             0x", hex },
-	  [AT_NOTELF - 2] =		{ "NOTELF:            ", hex },
-	  [AT_UID - 2] =		{ "UID:               ", dec },
-	  [AT_EUID - 2] =		{ "EUID:              ", dec },
-	  [AT_GID - 2] =		{ "GID:               ", dec },
-	  [AT_EGID - 2] =		{ "EGID:              ", dec },
-	  [AT_PLATFORM - 2] =		{ "PLATFORM:          ", str },
-	  [AT_HWCAP - 2] =		{ "HWCAP:             ", hex },
-	  [AT_CLKTCK - 2] =		{ "CLKTCK:            ", dec },
-	  [AT_FPUCW - 2] =		{ "FPUCW:             ", hex },
-	  [AT_DCACHEBSIZE - 2] =	{ "DCACHEBSIZE:       0x", hex },
-	  [AT_ICACHEBSIZE - 2] =	{ "ICACHEBSIZE:       0x", hex },
-	  [AT_UCACHEBSIZE - 2] =	{ "UCACHEBSIZE:       0x", hex },
-	  [AT_IGNOREPPC - 2] =		{ "IGNOREPPC", ignore },
-	  [AT_SECURE - 2] =		{ "SECURE:            ", dec },
-	  [AT_BASE_PLATFORM - 2] =	{ "BASE_PLATFORM:     ", str },
-	  [AT_SYSINFO - 2] =		{ "SYSINFO:           0x", hex },
-	  [AT_SYSINFO_EHDR - 2] =	{ "SYSINFO_EHDR:      0x", hex },
-	  [AT_RANDOM - 2] =		{ "RANDOM:            0x", hex },
-	  [AT_HWCAP2 - 2] =		{ "HWCAP2:            0x", hex },
-	  [AT_MINSIGSTKSZ - 2] =	{ "MINSIGSTKSZ:       ", dec },
-	  [AT_L1I_CACHESIZE - 2] =	{ "L1I_CACHESIZE:     ", dec },
-	  [AT_L1I_CACHEGEOMETRY - 2] =	{ "L1I_CACHEGEOMETRY: 0x", hex },
-	  [AT_L1D_CACHESIZE - 2] =	{ "L1D_CACHESIZE:     ", dec },
-	  [AT_L1D_CACHEGEOMETRY - 2] =	{ "L1D_CACHEGEOMETRY: 0x", hex },
-	  [AT_L2_CACHESIZE - 2] =	{ "L2_CACHESIZE:      ", dec },
-	  [AT_L2_CACHEGEOMETRY - 2] =	{ "L2_CACHEGEOMETRY:  0x", hex },
-	  [AT_L3_CACHESIZE - 2] =	{ "L3_CACHESIZE:      ", dec },
-	  [AT_L3_CACHEGEOMETRY - 2] =	{ "L3_CACHEGEOMETRY:  0x", hex },
-	};
-      unsigned int idx = (unsigned int) (av->a_type - 2);
-
-      if ((unsigned int) av->a_type < 2u
-	  || (idx < sizeof (auxvars) / sizeof (auxvars[0])
-	      && auxvars[idx].form == ignore))
-	continue;
-
-      assert (AT_NULL == 0);
-      assert (AT_IGNORE == 1);
-
-      /* Some entries are handled in a special way per platform.  */
-      if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0)
-	continue;
-
-      if (idx < sizeof (auxvars) / sizeof (auxvars[0])
-	  && auxvars[idx].form != unknown)
-	{
-	  const char *val = (char *) av->a_un.a_val;
-
-	  if (__builtin_expect (auxvars[idx].form, dec) == dec)
-	    val = _itoa ((unsigned long int) av->a_un.a_val,
-			 buf + sizeof buf - 1, 10, 0);
-	  else if (__builtin_expect (auxvars[idx].form, hex) == hex)
-	    val = _itoa ((unsigned long int) av->a_un.a_val,
-			 buf + sizeof buf - 1, 16, 0);
-
-	  _dl_printf ("AT_%s%s\n", auxvars[idx].label, val);
-
-	  continue;
-	}
-
-      /* Unknown value: print a generic line.  */
-      char buf2[17];
-      buf2[sizeof (buf2) - 1] = '\0';
-      const char *val2 = _itoa ((unsigned long int) av->a_un.a_val,
-				buf2 + sizeof buf2 - 1, 16, 0);
-      const char *val =  _itoa ((unsigned long int) av->a_type,
-				buf + sizeof buf - 1, 16, 0);
-      _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2);
-    }
-}
-
-#endif
+#error dl-sysdep support missing.
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
index 5f7f18ef2..4bf9052db 100644
--- a/elf/dso-sort-tests-1.def
+++ b/elf/dso-sort-tests-1.def
@@ -64,3 +64,10 @@ output: b>a>{}<a<b
 tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
 output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
 output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+
+# Test that even in the presence of dependency loops involving dlopen'ed
+# object, that object is initialized last (and not unloaded prematurely).
+# Final destructor order is indeterminate due to the cycle.
+tst-bz28937: {+a;+b;-b;+c;%c};a->a1;a->a2;a2->a;b->b1;c->a1;c=>a1
+output(glibc.rtld.dynamic_sort=1): {+a[a2>a1>a>];+b[b1>b>];-b[<b<b1];+c[c>];%c(a1());}<a<a2<c<a1
+output(glibc.rtld.dynamic_sort=2): {+a[a2>a1>a>];+b[b1>b>];-b[<b<b1];+c[c>];%c(a1());}<a2<a<c<a1
diff --git a/elf/enbl-secure.c b/elf/enbl-secure.c
index aa2a0bd87..4e4d66822 100644
--- a/elf/enbl-secure.c
+++ b/elf/enbl-secure.c
@@ -26,15 +26,5 @@
 #include <startup.h>
 #include <libc-internal.h>
 
-/* If nonzero __libc_enable_secure is already set.  */
-int __libc_enable_secure_decided;
 /* Safest assumption, if somehow the initializer isn't run.  */
 int __libc_enable_secure = 1;
-
-void
-__libc_init_secure (void)
-{
-  if (__libc_enable_secure_decided == 0)
-    __libc_enable_secure = (startup_geteuid () != startup_getuid ()
-			    || startup_getegid () != startup_getgid ());
-}
diff --git a/elf/libtracemod1-1.c b/elf/libtracemod1-1.c
new file mode 100644
index 000000000..7c89c9a5a
--- /dev/null
+++ b/elf/libtracemod1-1.c
@@ -0,0 +1 @@
+/* Empty  */
diff --git a/elf/libtracemod2-1.c b/elf/libtracemod2-1.c
new file mode 100644
index 000000000..7c89c9a5a
--- /dev/null
+++ b/elf/libtracemod2-1.c
@@ -0,0 +1 @@
+/* Empty  */
diff --git a/elf/libtracemod3-1.c b/elf/libtracemod3-1.c
new file mode 100644
index 000000000..7c89c9a5a
--- /dev/null
+++ b/elf/libtracemod3-1.c
@@ -0,0 +1 @@
+/* Empty  */
diff --git a/elf/libtracemod4-1.c b/elf/libtracemod4-1.c
new file mode 100644
index 000000000..7c89c9a5a
--- /dev/null
+++ b/elf/libtracemod4-1.c
@@ -0,0 +1 @@
+/* Empty  */
diff --git a/elf/libtracemod5-1.c b/elf/libtracemod5-1.c
new file mode 100644
index 000000000..7c89c9a5a
--- /dev/null
+++ b/elf/libtracemod5-1.c
@@ -0,0 +1 @@
+/* Empty  */
diff --git a/elf/rtld.c b/elf/rtld.c
index 8dafaf61f..86354fb0c 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -441,8 +441,8 @@ static ElfW(Addr) _dl_start_final (void *arg,
 				   struct dl_start_final_info *info);
 #endif
 
-/* These defined magically in the linker script.  */
-extern char _begin[] attribute_hidden;
+/* These are defined magically by the linker.  */
+extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
 extern char _etext[] attribute_hidden;
 extern char _end[] attribute_hidden;
 
@@ -487,7 +487,7 @@ _dl_start_final (void *arg, struct dl_start_final_info *info)
 #endif
   _dl_setup_hash (&GL(dl_rtld_map));
   GL(dl_rtld_map).l_real = &GL(dl_rtld_map);
-  GL(dl_rtld_map).l_map_start = (ElfW(Addr)) _begin;
+  GL(dl_rtld_map).l_map_start = (ElfW(Addr)) &__ehdr_start;
   GL(dl_rtld_map).l_map_end = (ElfW(Addr)) _end;
   GL(dl_rtld_map).l_text_end = (ElfW(Addr)) _etext;
   /* Copy the TLS related data if necessary.  */
@@ -1312,6 +1312,62 @@ rtld_setup_main_map (struct link_map *main_map)
   return has_interp;
 }
 
+/* Adjusts the contents of the stack and related globals for the user
+   entry point.  The ld.so processed skip_args arguments and bumped
+   _dl_argv and _dl_argc accordingly.  Those arguments are removed from
+   argv here.  */
+static void
+_dl_start_args_adjust (int skip_args)
+{
+  void **sp = (void **) (_dl_argv - skip_args - 1);
+  void **p = sp + skip_args;
+
+  if (skip_args == 0)
+    return;
+
+  /* Sanity check.  */
+  intptr_t argc = (intptr_t) sp[0] - skip_args;
+  assert (argc == _dl_argc);
+
+  /* Adjust argc on stack.  */
+  sp[0] = (void *) (intptr_t) _dl_argc;
+
+  /* Update globals in rtld.  */
+  _dl_argv -= skip_args;
+  _environ -= skip_args;
+
+  /* Shuffle argv down.  */
+  do
+    *++sp = *++p;
+  while (*p != NULL);
+
+  assert (_environ == (char **) (sp + 1));
+
+  /* Shuffle envp down.  */
+  do
+    *++sp = *++p;
+  while (*p != NULL);
+
+#ifdef HAVE_AUX_VECTOR
+  void **auxv = (void **) GLRO(dl_auxv) - skip_args;
+  GLRO(dl_auxv) = (ElfW(auxv_t) *) auxv; /* Aliasing violation.  */
+  assert (auxv == sp + 1);
+
+  /* Shuffle auxv down. */
+  ElfW(auxv_t) ax;
+  char *oldp = (char *) (p + 1);
+  char *newp = (char *) (sp + 1);
+  do
+    {
+      memcpy (&ax, oldp, sizeof (ax));
+      memcpy (newp, &ax, sizeof (ax));
+      oldp += sizeof (ax);
+      newp += sizeof (ax);
+    }
+  while (ax.a_type != AT_NULL);
+#endif
+}
+
 static void
 dl_main (const ElfW(Phdr) *phdr,
 	 ElfW(Word) phnum,
@@ -1366,6 +1422,7 @@ dl_main (const ElfW(Phdr) *phdr,
       rtld_is_main = true;
 
       char *argv0 = NULL;
+      char **orig_argv = _dl_argv;
 
       /* Note the place where the dynamic linker actually came from.  */
       GL(dl_rtld_map).l_name = rtld_progname;
@@ -1380,7 +1437,6 @@ dl_main (const ElfW(Phdr) *phdr,
 		GLRO(dl_lazy) = -1;
 	      }
 
-	    ++_dl_skip_args;
 	    --_dl_argc;
 	    ++_dl_argv;
 	  }
@@ -1389,14 +1445,12 @@ dl_main (const ElfW(Phdr) *phdr,
 	    if (state.mode != rtld_mode_help)
 	      state.mode = rtld_mode_verify;
 
-	    ++_dl_skip_args;
 	    --_dl_argc;
 	    ++_dl_argv;
 	  }
 	else if (! strcmp (_dl_argv[1], "--inhibit-cache"))
 	  {
 	    GLRO(dl_inhibit_cache) = 1;
-	    ++_dl_skip_args;
 	    --_dl_argc;
 	    ++_dl_argv;
 	  }
@@ -1406,7 +1460,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	    state.library_path = _dl_argv[2];
 	    state.library_path_source = "--library-path";
 
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1415,7 +1468,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	  {
 	    GLRO(dl_inhibit_rpath) = _dl_argv[2];
 
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1423,14 +1475,12 @@ dl_main (const ElfW(Phdr) *phdr,
 	  {
 	    audit_list_add_string (&state.audit_list, _dl_argv[2]);
 
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
 	else if (! strcmp (_dl_argv[1], "--preload") && _dl_argc > 2)
 	  {
 	    state.preloadarg = _dl_argv[2];
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1438,7 +1488,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	  {
 	    argv0 = _dl_argv[2];
 
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1446,7 +1495,6 @@ dl_main (const ElfW(Phdr) *phdr,
 		 && _dl_argc > 2)
 	  {
 	    state.glibc_hwcaps_prepend = _dl_argv[2];
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1454,7 +1502,6 @@ dl_main (const ElfW(Phdr) *phdr,
 		 && _dl_argc > 2)
 	  {
 	    state.glibc_hwcaps_mask = _dl_argv[2];
-	    _dl_skip_args += 2;
 	    _dl_argc -= 2;
 	    _dl_argv += 2;
 	  }
@@ -1463,7 +1510,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	  {
 	    state.mode = rtld_mode_list_tunables;
 
-	    ++_dl_skip_args;
 	    --_dl_argc;
 	    ++_dl_argv;
 	  }
@@ -1472,7 +1518,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	  {
 	    state.mode = rtld_mode_list_diagnostics;
 
-	    ++_dl_skip_args;
 	    --_dl_argc;
 	    ++_dl_argv;
 	  }
@@ -1518,7 +1563,6 @@ dl_main (const ElfW(Phdr) *phdr,
 	    _dl_usage (ld_so_name, NULL);
 	}
 
-      ++_dl_skip_args;
       --_dl_argc;
       ++_dl_argv;
 
@@ -1617,6 +1661,9 @@ dl_main (const ElfW(Phdr) *phdr,
       /* Set the argv[0] string now that we've processed the executable.  */
       if (argv0 != NULL)
         _dl_argv[0] = argv0;
+
+      /* Adjust arguments for the application entry point.  */
+      _dl_start_args_adjust (_dl_argv - orig_argv);
     }
   else
     {
@@ -1754,7 +1801,6 @@ dl_main (const ElfW(Phdr) *phdr,
      segment that also includes the phdrs.  If that's not available, we use
      the old method that assumes the beginning of the file is part of the
      lowest-addressed PT_LOAD segment.  */
-  extern const ElfW(Ehdr) __ehdr_start __attribute__ ((visibility ("hidden")));
 
   /* Set up the program header information for the dynamic linker
      itself.  It is needed in the dl_iterate_phdr callbacks.  */
diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c
new file mode 100644
index 000000000..3f920e83b
--- /dev/null
+++ b/elf/tst-audit26.c
@@ -0,0 +1,35 @@
+/* Check the usability of <dlfcn.h> functions in audit modules.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <gnu/lib-names.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+  /* Check that the audit module has been loaded.  */
+  void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
+  TEST_VERIFY (handle
+	       == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod24a.c b/elf/tst-auditmod24a.c
index d8e88f398..3075dfae2 100644
--- a/elf/tst-auditmod24a.c
+++ b/elf/tst-auditmod24a.c
@@ -110,5 +110,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx,
       return sym->st_value;
     }
 
-  abort ();
+  if (symname[0] != '\0')
+    abort ();
+  return sym->st_value;
 }
diff --git a/elf/tst-auditmod24d.c b/elf/tst-auditmod24d.c
index 8c803ecc0..badc6be45 100644
--- a/elf/tst-auditmod24d.c
+++ b/elf/tst-auditmod24d.c
@@ -116,5 +116,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx,
 	}
     }
 
-  abort ();
+  if (symname[0] != '\0')
+    abort ();
+  return sym->st_value;
 }
diff --git a/elf/tst-auditmod25.c b/elf/tst-auditmod25.c
index 526f5c54b..20640a8da 100644
--- a/elf/tst-auditmod25.c
+++ b/elf/tst-auditmod25.c
@@ -72,7 +72,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx,
 	      unsigned int *flags, const char *symname)
 #endif
 {
-  if (*refcook != -1 && *defcook != -1)
+  if (*refcook != -1 && *defcook != -1 && symname[0] != '\0')
     fprintf (stderr, "la_symbind: %s %u\n", symname,
 	     *flags & (LA_SYMB_NOPLTENTER | LA_SYMB_NOPLTEXIT) ? 1 : 0);
   return sym->st_value;
diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c
new file mode 100644
index 000000000..db7ba95ab
--- /dev/null
+++ b/elf/tst-auditmod26.c
@@ -0,0 +1,104 @@
+/* Check the usability of <dlfcn.h> functions in audit modules.  Audit module.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <first-versions.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+unsigned int
+la_version (unsigned int current)
+{
+  /* Exercise various <dlfcn.h> functions.  */
+
+  /* Check dlopen, dlsym, dlclose.   */
+  void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
+  void *ptr = xdlsym (handle, "sincos");
+  TEST_VERIFY (ptr != NULL);
+  ptr = dlsym (handle, "SINCOS");
+  TEST_VERIFY (ptr == NULL);
+  const char *message = dlerror ();
+  TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
+  ptr = dlsym (handle, "SINCOS");
+  TEST_VERIFY (ptr == NULL);
+  xdlclose (handle);
+  TEST_COMPARE_STRING (dlerror (), NULL);
+
+  handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
+
+  /* Check dlvsym.  _exit is unlikely to gain another symbol
+     version.  */
+  TEST_VERIFY (xdlsym (handle, "_exit")
+               == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
+
+  /* Check dlinfo.  */
+  {
+    void *handle2 = NULL;
+    TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
+    TEST_VERIFY (handle2 == handle);
+  }
+
+  /* Check dladdr and dladdr1.  */
+  Dl_info info = { };
+  TEST_VERIFY (dladdr (&_exit, &info) != 0);
+  if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias.  */
+    TEST_COMPARE_STRING (info.dli_sname, "_exit");
+  TEST_VERIFY (info.dli_saddr == &_exit);
+  TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
+  void *extra_info;
+  memset (&info, 0, sizeof (info));
+  TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+  TEST_VERIFY (extra_info == handle);
+
+  /* Verify that dlmopen creates a new namespace.  */
+  void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
+  TEST_VERIFY (dlmopen_handle != handle);
+  memset (&info, 0, sizeof (info));
+  extra_info = NULL;
+  ptr = xdlsym (dlmopen_handle, "_exit");
+  TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+  TEST_VERIFY (extra_info == dlmopen_handle);
+  xdlclose (dlmopen_handle);
+
+  /* Terminate the process with an error state.  This does not happen
+     automatically because the audit module state is not shared with
+     the main program.  */
+  if (support_record_failure_is_failed ())
+    {
+      fflush (stdout);
+      fflush (stderr);
+      _exit (1);
+    }
+
+  return LAV_CURRENT;
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+  if (strcmp (name, "mapped to libc") == 0)
+    return (char *) LIBC_SO;
+  else
+    return (char *) name;
+}
diff --git a/elf/tst-dlmopen-twice-mod1.c b/elf/tst-dlmopen-twice-mod1.c
new file mode 100644
index 000000000..0eaf04948
--- /dev/null
+++ b/elf/tst-dlmopen-twice-mod1.c
@@ -0,0 +1,37 @@
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528).  Module 1.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+
+static void __attribute__ ((constructor))
+init (void)
+{
+  puts ("info: tst-dlmopen-twice-mod1.so loaded");
+  fflush (stdout);
+}
+
+static void __attribute__ ((destructor))
+fini (void)
+{
+  puts ("info: tst-dlmopen-twice-mod1.so about to be unloaded");
+  fflush (stdout);
+}
+
+/* Large allocation.  The second module does not have this, so it
+   should load libc at a different address.  */
+char large_allocate[16 * 1024 * 1024];
diff --git a/elf/tst-dlmopen-twice-mod2.c b/elf/tst-dlmopen-twice-mod2.c
new file mode 100644
index 000000000..40c6c01f9
--- /dev/null
+++ b/elf/tst-dlmopen-twice-mod2.c
@@ -0,0 +1,50 @@
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528).  Module 2.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ctype.h>
+#include <stdio.h>
+
+static void __attribute__ ((constructor))
+init (void)
+{
+  puts ("info: tst-dlmopen-twice-mod2.so loaded");
+  fflush (stdout);
+}
+
+static void __attribute__ ((destructor))
+fini (void)
+{
+  puts ("info: tst-dlmopen-twice-mod2.so about to be unloaded");
+  fflush (stdout);
+}
+
+int
+run_check (void)
+{
+  puts ("info: about to call isalpha");
+  fflush (stdout);
+
+  volatile char ch = 'a';
+  if (!isalpha (ch))
+    {
+      puts ("error: isalpha ('a') is not true");
+      fflush (stdout);
+      return 1;
+    }
+  return 0;
+}
diff --git a/elf/tst-dlmopen-twice.c b/elf/tst-dlmopen-twice.c
new file mode 100644
index 000000000..449f3c8fa
--- /dev/null
+++ b/elf/tst-dlmopen-twice.c
@@ -0,0 +1,34 @@
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528).  Main.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/xdlfcn.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  void *handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod1.so", RTLD_NOW);
+  xdlclose (handle);
+  handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod2.so", RTLD_NOW);
+  int (*run_check) (void) = xdlsym (handle, "run_check");
+  TEST_COMPARE (run_check (), 0);
+  xdlclose (handle);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-glibc-hwcaps-cache.script b/elf/tst-glibc-hwcaps-cache.script
index c3271f61f..d58fc8c5d 100644
--- a/elf/tst-glibc-hwcaps-cache.script
+++ b/elf/tst-glibc-hwcaps-cache.script
@@ -4,6 +4,7 @@
 cp $B/elf/libmarkermod2-1.so $L/libmarkermod2.so
 cp $B/elf/libmarkermod3-1.so $L/libmarkermod3.so
 cp $B/elf/libmarkermod4-1.so $L/libmarkermod4.so
+cp $B/elf/libmarkermod5-1.so $L/libmarkermod5.so
 
 mkdirp 0770 $L/glibc-hwcaps/power9
 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/power9/libmarkermod2.so
@@ -20,6 +21,11 @@ mkdirp 0770 $L/glibc-hwcaps/z15
 cp $B/elf/libmarkermod4-2.so $L/glibc-hwcaps/z13/libmarkermod4.so
 cp $B/elf/libmarkermod4-3.so $L/glibc-hwcaps/z14/libmarkermod4.so
 cp $B/elf/libmarkermod4-4.so $L/glibc-hwcaps/z15/libmarkermod4.so
+mkdirp 0770 $L/glibc-hwcaps/z16
+cp $B/elf/libmarkermod5-2.so $L/glibc-hwcaps/z13/libmarkermod5.so
+cp $B/elf/libmarkermod5-3.so $L/glibc-hwcaps/z14/libmarkermod5.so
+cp $B/elf/libmarkermod5-4.so $L/glibc-hwcaps/z15/libmarkermod5.so
+cp $B/elf/libmarkermod5-5.so $L/glibc-hwcaps/z16/libmarkermod5.so
 
 mkdirp 0770 $L/glibc-hwcaps/x86-64-v2
 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/x86-64-v2/libmarkermod2.so
diff --git a/elf/tst-glibcelf.py b/elf/tst-glibcelf.py
new file mode 100644
index 000000000..bf15a3bad
--- /dev/null
+++ b/elf/tst-glibcelf.py
@@ -0,0 +1,260 @@
+#!/usr/bin/python3
+# Verify scripts/glibcelf.py contents against elf/elf.h.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+import argparse
+import enum
+import sys
+
+import glibcelf
+import glibcextract
+
+errors_encountered = 0
+
+def error(message):
+    global errors_encountered
+    sys.stdout.write('error: {}\n'.format(message))
+    errors_encountered += 1
+
+# The enum constants in glibcelf are expected to have exactly these
+# prefixes.
+expected_constant_prefixes = tuple(
+    'ELFCLASS ELFDATA EM_ ET_ DT_ PF_ PT_ SHF_ SHN_ SHT_ STB_ STT_'.split())
+
+def find_constant_prefix(name):
+    """Returns a matching prefix from expected_constant_prefixes or None."""
+    for prefix in expected_constant_prefixes:
+        if name.startswith(prefix):
+            return prefix
+    return None
+
+def find_enum_types():
+    """A generator for OpenIntEnum and IntFlag classes in glibcelf."""
+    for obj in vars(glibcelf).values():
+        if isinstance(obj, type) and obj.__bases__[0] in (
+                glibcelf._OpenIntEnum, enum.Enum, enum.IntFlag):
+            yield obj
+
+def check_duplicates():
+    """Verifies that enum types do not have duplicate values.
+
+    Different types must have different member names, too.
+
+    """
+    global_seen = {}
+    for typ in find_enum_types():
+        seen = {}
+        last = None
+        for (name, e) in typ.__members__.items():
+            if e.value in seen:
+                error('{} has {}={} and {}={}'.format(
+                    typ, seen[e.value], e.value, name, e.value))
+                last = e
+            else:
+                seen[e.value] = name
+                if last is not None and last.value > e.value:
+                    error('{} has {}={} after {}={}'.format(
+                        typ, name, e.value, last.name, last.value))
+                if name in global_seen:
+                    error('{} used in {} and {}'.format(
+                        name, global_seen[name], typ))
+                else:
+                    global_seen[name] = typ
+
+def check_constant_prefixes():
+    """Check that the constant prefixes match expected_constant_prefixes."""
+    seen = set()
+    for typ in find_enum_types():
+        typ_prefix = None
+        for val in typ:
+            prefix = find_constant_prefix(val.name)
+            if prefix is None:
+                error('constant {!r} for {} has unknown prefix'.format(
+                    val, typ))
+                break
+            elif typ_prefix is None:
+                typ_prefix = prefix
+                seen.add(typ_prefix)
+            elif prefix != typ_prefix:
+                error('prefix {!r} for constant {!r}, expected {!r}'.format(
+                    prefix, val, typ_prefix))
+        if typ_prefix is None:
+            error('empty enum type {}'.format(typ))
+
+    for prefix in sorted(set(expected_constant_prefixes) - seen):
+        error('missing constant prefix {!r}'.format(prefix))
+    # Reverse difference is already covered inside the loop.
+
+def find_elf_h_constants(cc):
+    """Returns a dictionary of relevant constants from <elf.h>."""
+    return glibcextract.compute_macro_consts(
+        source_text='#include <elf.h>',
+        cc=cc,
+        macro_re='|'.join(
+            prefix + '.*' for prefix in expected_constant_prefixes))
+
+# The first part of the pair is a name of an <elf.h> constant that is
+# dropped from glibcelf.  The second part is the constant as it is
+# used in <elf.h>.
+glibcelf_skipped_aliases = (
+    ('EM_ARC_A5', 'EM_ARC_COMPACT'),
+    ('PF_PARISC_SBP', 'PF_HP_SBP')
+)
+
+# Constants that provide little value and are not included in
+# glibcelf: *LO*/*HI* range constants, *NUM constants counting the
+# number of constants.  Also includes the alias names from
+# glibcelf_skipped_aliases.
+glibcelf_skipped_constants = frozenset(
+    [e[0] for e in glibcelf_skipped_aliases]) | frozenset("""
+DT_AARCH64_NUM
+DT_ADDRNUM
+DT_ADDRRNGHI
+DT_ADDRRNGLO
+DT_ALPHA_NUM
+DT_ENCODING
+DT_EXTRANUM
+DT_HIOS
+DT_HIPROC
+DT_IA_64_NUM
+DT_LOOS
+DT_LOPROC
+DT_MIPS_NUM
+DT_NUM
+DT_PPC64_NUM
+DT_PPC_NUM
+DT_PROCNUM
+DT_SPARC_NUM
+DT_VALNUM
+DT_VALRNGHI
+DT_VALRNGLO
+DT_VERSIONTAGNUM
+ELFCLASSNUM
+ELFDATANUM
+ET_HIOS
+ET_HIPROC
+ET_LOOS
+ET_LOPROC
+ET_NUM
+PF_MASKOS
+PF_MASKPROC
+PT_HIOS
+PT_HIPROC
+PT_HISUNW
+PT_LOOS
+PT_LOPROC
+PT_LOSUNW
+SHF_MASKOS
+SHF_MASKPROC
+SHN_HIOS
+SHN_HIPROC
+SHN_HIRESERVE
+SHN_LOOS
+SHN_LOPROC
+SHN_LORESERVE
+SHT_HIOS
+SHT_HIPROC
+SHT_HIPROC
+SHT_HISUNW
+SHT_HIUSER
+SHT_LOOS
+SHT_LOPROC
+SHT_LOSUNW
+SHT_LOUSER
+SHT_NUM
+STB_HIOS
+STB_HIPROC
+STB_LOOS
+STB_LOPROC
+STB_NUM
+STT_HIOS
+STT_HIPROC
+STT_LOOS
+STT_LOPROC
+STT_NUM
+""".strip().split())
+
+def check_constant_values(cc):
+    """Checks the values of <elf.h> constants against glibcelf."""
+
+    glibcelf_constants = {
+        e.name: e for typ in find_enum_types() for e in typ}
+    elf_h_constants = find_elf_h_constants(cc=cc)
+
+    missing_in_glibcelf = (set(elf_h_constants) - set(glibcelf_constants)
+                           - glibcelf_skipped_constants)
+    for name in sorted(missing_in_glibcelf):
+        error('constant {} is missing from glibcelf'.format(name))
+
+    unexpected_in_glibcelf = \
+        set(glibcelf_constants) & glibcelf_skipped_constants
+    for name in sorted(unexpected_in_glibcelf):
+        error('constant {} is supposed to be filtered from glibcelf'.format(
+            name))
+
+    missing_in_elf_h = set(glibcelf_constants) - set(elf_h_constants)
+    for name in sorted(missing_in_elf_h):
+        error('constant {} is missing from <elf.h>'.format(name))
+
+    expected_in_elf_h = glibcelf_skipped_constants - set(elf_h_constants)
+    for name in expected_in_elf_h:
+        error('filtered constant {} is missing from <elf.h>'.format(name))
+
+    for alias_name, name_in_glibcelf in glibcelf_skipped_aliases:
+        if name_in_glibcelf not in glibcelf_constants:
+            error('alias value {} for {} not in glibcelf'.format(
+                name_in_glibcelf, alias_name))
+        elif (int(elf_h_constants[alias_name])
+              != glibcelf_constants[name_in_glibcelf].value):
+            error('<elf.h> has {}={}, glibcelf has {}={}'.format(
+                alias_name, elf_h_constants[alias_name],
+                name_in_glibcelf, glibcelf_constants[name_in_glibcelf]))
+
+    # Check for value mismatches:
+    for name in sorted(set(glibcelf_constants) & set(elf_h_constants)):
+        glibcelf_value = glibcelf_constants[name].value
+        elf_h_value = int(elf_h_constants[name])
+        # On 32-bit architectures <elf.h> as some constants that are
+        # parsed as signed, while they are unsigned in glibcelf.  So
+        # far, this only affects some flag constants, so special-case
+        # them here.
+        if (glibcelf_value != elf_h_value
+            and not (isinstance(glibcelf_constants[name], enum.IntFlag)
+                     and glibcelf_value == 1 << 31
+                     and elf_h_value == -(1 << 31))):
+            error('{}: glibcelf has {!r}, <elf.h> has {!r}'.format(
+                name, glibcelf_value, elf_h_value))
+
+def main():
+    """The main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Check glibcelf.py and elf.h against each other.")
+    parser.add_argument('--cc', metavar='CC',
+                        help='C compiler (including options) to use')
+    args = parser.parse_args()
+
+    check_duplicates()
+    check_constant_prefixes()
+    check_constant_values(cc=args.cc)
+
+    if errors_encountered > 0:
+        print("note: errors encountered:", errors_encountered)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py
new file mode 100644
index 000000000..368ea3349
--- /dev/null
+++ b/elf/tst-relro-symbols.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+# Verify that certain symbols are covered by RELRO.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Analyze a (shared) object to verify that certain symbols are
+present and covered by the PT_GNU_RELRO segment.
+
+"""
+
+import argparse
+import os.path
+import sys
+
+# Make available glibc Python modules.
+sys.path.append(os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
+
+import glibcelf
+
+def find_relro(path: str, img: glibcelf.Image) -> (int, int):
+    """Discover the address range of the PT_GNU_RELRO segment."""
+    for phdr in img.phdrs():
+        if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
+            # The computation is not entirely accurate because
+            # _dl_protect_relro in elf/dl-reloc.c rounds both the
+            # start end and downwards using the run-time page size.
+            return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
+    sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
+    sys.exit(1)
+
+def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
+    """Check if a section or symbol falls within in the RELRO segment."""
+    end = start + size - 1
+    if not (relro_begin <= start < end < relro_end):
+        error(
+            '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
+                kind, name.decode('UTF-8'), start, size,
+                relro_begin, relro_end))
+
+def get_parser():
+    """Return an argument parser for this script."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('object', help='path to object file to check')
+    parser.add_argument('--required', metavar='NAME', default=(),
+                        help='required symbol names', nargs='*')
+    parser.add_argument('--optional', metavar='NAME', default=(),
+                        help='required symbol names', nargs='*')
+    return parser
+
+def main(argv):
+    """The main entry point."""
+    parser = get_parser()
+    opts = parser.parse_args(argv)
+    img = glibcelf.Image.readfile(opts.object)
+
+    required_symbols = frozenset([sym.encode('UTF-8')
+                                  for sym in opts.required])
+    optional_symbols = frozenset([sym.encode('UTF-8')
+                                  for sym in opts.optional])
+    check_symbols = required_symbols | optional_symbols
+
+    # Tracks the symbols in check_symbols that have been found.
+    symbols_found = set()
+
+    # Discover the extent of the RELRO segment.
+    relro_begin, relro_end = find_relro(opts.object, img)
+    symbol_table_found = False
+
+    errors = False
+    def error(msg: str) -> None:
+        """Record an error condition and write a message to standard output."""
+        nonlocal errors
+        errors = True
+        sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
+
+    # Iterate over section headers to find the symbol table.
+    for shdr in img.shdrs():
+        if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
+            symbol_table_found = True
+            for sym in img.syms(shdr):
+                if sym.st_name in check_symbols:
+                    symbols_found.add(sym.st_name)
+
+                    # Validate symbol type, section, and size.
+                    if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
+                        error('symbol {!r} has wrong type {}'.format(
+                            sym.st_name.decode('UTF-8'), sym.st_info.type))
+                    if sym.st_shndx in glibcelf.Shn:
+                        error('symbol {!r} has reserved section {}'.format(
+                            sym.st_name.decode('UTF-8'), sym.st_shndx))
+                        continue
+                    if sym.st_size == 0:
+                        error('symbol {!r} has size zero'.format(
+                            sym.st_name.decode('UTF-8')))
+                        continue
+
+                    check_in_relro('symbol', relro_begin, relro_end,
+                                   sym.st_name, sym.st_value, sym.st_size,
+                                   error)
+            continue # SHT_SYMTAB
+        if shdr.sh_name == b'.data.rel.ro' \
+           or shdr.sh_name.startswith(b'.data.rel.ro.'):
+            check_in_relro('section', relro_begin, relro_end,
+                           shdr.sh_name, shdr.sh_addr, shdr.sh_size,
+                           error)
+            continue
+
+    if required_symbols - symbols_found:
+        for sym in sorted(required_symbols - symbols_found):
+            error('symbol {!r} not found'.format(sym.decode('UTF-8')))
+
+    if errors:
+        sys.exit(1)
+
+    if not symbol_table_found:
+        sys.stdout.write(
+            '{}: warning: no symbol table found (stripped object)\n'.format(
+                opts.object))
+        sys.exit(77)
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/sysdeps/mach/hurd/enbl-secure.c b/elf/tst-tls-allocation-failure-static.c
similarity index 57%
rename from sysdeps/mach/hurd/enbl-secure.c
rename to elf/tst-tls-allocation-failure-static.c
index 8c02789ec..8de831b24 100644
--- a/sysdeps/mach/hurd/enbl-secure.c
+++ b/elf/tst-tls-allocation-failure-static.c
@@ -1,5 +1,5 @@
-/* Define and initialize the `__libc_enable_secure' flag.  Hurd version.
-   Copyright (C) 1998-2022 Free Software Foundation, Inc.
+/* Base for test program with impossiblyh large PT_TLS segment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,15 +16,16 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* There is no need for this file in the Hurd; it is just a placeholder
-   to prevent inclusion of the sysdeps/generic version.
-   In the shared library, the `__libc_enable_secure' variable is defined
-   by the dynamic linker in dl-sysdep.c and set there.
-   In the static library, it is defined in init-first.c and set there.  */
+/* The test actual binary is patched using scripts/tst-elf-edit.py
+   --maximize-tls-size, and this introduces the expected test
+   allocation failure due to an excessive PT_LS p_memsz value.
 
-#include <libc-internal.h>
+   Patching the binary is required because on some 64-bit targets, TLS
+   relocations can only cover a 32-bit range, and glibc-internal TLS
+   variables such as errno end up outside that range.  */
 
-void
-__libc_init_secure (void)
+int
+main (void)
 {
+  return 0;
 }
diff --git a/elf/tst-trace1.exp b/elf/tst-trace1.exp
new file mode 100644
index 000000000..4a6f5211a
--- /dev/null
+++ b/elf/tst-trace1.exp
@@ -0,0 +1,4 @@
+ld 1
+libc 1
+libtracemod2.so 0
+libtracemod3.so 0
diff --git a/elf/tst-trace2.exp b/elf/tst-trace2.exp
new file mode 100644
index 000000000..e13506e2e
--- /dev/null
+++ b/elf/tst-trace2.exp
@@ -0,0 +1,6 @@
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 0
+libtracemod4.so 0
+libtracemod5.so 0
diff --git a/elf/tst-trace3.exp b/elf/tst-trace3.exp
new file mode 100644
index 000000000..e574549d1
--- /dev/null
+++ b/elf/tst-trace3.exp
@@ -0,0 +1,6 @@
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 0
+libtracemod5.so 0
diff --git a/elf/tst-trace4.exp b/elf/tst-trace4.exp
new file mode 100644
index 000000000..31ca97b35
--- /dev/null
+++ b/elf/tst-trace4.exp
@@ -0,0 +1,6 @@
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 1
+libtracemod5.so 0
diff --git a/elf/tst-trace5.exp b/elf/tst-trace5.exp
new file mode 100644
index 000000000..5d7d95372
--- /dev/null
+++ b/elf/tst-trace5.exp
@@ -0,0 +1,6 @@
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 1
+libtracemod5.so 1
diff --git a/iconv/gconv_parseconfdir.h b/iconv/gconv_parseconfdir.h
index c0de54883..b72933b52 100644
--- a/iconv/gconv_parseconfdir.h
+++ b/iconv/gconv_parseconfdir.h
@@ -29,11 +29,14 @@
 # define isspace(__c) __isspace_l ((__c), _nl_C_locobj_ptr)
 # define asprintf __asprintf
 # define opendir __opendir
-# define readdir __readdir
+# define readdir64 __readdir64
 # define closedir __closedir
 # define mempcpy __mempcpy
-# define lstat64 __lstat64
+# define struct_stat64 struct __stat64_t64
+# define lstat64 __lstat64_time64
 # define feof_unlocked __feof_unlocked
+#else
+# define struct_stat64 struct stat64
 #endif
 
 /* Name of the file containing the module information in the directories
@@ -145,8 +148,8 @@ gconv_parseconfdir (const char *prefix, const char *dir, size_t dir_len)
   DIR *confdir = opendir (buf);
   if (confdir != NULL)
     {
-      struct dirent *ent;
-      while ((ent = readdir (confdir)) != NULL)
+      struct dirent64 *ent;
+      while ((ent = readdir64 (confdir)) != NULL)
 	{
 	  if (ent->d_type != DT_REG && ent->d_type != DT_UNKNOWN)
 	    continue;
@@ -158,7 +161,7 @@ gconv_parseconfdir (const char *prefix, const char *dir, size_t dir_len)
 	      && strcmp (ent->d_name + len - strlen (suffix), suffix) == 0)
 	    {
 	      char *conf;
-	      struct stat64 st;
+	      struct_stat64 st;
 	      if (asprintf (&conf, "%s/%s", buf, ent->d_name) < 0)
 		continue;
 
diff --git a/include/arpa/nameser.h b/include/arpa/nameser.h
index 53f1dbc7c..c27e7886b 100644
--- a/include/arpa/nameser.h
+++ b/include/arpa/nameser.h
@@ -55,6 +55,12 @@ int __ns_name_ntop (const unsigned char *, char *, size_t) __THROW;
 int __ns_name_unpack (const unsigned char *, const unsigned char *,
 		      const unsigned char *, unsigned char *, size_t) __THROW;
 
+/* Like ns_samename, but for uncompressed binary names.  Return true
+   if the two arguments compare are equal as case-insensitive domain
+   names.  */
+_Bool __ns_samebinaryname (const unsigned char *, const unsigned char *)
+  attribute_hidden;
+
 #define ns_msg_getflag(handle, flag) \
   (((handle)._flags & _ns_flagdata[flag].mask) >> _ns_flagdata[flag].shift)
 
@@ -89,5 +95,105 @@ libc_hidden_proto (__ns_name_unpack)
 extern __typeof (ns_samename) __libc_ns_samename;
 libc_hidden_proto (__libc_ns_samename)
 
+/* Packet parser helper functions.  */
+
+/* Verify that P points to an uncompressed domain name in wire format.
+   On success, return the length of the encoded name, including the
+   terminating null byte.  On failure, return -1 and set errno.  EOM
+   must point one past the last byte in the packet.  */
+int __ns_name_length_uncompressed (const unsigned char *p,
+				   const unsigned char *eom) attribute_hidden;
+
+/* Iterator over the resource records in a DNS packet.  */
+struct ns_rr_cursor
+{
+  /* These members are not changed after initialization.  */
+  const unsigned char *begin;	/* First byte of packet.  */
+  const unsigned char *end;	/* One past the last byte of the packet.  */
+  const unsigned char *first_rr; /* First resource record (or packet end).  */
+
+  /* Advanced towards the end while reading the packet.  */
+  const unsigned char *current;
+};
+
+/* Returns the RCODE field from the DNS header.  */
+static inline int
+ns_rr_cursor_rcode (const struct ns_rr_cursor *c)
+{
+  return c->begin[3] & 0x0f;	/* Lower 4 bits at offset 3.  */
+}
+
+/* Returns the length of the answer section according to the DNS header.  */
+static inline int
+ns_rr_cursor_ancount (const struct ns_rr_cursor *c)
+{
+  return c->begin[6] * 256 + c->begin[7]; /* 16 bits at offset 6.  */
+}
+
+/* Returns the length of the authority (name server) section according
+   to the DNS header.  */
+static inline int
+ns_rr_cursor_nscount (const struct ns_rr_cursor *c)
+{
+  return c->begin[8] * 256 + c->begin[9]; /* 16 bits at offset 8.  */
+}
+
+/* Returns the length of the additional data section according to the
+   DNS header.  */
+static inline int
+ns_rr_cursor_adcount (const struct ns_rr_cursor *c)
+{
+  return c->begin[10] * 256 + c->begin[11]; /* 16 bits at offset 10.  */
+}
+
+/* Returns a pointer to the uncompressed question name in wire
+   format.  */
+static inline const unsigned char *
+ns_rr_cursor_qname (const struct ns_rr_cursor *c)
+{
+  return c->begin + 12;		/* QNAME starts right after the header.  */
+}
+
+/* Returns the question type of the first and only question.  */
+static inline const int
+ns_rr_cursor_qtype (const struct ns_rr_cursor *c)
+{
+  /* 16 bits 4 bytes back from the first RR header start.  */
+  return c->first_rr[-4] * 256 + c->first_rr[-3];
+}
+
+/* Returns the clss of the first and only question (usally C_IN).  */
+static inline const int
+ns_rr_cursor_qclass (const struct ns_rr_cursor *c)
+{
+  /* 16 bits 2 bytes back from the first RR header start.  */
+  return c->first_rr[-2] * 256 + c->first_rr[-1];
+}
+
+/* Initializes *C to cover the packet [BUF, BUF+LEN).  Returns false
+   if LEN is less than sizeof (*HD), if the packet does not contain a
+   full (uncompressed) question, or if the question count is not 1.  */
+_Bool __ns_rr_cursor_init (struct ns_rr_cursor *c,
+			   const unsigned char *buf, size_t len)
+  attribute_hidden;
+
+/* Like ns_rr, but the record owner name is not decoded into text format.  */
+struct ns_rr_wire
+{
+  unsigned char rname[NS_MAXCDNAME]; /* Owner name of the record.  */
+  uint16_t rtype;		/* Resource record type (T_*).  */
+  uint16_t rclass;		/* Resource record class (C_*).  */
+  uint32_t ttl;			/* Time-to-live field.  */
+  const unsigned char *rdata;	/* Start of resource record data.  */
+  uint16_t rdlength;		/* Length of the data at rdata, in bytes.  */
+};
+
+/* Attempts to parse the record at C into *RR.  On success, return
+   true, and C is advanced past the record, and RR->rdata points to
+   the record data.  On failure, errno is set to EMSGSIZE, and false
+   is returned.  */
+_Bool __ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr)
+  attribute_hidden;
+
 # endif /* !_ISOMAC */
 #endif
diff --git a/include/bits/stdio2-decl.h b/include/bits/stdio2-decl.h
new file mode 100644
index 000000000..bbb052f19
--- /dev/null
+++ b/include/bits/stdio2-decl.h
@@ -0,0 +1 @@
+#include <libio/bits/stdio2-decl.h>
diff --git a/include/bits/wchar2-decl.h b/include/bits/wchar2-decl.h
new file mode 100644
index 000000000..00b1b9334
--- /dev/null
+++ b/include/bits/wchar2-decl.h
@@ -0,0 +1 @@
+#include <wcsmbs/bits/wchar2-decl.h>
diff --git a/include/libc-internal.h b/include/libc-internal.h
index 15920d2bd..c052bccb2 100644
--- a/include/libc-internal.h
+++ b/include/libc-internal.h
@@ -21,9 +21,6 @@
 
 #include <hp-timing.h>
 
-/* Initialize the `__libc_enable_secure' flag.  */
-extern void __libc_init_secure (void);
-
 /* Discover the tick frequency of the machine if something goes wrong,
    we return 0, an impossible hertz.  */
 extern int __profile_frequency (void);
diff --git a/include/register-atfork.h b/include/register-atfork.h
index be631137b..5ebe5a0b3 100644
--- a/include/register-atfork.h
+++ b/include/register-atfork.h
@@ -26,6 +26,7 @@ struct fork_handler
   void (*parent_handler) (void);
   void (*child_handler) (void);
   void *dso_handle;
+  uint64_t id;
 };
 
 /* Function to call to unregister fork handlers.  */
@@ -39,19 +40,18 @@ enum __run_fork_handler_type
   atfork_run_parent
 };
 
-/* Run the atfork handlers and lock/unlock the internal lock depending
-   of the WHO argument:
-
-   - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of
-			 insertion and locks the internal lock.
-   - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal
-		       lock.
-   - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal
-			lock.
-
-   Perform locking only if DO_LOCKING.  */
-extern void __run_fork_handlers (enum __run_fork_handler_type who,
-				 _Bool do_locking) attribute_hidden;
+/* Run the atfork prepare handlers in the reverse order of registration and
+   return the ID of the last registered handler.  If DO_LOCKING is true, the
+   internal lock is held locked upon return.  */
+extern uint64_t __run_prefork_handlers (_Bool do_locking) attribute_hidden;
+
+/* Given a handler type (parent or child), run all the atfork handlers in
+   the order of registration up to and including the handler with id equal
+   to LASTRUN.  If DO_LOCKING is true, the internal lock is unlocked prior
+   to return.  */
+extern void __run_postfork_handlers (enum __run_fork_handler_type who,
+                                     _Bool do_locking,
+                                     uint64_t lastrun) attribute_hidden;
 
 /* C library side function to register new fork handlers.  */
 extern int __register_atfork (void (*__prepare) (void),
diff --git a/include/resolv.h b/include/resolv.h
index 3590b6f49..4dbbac380 100644
--- a/include/resolv.h
+++ b/include/resolv.h
@@ -70,5 +70,8 @@ libc_hidden_proto (__libc_res_nameinquery)
 extern __typeof (__res_queriesmatch) __libc_res_queriesmatch;
 libc_hidden_proto (__libc_res_queriesmatch)
 
+/* Variant of res_hnok which operates on binary (but uncompressed) names.  */
+bool __res_binary_hnok (const unsigned char *dn) attribute_hidden;
+
 # endif /* _RESOLV_H_ && !_ISOMAC */
 #endif
diff --git a/include/unistd.h b/include/unistd.h
index 709016960..af795a37c 100644
--- a/include/unistd.h
+++ b/include/unistd.h
@@ -192,7 +192,6 @@ libc_hidden_proto (__tcsetpgrp)
    and some functions contained in the C library ignore various
    environment variables that normally affect them.  */
 extern int __libc_enable_secure attribute_relro;
-extern int __libc_enable_secure_decided;
 rtld_hidden_proto (__libc_enable_secure)
 
 
diff --git a/inet/ruserpass.c b/inet/ruserpass.c
index d61a72877..75e2a0655 100644
--- a/inet/ruserpass.c
+++ b/inet/ruserpass.c
@@ -95,7 +95,7 @@ ruserpass (const char *host, const char **aname, const char **apass)
 	char *hdir, *buf, *tmp;
 	char myname[1024], *mydomain;
 	int t, usedefault = 0;
-	struct stat64 stb;
+	struct __stat64_t64 stb;
 
 	hdir = __libc_secure_getenv("HOME");
 	if (hdir == NULL) {
@@ -174,7 +174,7 @@ next:
 			break;
 		case PASSWD:
 			if (strcmp(*aname, "anonymous") &&
-			    __fstat64(fileno(cfile), &stb) >= 0 &&
+			    __fstat64_time64(fileno(cfile), &stb) >= 0 &&
 			    (stb.st_mode & 077) != 0) {
 	warnx(_("Error: .netrc file is readable by others."));
 	warnx(_("Remove 'password' line or make file unreadable by others."));
diff --git a/io/Makefile b/io/Makefile
index cf265dc9b..b1710407d 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -83,16 +83,17 @@ tests		:= test-utime test-stat test-stat2 test-lfs tst-getcwd \
 		   tst-ftw-bz28126
 
 tests-time64 := \
+  tst-fcntl-time64 \
+  tst-fts-time64 \
   tst-futimens-time64 \
   tst-futimes-time64\
-  tst-fts-time64 \
+  tst-futimesat-time64 \
+  tst-lchmod-time64 \
   tst-lutimes-time64 \
   tst-stat-time64 \
-  tst-futimesat-time64 \
   tst-utime-time64 \
   tst-utimensat-time64 \
   tst-utimes-time64 \
-  tst-fcntl-time64 \
   # tests-time64
 
 # Likewise for statx, but we do not need static linking here.
@@ -136,6 +137,7 @@ CFLAGS-close.c += -fexceptions -fasynchronous-unwind-tables
 
 CFLAGS-test-stat.c += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
 CFLAGS-test-lfs.c += -D_LARGEFILE64_SOURCE
+CFLAGS-tst-lchmod.c += -D_FILE_OFFSET_BITS=64
 
 test-stat2-ARGS = Makefile . $(objpfx)test-stat2
 
diff --git a/io/tst-lchmod-time64.c b/io/tst-lchmod-time64.c
new file mode 100644
index 000000000..f2b7cc9d3
--- /dev/null
+++ b/io/tst-lchmod-time64.c
@@ -0,0 +1,2 @@
+#define CHECK_TIME64
+#include "tst-lchmod.c"
diff --git a/io/tst-lchmod.c b/io/tst-lchmod.c
index c644f565f..c1c41bda8 100644
--- a/io/tst-lchmod.c
+++ b/io/tst-lchmod.c
@@ -66,10 +66,27 @@ select_path (bool do_relative_path, const char *full_path, const char *relative_
     return full_path;
 }
 
+static void
+update_file_time_to_y2038 (const char *fname, int flags)
+{
+#ifdef CHECK_TIME64
+  /* Y2038 threshold plus 1 second.  */
+  const struct timespec ts[] = { { 0x80000001LL, 0}, { 0x80000001LL } };
+  TEST_VERIFY_EXIT (utimensat (AT_FDCWD, fname, ts, flags) == 0);
+#endif
+}
+
 static void
 test_1 (bool do_relative_path, int (*chmod_func) (int fd, const char *, mode_t, int))
 {
   char *tempdir = support_create_temp_directory ("tst-lchmod-");
+#ifdef CHECK_TIME64
+  if (!support_path_support_time64 (tempdir))
+    {
+      puts ("info: test skipped, filesystem does not support 64 bit time_t");
+      return;
+    }
+#endif
 
   char *path_dangling = xasprintf ("%s/dangling", tempdir);
   char *path_file = xasprintf ("%s/file", tempdir);
@@ -93,9 +110,12 @@ test_1 (bool do_relative_path, int (*chmod_func) (int fd, const char *, mode_t,
   xsymlink ("loop", path_loop);
   xsymlink ("target-does-not-exist", path_dangling);
 
+  update_file_time_to_y2038 (path_file, 0);
+  update_file_time_to_y2038 (path_to_file, AT_SYMLINK_NOFOLLOW);
+
   /* Check that the modes do not collide with what we will use in the
      test.  */
-  struct stat64 st;
+  struct stat st;
   xstat (path_file, &st);
   TEST_VERIFY ((st.st_mode & 0777) != 1);
   xlstat (path_to_file, &st);
diff --git a/io/tst-stat.c b/io/tst-stat.c
index 2b7975e16..237988203 100644
--- a/io/tst-stat.c
+++ b/io/tst-stat.c
@@ -69,6 +69,10 @@ do_test (void)
   TEST_VERIFY_EXIT (fd >= 0);
   support_write_file_string (path, "abc");
 
+  /* This should help to prevent delayed allocation, which may result
+     in a spurious stx_blocks/st_blocks difference.  */
+  fsync (fd);
+
   bool check_ns = support_stat_nanoseconds (path);
   if (!check_ns)
     printf ("warning: timestamp with nanoseconds not supported\n");
diff --git a/libio/Makefile b/libio/Makefile
index 0e5f348be..31831aea8 100644
--- a/libio/Makefile
+++ b/libio/Makefile
@@ -23,7 +23,7 @@ subdir	:= libio
 include ../Makeconfig
 
 headers	:= stdio.h \
-	   bits/stdio.h bits/stdio2.h bits/stdio-ldbl.h \
+	   bits/stdio.h bits/stdio2.h bits/stdio2-decl.h bits/stdio-ldbl.h \
 	   bits/types/FILE.h bits/types/__FILE.h bits/types/struct_FILE.h \
 	   bits/types/__fpos_t.h bits/types/__fpos64_t.h \
 	   bits/types/cookie_io_functions_t.h
diff --git a/libio/bits/stdio2-decl.h b/libio/bits/stdio2-decl.h
new file mode 100644
index 000000000..e398f7182
--- /dev/null
+++ b/libio/bits/stdio2-decl.h
@@ -0,0 +1,111 @@
+/* Checking macros for stdio functions. Declarations only.
+   Copyright (C) 2004-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_STDIO2_DEC_H
+#define _BITS_STDIO2_DEC_H 1
+
+#ifndef _STDIO_H
+# error "Never include <bits/stdio2-decl.h> directly; use <stdio.h> instead."
+#endif
+
+extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen,
+			  const char *__restrict __format, ...) __THROW
+    __attr_access ((__write_only__, 1, 3));
+extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen,
+			   const char *__restrict __format,
+			   __gnuc_va_list __ap) __THROW
+    __attr_access ((__write_only__, 1, 3));
+
+#if defined __USE_ISOC99 || defined __USE_UNIX98
+
+extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag,
+			   size_t __slen, const char *__restrict __format,
+			   ...) __THROW
+    __attr_access ((__write_only__, 1, 2));
+extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag,
+			    size_t __slen, const char *__restrict __format,
+			    __gnuc_va_list __ap) __THROW
+    __attr_access ((__write_only__, 1, 2));
+
+#endif
+
+#if __USE_FORTIFY_LEVEL > 1
+
+extern int __fprintf_chk (FILE *__restrict __stream, int __flag,
+			  const char *__restrict __format, ...);
+extern int __printf_chk (int __flag, const char *__restrict __format, ...);
+extern int __vfprintf_chk (FILE *__restrict __stream, int __flag,
+			   const char *__restrict __format, __gnuc_va_list __ap);
+extern int __vprintf_chk (int __flag, const char *__restrict __format,
+			  __gnuc_va_list __ap);
+
+# ifdef __USE_XOPEN2K8
+extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt,
+			  ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+extern int __vdprintf_chk (int __fd, int __flag,
+			   const char *__restrict __fmt, __gnuc_va_list __arg)
+     __attribute__ ((__format__ (__printf__, 3, 0)));
+# endif
+
+# ifdef __USE_GNU
+
+extern int __asprintf_chk (char **__restrict __ptr, int __flag,
+			   const char *__restrict __fmt, ...)
+     __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur;
+extern int __vasprintf_chk (char **__restrict __ptr, int __flag,
+			    const char *__restrict __fmt, __gnuc_va_list __arg)
+     __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur;
+extern int __obstack_printf_chk (struct obstack *__restrict __obstack,
+				 int __flag, const char *__restrict __format,
+				 ...)
+     __THROW __attribute__ ((__format__ (__printf__, 3, 4)));
+extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack,
+				  int __flag,
+				  const char *__restrict __format,
+				  __gnuc_va_list __args)
+     __THROW __attribute__ ((__format__ (__printf__, 3, 0)));
+
+# endif
+#endif
+
+#if __GLIBC_USE (DEPRECATED_GETS)
+extern char *__gets_chk (char *__str, size_t) __wur;
+#endif
+
+extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n,
+			  FILE *__restrict __stream)
+    __wur __attr_access ((__write_only__, 1, 3));
+
+extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen,
+			   size_t __size, size_t __n,
+			   FILE *__restrict __stream) __wur;
+
+#ifdef __USE_GNU
+extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size,
+				   int __n, FILE *__restrict __stream)
+    __wur __attr_access ((__write_only__, 1, 3));
+#endif
+
+#ifdef __USE_MISC
+# undef fread_unlocked
+extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen,
+				    size_t __size, size_t __n,
+				    FILE *__restrict __stream) __wur;
+#endif
+
+#endif /* bits/stdio2-decl.h.  */
diff --git a/libio/bits/stdio2.h b/libio/bits/stdio2.h
index b0b655ee7..b1e200e71 100644
--- a/libio/bits/stdio2.h
+++ b/libio/bits/stdio2.h
@@ -23,14 +23,6 @@
 # error "Never include <bits/stdio2.h> directly; use <stdio.h> instead."
 #endif
 
-extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen,
-			  const char *__restrict __format, ...) __THROW
-    __attr_access ((__write_only__, 1, 3));
-extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen,
-			   const char *__restrict __format,
-			   __gnuc_va_list __ap) __THROW
-    __attr_access ((__write_only__, 1, 3));
-
 #ifdef __va_arg_pack
 __fortify_function int
 __NTH (sprintf (char *__restrict __s, const char *__restrict __fmt, ...))
@@ -54,15 +46,6 @@ __NTH (vsprintf (char *__restrict __s, const char *__restrict __fmt,
 }
 
 #if defined __USE_ISOC99 || defined __USE_UNIX98
-
-extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag,
-			   size_t __slen, const char *__restrict __format,
-			   ...) __THROW
-    __attr_access ((__write_only__, 1, 2));
-extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag,
-			    size_t __slen, const char *__restrict __format,
-			    __gnuc_va_list __ap) __THROW;
-
 # ifdef __va_arg_pack
 __fortify_function int
 __NTH (snprintf (char *__restrict __s, size_t __n,
@@ -89,15 +72,6 @@ __NTH (vsnprintf (char *__restrict __s, size_t __n,
 #endif
 
 #if __USE_FORTIFY_LEVEL > 1
-
-extern int __fprintf_chk (FILE *__restrict __stream, int __flag,
-			  const char *__restrict __format, ...);
-extern int __printf_chk (int __flag, const char *__restrict __format, ...);
-extern int __vfprintf_chk (FILE *__restrict __stream, int __flag,
-			   const char *__restrict __format, __gnuc_va_list __ap);
-extern int __vprintf_chk (int __flag, const char *__restrict __format,
-			  __gnuc_va_list __ap);
-
 # ifdef __va_arg_pack
 __fortify_function int
 fprintf (FILE *__restrict __stream, const char *__restrict __fmt, ...)
@@ -136,12 +110,6 @@ vfprintf (FILE *__restrict __stream,
 }
 
 # ifdef __USE_XOPEN2K8
-extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt,
-			  ...) __attribute__ ((__format__ (__printf__, 3, 4)));
-extern int __vdprintf_chk (int __fd, int __flag,
-			   const char *__restrict __fmt, __gnuc_va_list __arg)
-     __attribute__ ((__format__ (__printf__, 3, 0)));
-
 #  ifdef __va_arg_pack
 __fortify_function int
 dprintf (int __fd, const char *__restrict __fmt, ...)
@@ -162,23 +130,6 @@ vdprintf (int __fd, const char *__restrict __fmt, __gnuc_va_list __ap)
 # endif
 
 # ifdef __USE_GNU
-
-extern int __asprintf_chk (char **__restrict __ptr, int __flag,
-			   const char *__restrict __fmt, ...)
-     __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur;
-extern int __vasprintf_chk (char **__restrict __ptr, int __flag,
-			    const char *__restrict __fmt, __gnuc_va_list __arg)
-     __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur;
-extern int __obstack_printf_chk (struct obstack *__restrict __obstack,
-				 int __flag, const char *__restrict __format,
-				 ...)
-     __THROW __attribute__ ((__format__ (__printf__, 3, 4)));
-extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack,
-				  int __flag,
-				  const char *__restrict __format,
-				  __gnuc_va_list __args)
-     __THROW __attribute__ ((__format__ (__printf__, 3, 0)));
-
 #  ifdef __va_arg_pack
 __fortify_function int
 __NTH (asprintf (char **__restrict __ptr, const char *__restrict __fmt, ...))
@@ -231,7 +182,6 @@ __NTH (obstack_vprintf (struct obstack *__restrict __obstack,
 #endif
 
 #if __GLIBC_USE (DEPRECATED_GETS)
-extern char *__gets_chk (char *__str, size_t) __wur;
 extern char *__REDIRECT (__gets_warn, (char *__str), gets)
      __wur __warnattr ("please use fgets or getline instead, gets can't "
 		       "specify buffer size");
@@ -245,9 +195,6 @@ gets (char *__str)
 }
 #endif
 
-extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n,
-			  FILE *__restrict __stream)
-    __wur __attr_access ((__write_only__, 1, 3));
 extern char *__REDIRECT (__fgets_alias,
 			 (char *__restrict __s, int __n,
 			  FILE *__restrict __stream), fgets)
@@ -269,9 +216,6 @@ fgets (char *__restrict __s, int __n, FILE *__restrict __stream)
   return __fgets_chk (__s, sz, __n, __stream);
 }
 
-extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen,
-			   size_t __size, size_t __n,
-			   FILE *__restrict __stream) __wur;
 extern size_t __REDIRECT (__fread_alias,
 			  (void *__restrict __ptr, size_t __size,
 			   size_t __n, FILE *__restrict __stream),
@@ -297,9 +241,6 @@ fread (void *__restrict __ptr, size_t __size, size_t __n,
 }
 
 #ifdef __USE_GNU
-extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size,
-				   int __n, FILE *__restrict __stream)
-    __wur __attr_access ((__write_only__, 1, 3));
 extern char *__REDIRECT (__fgets_unlocked_alias,
 			 (char *__restrict __s, int __n,
 			  FILE *__restrict __stream), fgets_unlocked)
@@ -324,9 +265,6 @@ fgets_unlocked (char *__restrict __s, int __n, FILE *__restrict __stream)
 
 #ifdef __USE_MISC
 # undef fread_unlocked
-extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen,
-				    size_t __size, size_t __n,
-				    FILE *__restrict __stream) __wur;
 extern size_t __REDIRECT (__fread_unlocked_alias,
 			  (void *__restrict __ptr, size_t __size,
 			   size_t __n, FILE *__restrict __stream),
diff --git a/libio/stdio.h b/libio/stdio.h
index e6425341c..0e0f16b46 100644
--- a/libio/stdio.h
+++ b/libio/stdio.h
@@ -885,20 +885,27 @@ extern void funlockfile (FILE *__stream) __THROW;
 extern int __uflow (FILE *);
 extern int __overflow (FILE *, int);
 
+#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Declare all functions from bits/stdio2-decl.h first.  */
+# include <bits/stdio2-decl.h>
+#endif
+
+/* The following headers provide asm redirections.  These redirections must
+   appear before the first usage of these functions, e.g. in bits/stdio.h.  */
+#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
+# include <bits/stdio-ldbl.h>
+#endif
+
 /* If we are compiling with optimizing read this file.  It contains
    several optimizing inline functions and macros.  */
 #ifdef __USE_EXTERN_INLINES
 # include <bits/stdio.h>
 #endif
 #if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Now include the function definitions and redirects too.  */
 # include <bits/stdio2.h>
 #endif
 
-#include <bits/floatn.h>
-#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
-# include <bits/stdio-ldbl.h>
-#endif
-
 __END_DECLS
 
 #endif /* <stdio.h> included.  */
diff --git a/locale/programs/ld-monetary.c b/locale/programs/ld-monetary.c
index 3b0412b40..18698bbe9 100644
--- a/locale/programs/ld-monetary.c
+++ b/locale/programs/ld-monetary.c
@@ -196,21 +196,105 @@ No definition for %s category found"), "LC_MONETARY");
 	}
     }
 
+  /* Generally speaking there are 3 standards the define the default,
+     warning, and error behaviour of LC_MONETARY.  They are ISO/IEC TR 30112,
+     ISO/IEC 9899:2018 (ISO C17), and POSIX.1-2017.  Within 30112 we have the
+     definition of a standard i18n FDCC-set, which for LC_MONETARY has the
+     following default values:
+	int_curr_symbol		""
+	currency_symbol		""
+	mon_decimal_point	"<U002C>" i.e. ","
+	mon_thousand_sep	""
+	mon_grouping		"\177" i.e. CHAR_MAX
+	positive_sign		""
+	negative_sign		"<U002E>" i.e. "."
+	int_frac_digits		-1
+	frac_digits		-1
+	p_cs_precedes		-1
+	p_sep_by_space		-1
+	n_cs_precedes		-1
+	n_sep_by_space		-1
+	p_sign_posn		-1
+	n_sign_posn		-1
+    Under 30112 a keyword that is not provided implies an empty string ""
+    for string values or a -1 for integer values, and indicates the value
+    is unspecified with no default implied.  No errors are considered.
+    The exception is mon_grouping which is a string with a terminating
+    CHAR_MAX.
+    For POSIX Issue 7 we have:
+    https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html
+    and again values not provided default to "" or -1, and indicate the value
+    is not available to the locale.  The exception is mon_grouping which is
+    a string with a terminating CHAR_MAX.  For the POSIX locale the values of
+    LC_MONETARY should be:
+	int_curr_symbol		""
+	currency_symbol		""
+	mon_decimal_point	""
+	mon_thousands_sep	""
+	mon_grouping		"\177" i.e. CHAR_MAX 
+	positive_sign		""
+	negative_sign		""
+	int_frac_digits		-1
+	frac_digits		-1
+	p_cs_precedes		-1
+	p_sep_by_space		-1
+	n_cs_precedes		-1
+	n_sep_by_space		-1
+	p_sign_posn		-1
+	n_sign_posn		-1
+	int_p_cs_precedes	-1
+	int_p_sep_by_space	-1
+	int_n_cs_precedes	-1
+	int_n_sep_by_space	-1
+	int_p_sign_posn		-1
+	int_n_sign_posn		-1
+    Like with 30112, POSIX also considers no error if the keywords are
+    missing, only that if the cateory as a whole is missing the referencing
+    of the category results in unspecified behaviour.
+    For ISO C17 there is no default value provided, but the localeconv
+    specification in 7.11.2.1 admits that members of char * type may point
+    to "" to indicate a value is not available or is of length zero.
+    The exception is decimal_point (not mon_decimal_point) which must be a
+    defined non-empty string.  The values of char, which are generally
+    mapped to integer values in 30112 and POSIX, must be non-negative
+    numbers that map to CHAR_MAX when a value is not available in the
+    locale.
+    In ISO C17 for the "C" locale all values are empty strings "", or
+    CHAR_MAX, with the exception of decimal_point which is "." (defined
+    in LC_NUMERIC).  ISO C17 makes no exception for mon_grouping like
+    30112 and POSIX, but a value of "" is functionally equivalent to
+    "\177" since neither defines a grouping (though the latter terminates
+    the grouping).
+
+    Lastly, we must consider the legacy C/POSIX locale that implemented
+    as a builtin in glibc and wether a default value mapping to the
+    C/POSIX locale may benefit the user from a compatibility perspective.
+
+    Thus given 30112, POSIX, ISO C, and the builtin C/POSIX locale we
+    need to pick appropriate defaults below.   */
+
+  /* The members of LC_MONETARY are handled in the order of their definition
+     in locale/categories.def.  Please keep them in that order.  */
+
+  /* The purpose of TEST_ELEM is to define a default value for the fields
+     in the category if the field was not defined in the cateory.  If the
+     category was present but we didn't see a definition for the field then
+     we also issue a warning, otherwise the only warning you get is the one
+     earlier when a default category is created (completely missing category).
+     This missing field warning is glibc-specific since no standard requires
+     this warning, but we consider it valuable to print a warning for all
+     missing fields in the category.  */
 #define TEST_ELEM(cat, initval) \
   if (monetary->cat == NULL)						      \
     {									      \
       if (! nothing)							      \
-	record_error (0, 0, _("%s: field `%s' not defined"),		      \
-		      "LC_MONETARY", #cat);				      \
+	record_warning (_("%s: field `%s' not defined"),		      \
+			"LC_MONETARY", #cat);				      \
       monetary->cat = initval;						      \
     }
 
+  /* Keyword: int_curr_symbol.  */
   TEST_ELEM (int_curr_symbol, "");
-  TEST_ELEM (currency_symbol, "");
-  TEST_ELEM (mon_thousands_sep, "");
-  TEST_ELEM (positive_sign, "");
-  TEST_ELEM (negative_sign, "");
-
   /* The international currency symbol must come from ISO 4217.  */
   if (monetary->int_curr_symbol != NULL)
     {
@@ -247,41 +331,63 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"),
 	}
     }
 
-  /* The decimal point must not be empty.  This is not said explicitly
-     in POSIX but ANSI C (ISO/IEC 9899) says in 4.4.2.1 it has to be
-     != "".  */
+  /* Keyword: currency_symbol */
+  TEST_ELEM (currency_symbol, "");
+
+  /* Keyword: mon_decimal_point */
+  /* ISO C17 7.11.2.1.3 explicitly allows mon_decimal_point to be the
+     empty string e.g. "".  This indicates the value is not available in the
+     current locale or is of zero length.  However, if the value was never
+     defined then we issue a warning and use a glibc-specific default.  ISO
+     30112 in the i18n FDCC-Set uses <U002C> ",", and POSIX Issue 7 in the
+     POSIX locale uses "".  It is specific to glibc that the default is <U002E>
+     "."; we retain this existing behaviour for backwards compatibility.  */
   if (monetary->mon_decimal_point == NULL)
     {
       if (! nothing)
-	record_error (0, 0, _("%s: field `%s' not defined"),
-		      "LC_MONETARY", "mon_decimal_point");
+	record_warning (_("%s: field `%s' not defined, using defaults"),
+			"LC_MONETARY", "mon_decimal_point");
       monetary->mon_decimal_point = ".";
       monetary->mon_decimal_point_wc = L'.';
     }
-  else if (monetary->mon_decimal_point[0] == '\0' && ! be_quiet && ! nothing)
+
+  /* Keyword: mon_thousands_sep */
+  if (monetary->mon_thousands_sep == NULL)
     {
-      record_error (0, 0, _("\
-%s: value for field `%s' must not be an empty string"),
-		    "LC_MONETARY", "mon_decimal_point");
+      if (! nothing)
+	record_warning (_("%s: field `%s' not defined, using defaults"),
+			"LC_MONETARY", "mon_thousands_sep");
+      monetary->mon_thousands_sep = "";
+      monetary->mon_thousands_sep_wc = L'\0';
     }
 
+  /* Keyword: mon_grouping */
   if (monetary->mon_grouping_len == 0)
     {
       if (! nothing)
-	record_error (0, 0, _("%s: field `%s' not defined"),
-		      "LC_MONETARY", "mon_grouping");
-
+	record_warning (_("%s: field `%s' not defined"),
+			"LC_MONETARY", "mon_grouping");
+      /* Missing entries are given 1 element in their bytearray with
+	 a value of CHAR_MAX which indicates that "No further grouping
+	 is to be performed" (functionally equivalent to ISO C's "C"
+	 locale default of ""). */
       monetary->mon_grouping = (char *) "\177";
       monetary->mon_grouping_len = 1;
     }
 
+  /* Keyword: positive_sign */
+  TEST_ELEM (positive_sign, "");
+
+  /* Keyword: negative_sign */
+  TEST_ELEM (negative_sign, "");
+
 #undef TEST_ELEM
 #define TEST_ELEM(cat, min, max, initval) \
   if (monetary->cat == -2)						      \
     {									      \
        if (! nothing)							      \
-	 record_error (0, 0, _("%s: field `%s' not defined"),		      \
-		       "LC_MONETARY", #cat);				      \
+	 record_warning (_("%s: field `%s' not defined"),		      \
+			 "LC_MONETARY", #cat);				      \
        monetary->cat = initval;						      \
     }									      \
   else if ((monetary->cat < min || monetary->cat > max)			      \
@@ -300,16 +406,11 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"),
   TEST_ELEM (p_sign_posn, -1, 4, -1);
   TEST_ELEM (n_sign_posn, -1, 4, -1);
 
-  /* The non-POSIX.2 extensions are optional.  */
-  if (monetary->duo_int_curr_symbol == NULL)
-    monetary->duo_int_curr_symbol = monetary->int_curr_symbol;
-  if (monetary->duo_currency_symbol == NULL)
-    monetary->duo_currency_symbol = monetary->currency_symbol;
-
-  if (monetary->duo_int_frac_digits == -2)
-    monetary->duo_int_frac_digits = monetary->int_frac_digits;
-  if (monetary->duo_frac_digits == -2)
-    monetary->duo_frac_digits = monetary->frac_digits;
+  /* Keyword: crncystr */
+  monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol)
+					 + 2);
+  monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+';
+  strcpy (&monetary->crncystr[1], monetary->currency_symbol);
 
 #undef TEST_ELEM
 #define TEST_ELEM(cat, alt, min, max) \
@@ -327,6 +428,17 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"),
   TEST_ELEM (int_p_sign_posn, p_sign_posn, -1, 4);
   TEST_ELEM (int_n_sign_posn, n_sign_posn, -1, 4);
 
+  /* The non-POSIX.2 extensions are optional.  */
+  if (monetary->duo_int_curr_symbol == NULL)
+    monetary->duo_int_curr_symbol = monetary->int_curr_symbol;
+  if (monetary->duo_currency_symbol == NULL)
+    monetary->duo_currency_symbol = monetary->currency_symbol;
+
+  if (monetary->duo_int_frac_digits == -2)
+    monetary->duo_int_frac_digits = monetary->int_frac_digits;
+  if (monetary->duo_frac_digits == -2)
+    monetary->duo_frac_digits = monetary->frac_digits;
+
   TEST_ELEM (duo_p_cs_precedes, p_cs_precedes, -1, 1);
   TEST_ELEM (duo_p_sep_by_space, p_sep_by_space, -1, 2);
   TEST_ELEM (duo_n_cs_precedes, n_cs_precedes, -1, 1);
@@ -349,17 +461,15 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"),
   if (monetary->duo_valid_to == 0)
     monetary->duo_valid_to = 99991231;
 
+  /* Keyword: conversion_rate */
   if (monetary->conversion_rate[0] == 0)
     {
       monetary->conversion_rate[0] = 1;
       monetary->conversion_rate[1] = 1;
     }
 
-  /* Create the crncystr entry.  */
-  monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol)
-					 + 2);
-  monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+';
-  strcpy (&monetary->crncystr[1], monetary->currency_symbol);
+  /* A value for monetary-decimal-point-wc was set when
+     monetary_decimal_point was set, likewise for monetary-thousands-sep-wc.  */
 }
 
 
diff --git a/locale/programs/locarchive.c b/locale/programs/locarchive.c
index 45408c26c..eeb2fa6ff 100644
--- a/locale/programs/locarchive.c
+++ b/locale/programs/locarchive.c
@@ -1397,7 +1397,7 @@ add_locales_to_archive (size_t nlist, char *list[], bool replace)
 		    {
 		      char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
 
-		      if (d_type == DT_UNKNOWN)
+		      if (d_type == DT_UNKNOWN || d_type == DT_LNK)
 			{
 			  strcpy (stpcpy (stpcpy (fullname, fname), "/"),
 				  d->d_name);
diff --git a/localedata/Makefile b/localedata/Makefile
index 9ae2e5c16..7741ac3b5 100644
--- a/localedata/Makefile
+++ b/localedata/Makefile
@@ -468,11 +468,11 @@ define build-one-locale
 endef
 
 $(INSTALL-SUPPORTED-LOCALE-ARCHIVE): install-locales-dir
-	@flags="-c"; \
+	@flags=""; \
 	$(build-one-locale)
 
 $(INSTALL-SUPPORTED-LOCALE-FILES): install-locales-dir
-	@flags="-c --no-archive --no-hard-links"; \
+	@flags="--no-archive --no-hard-links"; \
 	$(build-one-locale)
 
 tst-setlocale-ENV = LC_ALL=ja_JP.EUC-JP
diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
index 7fce35f21..8053c816a 100644
--- a/localedata/gen-locale.sh
+++ b/localedata/gen-locale.sh
@@ -54,8 +54,14 @@ modifier=`echo $locfile|sed 's|[^.]*[.]\([^@ ]*\)\(@[^ ]*\)\?/LC_CTYPE|\2|'`
 
 echo "Generating locale $locale.$charmap: this might take a while..."
 
-# Run quietly and force output.
-flags="--quiet -c"
+# Do not force output with '-c', all locales should compile without
+# warning or errors.  There is likewise no need to run quietly with
+# '--quiet' since all locales should compile without additional
+# diagnostics.  If there are messages printed then we want to see
+# them, fix them, and the associated error or warning.  During
+# development it may be beneficialy to put '--quiet -c' here to allow
+# you to develop in-progress locales.
+flags=""
 
 # For SJIS the charmap is SHIFT_JIS. We just want the locale to have
 # a slightly nicer name instead of using "*.SHIFT_SJIS", but that
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 1a1ac1d8f..fe9cb9b80 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -292,19 +292,14 @@
 # define __assert_fail(assertion, file, line, function)			\
 	 __malloc_assert(assertion, file, line, function)
 
-extern const char *__progname;
-
-static void
+_Noreturn static void
 __malloc_assert (const char *assertion, const char *file, unsigned int line,
 		 const char *function)
 {
-  (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n",
-		     __progname, __progname[0] ? ": " : "",
-		     file, line,
-		     function ? function : "", function ? ": " : "",
-		     assertion);
-  fflush (stderr);
-  abort ();
+  __libc_message (do_abort, "\
+Fatal glibc error: malloc assertion failure in %s: %s\n",
+		  function, assertion);
+  __builtin_unreachable ();
 }
 #endif
 #endif
diff --git a/misc/daemon.c b/misc/daemon.c
index 0e688f4d7..3c73ac2ab 100644
--- a/misc/daemon.c
+++ b/misc/daemon.c
@@ -61,11 +61,10 @@ daemon (int nochdir, int noclose)
 		(void)__chdir("/");
 
 	if (!noclose) {
-		struct stat64 st;
+		struct __stat64_t64 st;
 
 		if ((fd = __open_nocancel(_PATH_DEVNULL, O_RDWR, 0)) != -1
-		    && (__builtin_expect (__fstat64 (fd, &st), 0)
-			== 0)) {
+		    && __glibc_likely (__fstat64_time64 (fd, &st) == 0)) {
 			if (__builtin_expect (S_ISCHR (st.st_mode), 1) != 0
 #if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
 			    && (st.st_rdev
diff --git a/misc/getusershell.c b/misc/getusershell.c
index 11f5aa83f..4221095dc 100644
--- a/misc/getusershell.c
+++ b/misc/getusershell.c
@@ -97,7 +97,7 @@ initshells (void)
 {
 	char **sp, *cp;
 	FILE *fp;
-	struct stat64 statb;
+	struct __stat64_t64 statb;
 	size_t flen;
 
 	free(shells);
@@ -106,7 +106,7 @@ initshells (void)
 	strings = NULL;
 	if ((fp = fopen(_PATH_SHELLS, "rce")) == NULL)
 		goto init_okshells_noclose;
-	if (__fstat64(fileno(fp), &statb) == -1) {
+	if (__fstat64_time64(fileno(fp), &statb) == -1) {
 	init_okshells:
 		(void)fclose(fp);
 	init_okshells_noclose:
diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
index 44d3826bc..1c2b044a0 100644
--- a/misc/sys/cdefs.h
+++ b/misc/sys/cdefs.h
@@ -162,13 +162,13 @@
    || (__builtin_constant_p (__l) && (__l) > 0))
 
 /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
-   condition can be folded to a constant and if it is true.  The -1 check is
-   redundant because since it implies that __glibc_safe_len_cond is true.  */
+   condition can be folded to a constant and if it is true, or unknown (-1) */
 #define __glibc_safe_or_unknown_len(__l, __s, __osz) \
-  (__glibc_unsigned_or_positive (__l)					      \
-   && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l),     \
-						   __s, __osz))		      \
-   && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
+  ((__builtin_constant_p (__osz) && (__osz) == (__SIZE_TYPE__) -1)	      \
+   || (__glibc_unsigned_or_positive (__l)				      \
+       && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+						       (__s), (__osz)))	      \
+       && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
 
 /* Conversely, we know at compile time that the length is unsafe if the
    __L * __S <= __OBJSZ condition can be folded to a constant and if it is
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 34a33164f..01a282f3f 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -119,8 +119,6 @@ get_cached_stack (size_t *sizep, void **memp)
 
   /* Cancellation handling is back to the default.  */
   result->cancelhandling = 0;
-  result->cancelstate = PTHREAD_CANCEL_ENABLE;
-  result->canceltype = PTHREAD_CANCEL_DEFERRED;
   result->cleanup = NULL;
   result->setup_failed = 0;
 
diff --git a/nptl/cancellation.c b/nptl/cancellation.c
index 8d54a6add..f4b08902b 100644
--- a/nptl/cancellation.c
+++ b/nptl/cancellation.c
@@ -30,19 +30,26 @@ int
 __pthread_enable_asynccancel (void)
 {
   struct pthread *self = THREAD_SELF;
+  int oldval = atomic_load_relaxed (&self->cancelhandling);
 
-  int oldval = THREAD_GETMEM (self, canceltype);
-  THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_ASYNCHRONOUS);
+  while (1)
+    {
+      int newval = oldval | CANCELTYPE_BITMASK;
 
-  int ch = THREAD_GETMEM (self, cancelhandling);
+      if (newval == oldval)
+	break;
 
-  if (self->cancelstate == PTHREAD_CANCEL_ENABLE
-      && (ch & CANCELED_BITMASK)
-      && !(ch & EXITING_BITMASK)
-      && !(ch & TERMINATED_BITMASK))
-    {
-      THREAD_SETMEM (self, result, PTHREAD_CANCELED);
-      __do_cancel ();
+      if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						&oldval, newval))
+	{
+	  if (cancel_enabled_and_canceled_and_async (newval))
+	    {
+	      self->result = PTHREAD_CANCELED;
+	      __do_cancel ();
+	    }
+
+	  break;
+	}
     }
 
   return oldval;
@@ -56,10 +63,29 @@ __pthread_disable_asynccancel (int oldtype)
 {
   /* If asynchronous cancellation was enabled before we do not have
      anything to do.  */
-  if (oldtype == PTHREAD_CANCEL_ASYNCHRONOUS)
+  if (oldtype & CANCELTYPE_BITMASK)
     return;
 
   struct pthread *self = THREAD_SELF;
-  self->canceltype = PTHREAD_CANCEL_DEFERRED;
+  int newval;
+  int oldval = atomic_load_relaxed (&self->cancelhandling);
+  do
+    {
+      newval = oldval & ~CANCELTYPE_BITMASK;
+    }
+  while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						&oldval, newval));
+
+  /* We cannot return when we are being canceled.  Upon return the
+     thread might be things which would have to be undone.  The
+     following loop should loop until the cancellation signal is
+     delivered.  */
+  while (__glibc_unlikely ((newval & (CANCELING_BITMASK | CANCELED_BITMASK))
+			   == CANCELING_BITMASK))
+    {
+      futex_wait_simple ((unsigned int *) &self->cancelhandling, newval,
+			 FUTEX_PRIVATE);
+      newval = atomic_load_relaxed (&self->cancelhandling);
+    }
 }
 libc_hidden_def (__pthread_disable_asynccancel)
diff --git a/nptl/cleanup_defer.c b/nptl/cleanup_defer.c
index f8181a40e..4e864ead3 100644
--- a/nptl/cleanup_defer.c
+++ b/nptl/cleanup_defer.c
@@ -30,9 +30,22 @@ ___pthread_register_cancel_defer (__pthread_unwind_buf_t *buf)
   ibuf->priv.data.prev = THREAD_GETMEM (self, cleanup_jmp_buf);
   ibuf->priv.data.cleanup = THREAD_GETMEM (self, cleanup);
 
-  /* Disable asynchronous cancellation for now.  */
-  ibuf->priv.data.canceltype = THREAD_GETMEM (self, canceltype);
-  THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED);
+  int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+  if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK))
+    {
+      int newval;
+      do
+	{
+	  newval = cancelhandling & ~CANCELTYPE_BITMASK;
+	}
+      while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						    &cancelhandling,
+						    newval));
+    }
+
+  ibuf->priv.data.canceltype = (cancelhandling & CANCELTYPE_BITMASK
+				? PTHREAD_CANCEL_ASYNCHRONOUS
+				: PTHREAD_CANCEL_DEFERRED);
 
   /* Store the new cleanup handler info.  */
   THREAD_SETMEM (self, cleanup_jmp_buf, (struct pthread_unwind_buf *) buf);
@@ -54,9 +67,26 @@ ___pthread_unregister_cancel_restore (__pthread_unwind_buf_t *buf)
 
   THREAD_SETMEM (self, cleanup_jmp_buf, ibuf->priv.data.prev);
 
-  THREAD_SETMEM (self, canceltype, ibuf->priv.data.canceltype);
-  if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
-    __pthread_testcancel ();
+  if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_DEFERRED)
+    return;
+
+  int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+  if ((cancelhandling & CANCELTYPE_BITMASK) == 0)
+    {
+      int newval;
+      do
+	{
+	  newval = cancelhandling | CANCELTYPE_BITMASK;
+	}
+      while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						    &cancelhandling, newval));
+
+      if (cancel_enabled_and_canceled (cancelhandling))
+	{
+	  self->result = PTHREAD_CANCELED;
+	  __do_cancel ();
+	}
+    }
 }
 versioned_symbol (libc, ___pthread_unregister_cancel_restore,
 		  __pthread_unregister_cancel_restore, GLIBC_2_34);
diff --git a/nptl/descr.h b/nptl/descr.h
index ea8aca08e..bb46b5958 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -279,18 +279,27 @@ struct pthread
 
   /* Flags determining processing of cancellation.  */
   int cancelhandling;
+  /* Bit set if cancellation is disabled.  */
+#define CANCELSTATE_BIT		0
+#define CANCELSTATE_BITMASK	(1 << CANCELSTATE_BIT)
+  /* Bit set if asynchronous cancellation mode is selected.  */
+#define CANCELTYPE_BIT		1
+#define CANCELTYPE_BITMASK	(1 << CANCELTYPE_BIT)
+  /* Bit set if canceling has been initiated.  */
+#define CANCELING_BIT		2
+#define CANCELING_BITMASK	(1 << CANCELING_BIT)
   /* Bit set if canceled.  */
 #define CANCELED_BIT		3
-#define CANCELED_BITMASK	(0x01 << CANCELED_BIT)
+#define CANCELED_BITMASK	(1 << CANCELED_BIT)
   /* Bit set if thread is exiting.  */
 #define EXITING_BIT		4
-#define EXITING_BITMASK		(0x01 << EXITING_BIT)
+#define EXITING_BITMASK		(1 << EXITING_BIT)
   /* Bit set if thread terminated and TCB is freed.  */
 #define TERMINATED_BIT		5
-#define TERMINATED_BITMASK	(0x01 << TERMINATED_BIT)
+#define TERMINATED_BITMASK	(1 << TERMINATED_BIT)
   /* Bit set if thread is supposed to change XID.  */
 #define SETXID_BIT		6
-#define SETXID_BITMASK		(0x01 << SETXID_BIT)
+#define SETXID_BITMASK		(1 << SETXID_BIT)
 
   /* Flags.  Including those copied from the thread attribute.  */
   int flags;
@@ -390,14 +399,6 @@ struct pthread
   /* Indicates whether is a C11 thread created by thrd_creat.  */
   bool c11;
 
-  /* Thread cancel state (PTHREAD_CANCEL_ENABLE or
-     PTHREAD_CANCEL_DISABLE).  */
-  unsigned char cancelstate;
-
-  /* Thread cancel type (PTHREAD_CANCEL_DEFERRED or
-     PTHREAD_CANCEL_ASYNCHRONOUS).  */
-  unsigned char canceltype;
-
   /* Used in __pthread_kill_internal to detected a thread that has
      exited or is about to exit.  exit_lock must only be acquired
      after blocking signals.  */
@@ -417,6 +418,22 @@ struct pthread
   (sizeof (struct pthread) - offsetof (struct pthread, end_padding))
 } __attribute ((aligned (TCB_ALIGNMENT)));
 
+static inline bool
+cancel_enabled_and_canceled (int value)
+{
+  return (value & (CANCELSTATE_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
+		   | TERMINATED_BITMASK))
+    == CANCELED_BITMASK;
+}
+
+static inline bool
+cancel_enabled_and_canceled_and_async (int value)
+{
+  return ((value) & (CANCELSTATE_BITMASK | CANCELTYPE_BITMASK | CANCELED_BITMASK
+		     | EXITING_BITMASK | TERMINATED_BITMASK))
+    == (CANCELTYPE_BITMASK | CANCELED_BITMASK);
+}
+
 /* This yields the pointer that TLS support code calls the thread pointer.  */
 #if TLS_TCB_AT_TP
 # define TLS_TPADJ(pd) (pd)
diff --git a/nptl/libc-cleanup.c b/nptl/libc-cleanup.c
index cb4c22628..2ce59388d 100644
--- a/nptl/libc-cleanup.c
+++ b/nptl/libc-cleanup.c
@@ -26,9 +26,24 @@ __libc_cleanup_push_defer (struct _pthread_cleanup_buffer *buffer)
 
   buffer->__prev = THREAD_GETMEM (self, cleanup);
 
+  int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+
   /* Disable asynchronous cancellation for now.  */
-  buffer->__canceltype = THREAD_GETMEM (self, canceltype);
-  THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED);
+  if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK))
+    {
+      int newval;
+      do
+	{
+	  newval = cancelhandling & ~CANCELTYPE_BITMASK;
+	}
+      while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						    &cancelhandling,
+						    newval));
+    }
+
+  buffer->__canceltype = (cancelhandling & CANCELTYPE_BITMASK
+			  ? PTHREAD_CANCEL_ASYNCHRONOUS
+			  : PTHREAD_CANCEL_DEFERRED);
 
   THREAD_SETMEM (self, cleanup, buffer);
 }
@@ -41,8 +56,23 @@ __libc_cleanup_pop_restore (struct _pthread_cleanup_buffer *buffer)
 
   THREAD_SETMEM (self, cleanup, buffer->__prev);
 
-  THREAD_SETMEM (self, canceltype, buffer->__canceltype);
-  if (buffer->__canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
-      __pthread_testcancel ();
+  int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+  if (buffer->__canceltype != PTHREAD_CANCEL_DEFERRED
+      && (cancelhandling & CANCELTYPE_BITMASK) == 0)
+    {
+      int newval;
+      do
+	{
+	  newval = cancelhandling | CANCELTYPE_BITMASK;
+	}
+      while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						    &cancelhandling, newval));
+
+      if (cancel_enabled_and_canceled (cancelhandling))
+	{
+	  self->result = PTHREAD_CANCELED;
+	  __do_cancel ();
+	}
+    }
 }
 libc_hidden_def (__libc_cleanup_pop_restore)
diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
index 7524c7ce4..e67b2df5c 100644
--- a/nptl/pthread_cancel.c
+++ b/nptl/pthread_cancel.c
@@ -42,18 +42,29 @@ sigcancel_handler (int sig, siginfo_t *si, void *ctx)
 
   struct pthread *self = THREAD_SELF;
 
-  int ch = atomic_load_relaxed (&self->cancelhandling);
-  /* Cancelation not enabled, not cancelled, or already exitting.  */
-  if (self->cancelstate == PTHREAD_CANCEL_DISABLE
-      || (ch & CANCELED_BITMASK) == 0
-      || (ch & EXITING_BITMASK) != 0)
-    return;
-
-  /* Set the return value.  */
-  THREAD_SETMEM (self, result, PTHREAD_CANCELED);
-  /* Make sure asynchronous cancellation is still enabled.  */
-  if (self->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
-    __do_cancel ();
+  int oldval = atomic_load_relaxed (&self->cancelhandling);
+  while (1)
+    {
+      /* We are canceled now.  When canceled by another thread this flag
+	 is already set but if the signal is directly send (internally or
+	 from another process) is has to be done here.  */
+      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
+
+      if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
+	/* Already canceled or exiting.  */
+	break;
+
+      if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						&oldval, newval))
+	{
+	  self->result = PTHREAD_CANCELED;
+
+	  /* Make sure asynchronous cancellation is still enabled.  */
+	  if ((oldval & CANCELTYPE_BITMASK) != 0)
+	    /* Run the registered destructors and terminate the thread.  */
+	    __do_cancel ();
+	}
+    }
 }
 
 int
@@ -92,29 +103,71 @@ __pthread_cancel (pthread_t th)
   }
 #endif
 
-  int oldch = atomic_fetch_or_acquire (&pd->cancelhandling, CANCELED_BITMASK);
-  if ((oldch & CANCELED_BITMASK) != 0)
-    return 0;
-
-  if (pd == THREAD_SELF)
+  /* Some syscalls are never restarted after being interrupted by a signal
+     handler, regardless of the use of SA_RESTART (they always fail with
+     EINTR).  So pthread_cancel cannot send SIGCANCEL unless the cancellation
+     is enabled and set as asynchronous (in this case the cancellation will
+     be acted in the cancellation handler instead by the syscall wrapper).
+     Otherwise the target thread is set as 'cancelling' (CANCELING_BITMASK)
+     by atomically setting 'cancelhandling' and the cancelation will be acted
+     upon on next cancellation entrypoing in the target thread.
+
+     It also requires to atomically check if cancellation is enabled and
+     asynchronous, so both cancellation state and type are tracked on
+     'cancelhandling'.  */
+
+  int result = 0;
+  int oldval = atomic_load_relaxed (&pd->cancelhandling);
+  int newval;
+  do
     {
-      /* A single-threaded process should be able to kill itself, since there
-	 is nothing in the POSIX specification that says that it cannot.  So
-	 we set multiple_threads to true so that cancellation points get
-	 executed.  */
-      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
+    again:
+      newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
+      if (oldval == newval)
+	break;
+
+      /* If the cancellation is handled asynchronously just send a
+	 signal.  We avoid this if possible since it's more
+	 expensive.  */
+      if (cancel_enabled_and_canceled_and_async (newval))
+	{
+	  /* Mark the cancellation as "in progress".  */
+	  int newval2 = oldval | CANCELING_BITMASK;
+	  if (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling,
+						     &oldval, newval2))
+	    goto again;
+
+	  if (pd == THREAD_SELF)
+	    /* This is not merely an optimization: An application may
+	       call pthread_cancel (pthread_self ()) without calling
+	       pthread_create, so the signal handler may not have been
+	       set up for a self-cancel.  */
+	    {
+	      pd->result = PTHREAD_CANCELED;
+	      if ((newval & CANCELTYPE_BITMASK) != 0)
+		__do_cancel ();
+	    }
+	  else
+	    /* The cancellation handler will take care of marking the
+	       thread as canceled.  */
+	    result = __pthread_kill_internal (th, SIGCANCEL);
+
+	  break;
+	}
+
+	/* A single-threaded process should be able to kill itself, since
+	   there is nothing in the POSIX specification that says that it
+	   cannot.  So we set multiple_threads to true so that cancellation
+	   points get executed.  */
+	THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
       __libc_multiple_threads = 1;
 #endif
-
-      THREAD_SETMEM (pd, result, PTHREAD_CANCELED);
-      if (pd->cancelstate == PTHREAD_CANCEL_ENABLE
-	  && pd->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
-	__do_cancel ();
-      return 0;
     }
+  while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
+						newval));
 
-  return __pthread_kill_internal (th, SIGCANCEL);
+  return result;
 }
 versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
 
diff --git a/nptl/pthread_join_common.c b/nptl/pthread_join_common.c
index a8e884f34..ca3245b0a 100644
--- a/nptl/pthread_join_common.c
+++ b/nptl/pthread_join_common.c
@@ -57,12 +57,9 @@ __pthread_clockjoin_ex (pthread_t threadid, void **thread_return,
   if ((pd == self
        || (self->joinid == pd
 	   && (pd->cancelhandling
-	       & (CANCELED_BITMASK | EXITING_BITMASK
+	       & (CANCELING_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
 		  | TERMINATED_BITMASK)) == 0))
-      && !(self->cancelstate == PTHREAD_CANCEL_ENABLE
-	   && (pd->cancelhandling & (CANCELED_BITMASK | EXITING_BITMASK
-				     | TERMINATED_BITMASK))
-	       == CANCELED_BITMASK))
+      && !cancel_enabled_and_canceled (self->cancelhandling))
     /* This is a deadlock situation.  The threads are waiting for each
        other to finish.  Note that this is a "may" error.  To be 100%
        sure we catch this error we would have to lock the data
diff --git a/nptl/pthread_setcancelstate.c b/nptl/pthread_setcancelstate.c
index 9905b12e4..f8edf18fb 100644
--- a/nptl/pthread_setcancelstate.c
+++ b/nptl/pthread_setcancelstate.c
@@ -30,9 +30,29 @@ __pthread_setcancelstate (int state, int *oldstate)
 
   self = THREAD_SELF;
 
-  if (oldstate != NULL)
-    *oldstate = self->cancelstate;
-  self->cancelstate = state;
+  int oldval = atomic_load_relaxed (&self->cancelhandling);
+  while (1)
+    {
+      int newval = (state == PTHREAD_CANCEL_DISABLE
+		    ? oldval | CANCELSTATE_BITMASK
+		    : oldval & ~CANCELSTATE_BITMASK);
+
+      if (oldstate != NULL)
+	*oldstate = ((oldval & CANCELSTATE_BITMASK)
+		     ? PTHREAD_CANCEL_DISABLE : PTHREAD_CANCEL_ENABLE);
+
+      if (oldval == newval)
+	break;
+
+      if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						&oldval, newval))
+	{
+	  if (cancel_enabled_and_canceled_and_async (newval))
+	    __do_cancel ();
+
+	  break;
+	}
+    }
 
   return 0;
 }
diff --git a/nptl/pthread_setcanceltype.c b/nptl/pthread_setcanceltype.c
index 94e56466d..1307d355c 100644
--- a/nptl/pthread_setcanceltype.c
+++ b/nptl/pthread_setcanceltype.c
@@ -28,11 +28,32 @@ __pthread_setcanceltype (int type, int *oldtype)
 
   volatile struct pthread *self = THREAD_SELF;
 
-  if (oldtype != NULL)
-    *oldtype = self->canceltype;
-  self->canceltype = type;
-  if (type == PTHREAD_CANCEL_ASYNCHRONOUS)
-    __pthread_testcancel ();
+  int oldval = atomic_load_relaxed (&self->cancelhandling);
+  while (1)
+    {
+      int newval = (type == PTHREAD_CANCEL_ASYNCHRONOUS
+		    ? oldval | CANCELTYPE_BITMASK
+		    : oldval & ~CANCELTYPE_BITMASK);
+
+      if (oldtype != NULL)
+	*oldtype = ((oldval & CANCELTYPE_BITMASK)
+		    ? PTHREAD_CANCEL_ASYNCHRONOUS : PTHREAD_CANCEL_DEFERRED);
+
+      if (oldval == newval)
+	break;
+
+      if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+						&oldval, newval))
+	{
+	  if (cancel_enabled_and_canceled_and_async (newval))
+	    {
+	      THREAD_SETMEM (self, result, PTHREAD_CANCELED);
+	      __do_cancel ();
+	    }
+
+	  break;
+	}
+    }
 
   return 0;
 }
diff --git a/nptl/pthread_testcancel.c b/nptl/pthread_testcancel.c
index 13123608e..b81928c00 100644
--- a/nptl/pthread_testcancel.c
+++ b/nptl/pthread_testcancel.c
@@ -23,13 +23,10 @@ void
 ___pthread_testcancel (void)
 {
   struct pthread *self = THREAD_SELF;
-  int cancelhandling = THREAD_GETMEM (self, cancelhandling);
-  if (self->cancelstate == PTHREAD_CANCEL_ENABLE
-      && (cancelhandling & CANCELED_BITMASK)
-      && !(cancelhandling & EXITING_BITMASK)
-      && !(cancelhandling & TERMINATED_BITMASK))
+  int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+  if (cancel_enabled_and_canceled (cancelhandling))
     {
-      THREAD_SETMEM (self, result, PTHREAD_CANCELED);
+      self->result = PTHREAD_CANCELED;
       __do_cancel ();
     }
 }
diff --git a/nptl/unwind.c b/nptl/unwind.c
index c3563e346..33b0d8757 100644
--- a/nptl/unwind.c
+++ b/nptl/unwind.c
@@ -25,7 +25,7 @@
 #include <jmpbuf-unwind.h>
 #include <shlib-compat.h>
 
-#ifdef _STACK_GROWS_DOWN
+#if _STACK_GROWS_DOWN
 # define FRAME_LEFT(frame, other, adj) \
   ((uintptr_t) frame - adj >= (uintptr_t) other - adj)
 #elif _STACK_GROWS_UP
diff --git a/nscd/connections.c b/nscd/connections.c
index 61d1674eb..531d2e83d 100644
--- a/nscd/connections.c
+++ b/nscd/connections.c
@@ -2284,7 +2284,8 @@ main_loop_epoll (int efd)
 					     sizeof (buf))) != -1)
 	      ;
 
-	    __bump_nl_timestamp ();
+	    dbs[hstdb].head->extra_data[NSCD_HST_IDX_CONF_TIMESTAMP]
+	      = __bump_nl_timestamp ();
 	  }
 # endif
 	else
diff --git a/nss/Makefile b/nss/Makefile
index 552e5d03e..de439d491 100644
--- a/nss/Makefile
+++ b/nss/Makefile
@@ -60,7 +60,8 @@ tests			= test-netdb test-digits-dots tst-nss-getpwent bug17079 \
 			  tst-nss-test1 \
 			  tst-nss-test2 \
 			  tst-nss-test4 \
-			  tst-nss-test5
+			  tst-nss-test5 \
+			  tst-nss-test_errno
 xtests			= bug-erange
 
 tests-container = \
@@ -132,7 +133,7 @@ libnss_compat-inhibit-o	= $(filter-out .os,$(object-suffixes))
 ifeq ($(build-static-nss),yes)
 tests-static		+= tst-nss-static
 endif
-extra-test-objs		+= nss_test1.os nss_test2.os
+extra-test-objs		+= nss_test1.os nss_test2.os nss_test_errno.os
 
 include ../Rules
 
@@ -166,22 +167,26 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver
 
 libof-nss_test1 = extramodules
 libof-nss_test2 = extramodules
+libof-nss_test_errno = extramodules
 $(objpfx)/libnss_test1.so: $(objpfx)nss_test1.os $(link-libc-deps)
 	$(build-module)
 $(objpfx)/libnss_test2.so: $(objpfx)nss_test2.os $(link-libc-deps)
 	$(build-module)
+$(objpfx)/libnss_test_errno.so: $(objpfx)nss_test_errno.os $(link-libc-deps)
+	$(build-module)
 $(objpfx)nss_test2.os : nss_test1.c
-ifdef libnss_test1.so-version
-$(objpfx)/libnss_test1.so$(libnss_test1.so-version): $(objpfx)/libnss_test1.so
+# Use the nss_files suffix for these objects as well.
+$(objpfx)/libnss_test1.so$(libnss_files.so-version): $(objpfx)/libnss_test1.so
 	$(make-link)
-endif
-ifdef libnss_test2.so-version
-$(objpfx)/libnss_test2.so$(libnss_test2.so-version): $(objpfx)/libnss_test2.so
+$(objpfx)/libnss_test2.so$(libnss_files.so-version): $(objpfx)/libnss_test2.so
+	$(make-link)
+$(objpfx)/libnss_test_errno.so$(libnss_files.so-version): \
+  $(objpfx)/libnss_test_errno.so
 	$(make-link)
-endif
 $(patsubst %,$(objpfx)%.out,$(tests) $(tests-container)) : \
-	$(objpfx)/libnss_test1.so$(libnss_test1.so-version) \
-	$(objpfx)/libnss_test2.so$(libnss_test2.so-version)
+	$(objpfx)/libnss_test1.so$(libnss_files.so-version) \
+	$(objpfx)/libnss_test2.so$(libnss_files.so-version) \
+	$(objpfx)/libnss_test_errno.so$(libnss_files.so-version)
 
 ifeq (yes,$(have-thread-library))
 $(objpfx)tst-cancel-getpwuid_r: $(shared-thread-library)
@@ -197,3 +202,4 @@ LDFLAGS-tst-nss-test2 = -Wl,--disable-new-dtags
 LDFLAGS-tst-nss-test3 = -Wl,--disable-new-dtags
 LDFLAGS-tst-nss-test4 = -Wl,--disable-new-dtags
 LDFLAGS-tst-nss-test5 = -Wl,--disable-new-dtags
+LDFLAGS-tst-nss-test_errno = -Wl,--disable-new-dtags
diff --git a/nss/XXX-lookup.c b/nss/XXX-lookup.c
index db9593767..bfc57b8e6 100644
--- a/nss/XXX-lookup.c
+++ b/nss/XXX-lookup.c
@@ -15,6 +15,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#include <assert.h>
 #include "nsswitch.h"
 
 /*******************************************************************\
@@ -54,6 +55,10 @@ DB_LOOKUP_FCT (nss_action_list *ni, const char *fct_name, const char *fct2_name,
 
   *ni = DATABASE_NAME_SYMBOL;
 
+  /* We want to know about it if we've somehow got a NULL action list;
+   in the past, we had bad state if seccomp interfered with setup. */
+  assert(*ni != NULL);
+
   return __nss_lookup (ni, fct_name, fct2_name, fctp);
 }
 libc_hidden_def (DB_LOOKUP_FCT)
diff --git a/nss/nss_database.c b/nss/nss_database.c
index d56c5b798..f2ed2f2c2 100644
--- a/nss/nss_database.c
+++ b/nss/nss_database.c
@@ -420,23 +420,32 @@ nss_database_check_reload_and_get (struct nss_database_state *local,
       return true;
     }
 
-  /* Before we reload, verify that "/" hasn't changed.  We assume that
-     errors here are very unlikely, but the chance that we're entering
-     a container is also very unlikely, so we err on the side of both
-     very unlikely things not happening at the same time.  */
-  if (__stat64_time64 ("/", &str) != 0
-      || (local->root_ino != 0
-	  && (str.st_ino != local->root_ino
-	      ||  str.st_dev != local->root_dev)))
+  int stat_rv = __stat64_time64 ("/", &str);
+
+  if (local->data.services[database_index] != NULL)
     {
-      /* Change detected; disable reloading and return current state.  */
-      atomic_store_release (&local->data.reload_disabled, 1);
-      *result = local->data.services[database_index];
-      __libc_lock_unlock (local->lock);
-      return true;
+      /* Before we reload, verify that "/" hasn't changed.  We assume that
+        errors here are very unlikely, but the chance that we're entering
+        a container is also very unlikely, so we err on the side of both
+        very unlikely things not happening at the same time.  */
+      if (stat_rv != 0
+	  || (local->root_ino != 0
+	      && (str.st_ino != local->root_ino
+		  ||  str.st_dev != local->root_dev)))
+	{
+        /* Change detected; disable reloading and return current state.  */
+        atomic_store_release (&local->data.reload_disabled, 1);
+        *result = local->data.services[database_index];
+        __libc_lock_unlock (local->lock);
+        return true;
+      }
+    }
+  if (stat_rv == 0)
+    {
+      local->root_ino = str.st_ino;
+      local->root_dev = str.st_dev;
     }
-  local->root_ino = str.st_ino;
-  local->root_dev = str.st_dev;
+
   __libc_lock_unlock (local->lock);
 
   /* Avoid overwriting the global configuration until we have loaded
diff --git a/nss/nss_module.c b/nss/nss_module.c
index f9a1263e5..f00bbd9e1 100644
--- a/nss/nss_module.c
+++ b/nss/nss_module.c
@@ -330,8 +330,18 @@ name_search (const void *left, const void *right)
 void *
 __nss_module_get_function (struct nss_module *module, const char *name)
 {
+  /* A successful dlopen might clobber errno.   */
+  int saved_errno = errno;
+
   if (!__nss_module_load (module))
-    return NULL;
+    {
+      /* Reporting module load failure is currently inaccurate.  See
+	 bug 22041.  Not changing errno is the conservative choice.  */
+      __set_errno (saved_errno);
+      return NULL;
+    }
+
+  __set_errno (saved_errno);
 
   function_name *name_entry = bsearch (name, nss_function_name_array,
                                        array_length (nss_function_name_array),
diff --git a/nss/nss_test_errno.c b/nss/nss_test_errno.c
new file mode 100644
index 000000000..59a5c717b
--- /dev/null
+++ b/nss/nss_test_errno.c
@@ -0,0 +1,58 @@
+/* NSS service provider with errno clobber.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <nss.h>
+#include <stdlib.h>
+
+/* Catch misnamed and functions.  */
+#pragma GCC diagnostic error "-Wmissing-prototypes"
+NSS_DECLARE_MODULE_FUNCTIONS (test_errno)
+
+static void __attribute__ ((constructor))
+init (void)
+{
+  /* An arbitrary error code which is otherwise not used.  */
+  errno = -1009;
+}
+
+/* Lookup functions for pwd follow that do not return any data.  */
+
+/* Catch misnamed function definitions.  */
+
+enum nss_status
+_nss_test_errno_setpwent (int stayopen)
+{
+  setenv ("_nss_test_errno_setpwent", "yes", 1);
+  return NSS_STATUS_SUCCESS;
+}
+
+enum nss_status
+_nss_test_errno_getpwent_r (struct passwd *result,
+                            char *buffer, size_t size, int *errnop)
+{
+  setenv ("_nss_test_errno_getpwent_r", "yes", 1);
+  return NSS_STATUS_NOTFOUND;
+}
+
+enum nss_status
+_nss_test_errno_endpwent (void)
+{
+  setenv ("_nss_test_errno_endpwent", "yes", 1);
+  return NSS_STATUS_SUCCESS;
+}
diff --git a/nss/tst-nss-test_errno.c b/nss/tst-nss-test_errno.c
new file mode 100644
index 000000000..d2c42dd36
--- /dev/null
+++ b/nss/tst-nss-test_errno.c
@@ -0,0 +1,61 @@
+/* getpwent failure when dlopen clobbers errno (bug 28953).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nss.h>
+#include <support/check.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <pwd.h>
+#include <string.h>
+
+static int
+do_test (void)
+{
+  __nss_configure_lookup ("passwd", "files test_errno");
+
+  errno = 0;
+  setpwent ();
+  TEST_COMPARE (errno, 0);
+
+  bool root_seen = false;
+  while (true)
+    {
+      errno = 0;
+      struct passwd *e = getpwent ();
+      if (e == NULL)
+        break;
+      if (strcmp (e->pw_name, "root"))
+        root_seen = true;
+    }
+
+  TEST_COMPARE (errno, 0);
+  TEST_VERIFY (root_seen);
+
+  errno = 0;
+  endpwent ();
+  TEST_COMPARE (errno, 0);
+
+  TEST_COMPARE_STRING (getenv ("_nss_test_errno_setpwent"), "yes");
+  TEST_COMPARE_STRING (getenv ("_nss_test_errno_getpwent_r"), "yes");
+  TEST_COMPARE_STRING (getenv ("_nss_test_errno_endpwent"), "yes");
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/posix/fork.c b/posix/fork.c
index 6b50c091f..e1be3422e 100644
--- a/posix/fork.c
+++ b/posix/fork.c
@@ -46,8 +46,9 @@ __libc_fork (void)
      best effort to make is async-signal-safe at least for single-thread
      case.  */
   bool multiple_threads = __libc_single_threaded == 0;
+  uint64_t lastrun;
 
-  __run_fork_handlers (atfork_run_prepare, multiple_threads);
+  lastrun = __run_prefork_handlers (multiple_threads);
 
   struct nss_database_data nss_database_data;
 
@@ -105,7 +106,7 @@ __libc_fork (void)
       reclaim_stacks ();
 
       /* Run the handlers registered for the child.  */
-      __run_fork_handlers (atfork_run_child, multiple_threads);
+      __run_postfork_handlers (atfork_run_child, multiple_threads, lastrun);
     }
   else
     {
@@ -123,7 +124,7 @@ __libc_fork (void)
 	}
 
       /* Run the handlers registered for the parent.  */
-      __run_fork_handlers (atfork_run_parent, multiple_threads);
+      __run_postfork_handlers (atfork_run_parent, multiple_threads, lastrun);
 
       if (pid < 0)
 	__set_errno (save_errno);
diff --git a/posix/glob.c b/posix/glob.c
index a2b5aabad..f6993a370 100644
--- a/posix/glob.c
+++ b/posix/glob.c
@@ -21,13 +21,14 @@
    optimizes away the pattern == NULL test below.  */
 # define _GL_ARG_NONNULL(params)
 
-# include <config.h>
+# include <libc-config.h>
 
 #endif
 
 #include <glob.h>
 
 #include <errno.h>
+#include <fcntl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <stdbool.h>
@@ -56,6 +57,8 @@
 # define sysconf(id) __sysconf (id)
 # define closedir(dir) __closedir (dir)
 # define opendir(name) __opendir (name)
+# undef dirfd
+# define dirfd(str) __dirfd (str)
 # define readdir(str) __readdir64 (str)
 # define getpwnam_r(name, bufp, buf, len, res) \
     __getpwnam_r (name, bufp, buf, len, res)
@@ -69,11 +72,8 @@
 # ifndef GLOB_LSTAT
 #  define GLOB_LSTAT            gl_lstat
 # endif
-# ifndef GLOB_STAT64
-#  define GLOB_STAT64           __stat64
-# endif
-# ifndef GLOB_LSTAT64
-#  define GLOB_LSTAT64          __lstat64
+# ifndef GLOB_FSTATAT64
+#  define GLOB_FSTATAT64        __fstatat64
 # endif
 # include <shlib-compat.h>
 #else /* !_LIBC */
@@ -88,8 +88,7 @@
 # define struct_stat            struct stat
 # define struct_stat64          struct stat
 # define GLOB_LSTAT             gl_lstat
-# define GLOB_STAT64            stat
-# define GLOB_LSTAT64           lstat
+# define GLOB_FSTATAT64         fstatat
 #endif /* _LIBC */
 
 #include <fnmatch.h>
@@ -215,7 +214,8 @@ glob_lstat (glob_t *pglob, int flags, const char *fullname)
   } ust;
   return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
           ? pglob->GLOB_LSTAT (fullname, &ust.st)
-          : GLOB_LSTAT64 (fullname, &ust.st64));
+          : GLOB_FSTATAT64 (AT_FDCWD, fullname, &ust.st64,
+                            AT_SYMLINK_NOFOLLOW));
 }
 
 /* Set *R = A + B.  Return true if the answer is mathematically
@@ -257,7 +257,8 @@ is_dir (char const *filename, int flags, glob_t const *pglob)
   struct_stat64 st64;
   return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
           ? pglob->gl_stat (filename, &st) == 0 && S_ISDIR (st.st_mode)
-          : GLOB_STAT64 (filename, &st64) == 0 && S_ISDIR (st64.st_mode));
+          : (GLOB_FSTATAT64 (AT_FDCWD, filename, &st64, 0) == 0
+             && S_ISDIR (st64.st_mode)));
 }
 
 /* Find the end of the sub-pattern in a brace expression.  */
@@ -747,6 +748,8 @@ __glob (const char *pattern, int flags, int (*errfunc) (const char *, int),
       else
         {
 #ifndef WINDOWS32
+          /* Recognize ~user as a shorthand for the specified user's home
+             directory.  */
           char *end_name = strchr (dirname, '/');
           char *user_name;
           int malloc_user_name = 0;
@@ -885,7 +888,22 @@ __glob (const char *pattern, int flags, int (*errfunc) (const char *, int),
               }
             scratch_buffer_free (&pwtmpbuf);
           }
-#endif /* !WINDOWS32 */
+#else /* WINDOWS32 */
+          /* On native Windows, access to a user's home directory
+             (via GetUserProfileDirectory) or to a user's environment
+             variables (via ExpandEnvironmentStringsForUser) requires
+             the credentials of the user.  Therefore we cannot support
+             the ~user syntax on this platform.
+             Handling ~user specially (and treat it like plain ~) if
+             user is getenv ("USERNAME") would not be a good idea,
+             since it would make people think that ~user is supported
+             in general.  */
+          if (flags & GLOB_TILDE_CHECK)
+            {
+              retval = GLOB_NOMATCH;
+              goto out;
+            }
+#endif /* WINDOWS32 */
         }
     }
 
@@ -1266,6 +1284,8 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
 {
   size_t dirlen = strlen (directory);
   void *stream = NULL;
+  struct scratch_buffer s;
+  scratch_buffer_init (&s);
 # define GLOBNAMES_MEMBERS(nnames) \
     struct globnames *next; size_t count; char *name[nnames];
   struct globnames { GLOBNAMES_MEMBERS (FLEXIBLE_ARRAY_MEMBER) };
@@ -1337,6 +1357,7 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
         }
       else
         {
+          int dfd = dirfd (stream);
           int fnm_flags = ((!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0)
                            | ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0));
           flags |= GLOB_MAGCHAR;
@@ -1364,8 +1385,32 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
               if (flags & GLOB_ONLYDIR)
                 switch (readdir_result_type (d))
                   {
-                  case DT_DIR: case DT_LNK: case DT_UNKNOWN: break;
                   default: continue;
+                  case DT_DIR: break;
+                  case DT_LNK: case DT_UNKNOWN:
+                    /* The filesystem was too lazy to give us a hint,
+                       so we have to do it the hard way.  */
+                    if (__glibc_unlikely (dfd < 0 || flags & GLOB_ALTDIRFUNC))
+                      {
+                        size_t namelen = strlen (d.name);
+                        size_t need = dirlen + 1 + namelen + 1;
+                        if (s.length < need
+                            && !scratch_buffer_set_array_size (&s, need, 1))
+                          goto memory_error;
+                        char *p = mempcpy (s.data, directory, dirlen);
+                        *p = '/';
+                        p += p[-1] != '/';
+                        memcpy (p, d.name, namelen + 1);
+                        if (! is_dir (s.data, flags, pglob))
+                          continue;
+                      }
+                    else
+                      {
+                        struct_stat64 st64;
+                        if (! (GLOB_FSTATAT64 (dfd, d.name, &st64, 0) == 0
+                               && S_ISDIR (st64.st_mode)))
+                          continue;
+                      }
                   }
 
               if (fnmatch (pattern, d.name, fnm_flags) == 0)
@@ -1497,5 +1542,6 @@ glob_in_dir (const char *pattern, const char *directory, int flags,
       __set_errno (save);
     }
 
+  scratch_buffer_free (&s);
   return result;
 }
diff --git a/posix/register-atfork.c b/posix/register-atfork.c
index 74b1b5840..c039fb454 100644
--- a/posix/register-atfork.c
+++ b/posix/register-atfork.c
@@ -18,6 +18,8 @@
 #include <libc-lock.h>
 #include <stdbool.h>
 #include <register-atfork.h>
+#include <intprops.h>
+#include <stdio.h>
 
 #define DYNARRAY_ELEMENT           struct fork_handler
 #define DYNARRAY_STRUCT            fork_handler_list
@@ -26,7 +28,7 @@
 #include <malloc/dynarray-skeleton.c>
 
 static struct fork_handler_list fork_handlers;
-static bool fork_handler_init = false;
+static uint64_t fork_handler_counter;
 
 static int atfork_lock = LLL_LOCK_INITIALIZER;
 
@@ -36,11 +38,8 @@ __register_atfork (void (*prepare) (void), void (*parent) (void),
 {
   lll_lock (atfork_lock, LLL_PRIVATE);
 
-  if (!fork_handler_init)
-    {
-      fork_handler_list_init (&fork_handlers);
-      fork_handler_init = true;
-    }
+  if (fork_handler_counter == 0)
+    fork_handler_list_init (&fork_handlers);
 
   struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers);
   if (newp != NULL)
@@ -49,6 +48,13 @@ __register_atfork (void (*prepare) (void), void (*parent) (void),
       newp->parent_handler = parent;
       newp->child_handler = child;
       newp->dso_handle = dso_handle;
+
+      /* IDs assigned to handlers start at 1 and increment with handler
+         registration.  Un-registering a handlers discards the corresponding
+         ID.  It is not reused in future registrations.  */
+      if (INT_ADD_OVERFLOW (fork_handler_counter, 1))
+        __libc_fatal ("fork handler counter overflow");
+      newp->id = ++fork_handler_counter;
     }
 
   /* Release the lock.  */
@@ -103,37 +109,111 @@ __unregister_atfork (void *dso_handle)
   lll_unlock (atfork_lock, LLL_PRIVATE);
 }
 
-void
-__run_fork_handlers (enum __run_fork_handler_type who, _Bool do_locking)
+uint64_t
+__run_prefork_handlers (_Bool do_locking)
 {
-  struct fork_handler *runp;
+  uint64_t lastrun;
 
-  if (who == atfork_run_prepare)
+  if (do_locking)
+    lll_lock (atfork_lock, LLL_PRIVATE);
+
+  /* We run prepare handlers from last to first.  After fork, only
+     handlers up to the last handler found here (pre-fork) will be run.
+     Handlers registered during __run_prefork_handlers or
+     __run_postfork_handlers will be positioned after this last handler, and
+     since their prepare handlers won't be run now, their parent/child
+     handlers should also be ignored.  */
+  lastrun = fork_handler_counter;
+
+  size_t sl = fork_handler_list_size (&fork_handlers);
+  for (size_t i = sl; i > 0;)
     {
-      if (do_locking)
-	lll_lock (atfork_lock, LLL_PRIVATE);
-      size_t sl = fork_handler_list_size (&fork_handlers);
-      for (size_t i = sl; i > 0; i--)
-	{
-	  runp = fork_handler_list_at (&fork_handlers, i - 1);
-	  if (runp->prepare_handler != NULL)
-	    runp->prepare_handler ();
-	}
+      struct fork_handler *runp
+        = fork_handler_list_at (&fork_handlers, i - 1);
+
+      uint64_t id = runp->id;
+
+      if (runp->prepare_handler != NULL)
+        {
+          if (do_locking)
+            lll_unlock (atfork_lock, LLL_PRIVATE);
+
+          runp->prepare_handler ();
+
+          if (do_locking)
+            lll_lock (atfork_lock, LLL_PRIVATE);
+        }
+
+      /* We unlocked, ran the handler, and locked again.  In the
+         meanwhile, one or more deregistrations could have occurred leading
+         to the current (just run) handler being moved up the list or even
+         removed from the list itself.  Since handler IDs are guaranteed to
+         to be in increasing order, the next handler has to have:  */
+
+      /* A. An earlier position than the current one has.  */
+      i--;
+
+      /* B. A lower ID than the current one does.  The code below skips
+         any newly added handlers with higher IDs.  */
+      while (i > 0
+             && fork_handler_list_at (&fork_handlers, i - 1)->id >= id)
+        i--;
     }
-  else
+
+  return lastrun;
+}
+
+void
+__run_postfork_handlers (enum __run_fork_handler_type who, _Bool do_locking,
+                         uint64_t lastrun)
+{
+  size_t sl = fork_handler_list_size (&fork_handlers);
+  for (size_t i = 0; i < sl;)
     {
-      size_t sl = fork_handler_list_size (&fork_handlers);
-      for (size_t i = 0; i < sl; i++)
-	{
-	  runp = fork_handler_list_at (&fork_handlers, i);
-	  if (who == atfork_run_child && runp->child_handler)
-	    runp->child_handler ();
-	  else if (who == atfork_run_parent && runp->parent_handler)
-	    runp->parent_handler ();
-	}
+      struct fork_handler *runp = fork_handler_list_at (&fork_handlers, i);
+      uint64_t id = runp->id;
+
+      /* prepare handlers were not run for handlers with ID > LASTRUN.
+         Thus, parent/child handlers will also not be run.  */
+      if (id > lastrun)
+        break;
+
       if (do_locking)
-	lll_unlock (atfork_lock, LLL_PRIVATE);
+        lll_unlock (atfork_lock, LLL_PRIVATE);
+
+      if (who == atfork_run_child && runp->child_handler)
+        runp->child_handler ();
+      else if (who == atfork_run_parent && runp->parent_handler)
+        runp->parent_handler ();
+
+      if (do_locking)
+        lll_lock (atfork_lock, LLL_PRIVATE);
+
+      /* We unlocked, ran the handler, and locked again.  In the meanwhile,
+         one or more [de]registrations could have occurred.  Due to this,
+         the list size must be updated.  */
+      sl = fork_handler_list_size (&fork_handlers);
+
+      /* The just-run handler could also have moved up the list. */
+
+      if (sl > i && fork_handler_list_at (&fork_handlers, i)->id == id)
+        /* The position of the recently run handler hasn't changed.  The
+           next handler to be run is an easy increment away.  */
+        i++;
+      else
+        {
+          /* The next handler to be run is the first handler in the list
+             to have an ID higher than the current one.  */
+          for (i = 0; i < sl; i++)
+            {
+              if (fork_handler_list_at (&fork_handlers, i)->id > id)
+                break;
+            }
+        }
     }
+
+  if (do_locking)
+    lll_unlock (atfork_lock, LLL_PRIVATE);
 }
 
 
diff --git a/posix/tst-spawn6.c b/posix/tst-spawn6.c
index 911e90a46..044abd853 100644
--- a/posix/tst-spawn6.c
+++ b/posix/tst-spawn6.c
@@ -29,7 +29,14 @@
 #include <support/check.h>
 #include <support/xunistd.h>
 #include <sys/wait.h>
+#include <sys/ioctl.h>
 #include <stdlib.h>
+#include <termios.h>
+
+#ifndef PATH_MAX
+# define PATH_MAX 1024
+#endif
+static char ptmxpath[PATH_MAX];
 
 static int
 handle_restart (const char *argv1, const char *argv2)
@@ -115,7 +122,7 @@ run_subprogram (int argc, char *argv[], const posix_spawnattr_t *attr,
 }
 
 static int
-do_test (int argc, char *argv[])
+run_test (int argc, char *argv[])
 {
   /* We must have either:
      - four parameters left if called initially:
@@ -127,16 +134,7 @@ do_test (int argc, char *argv[])
        + --setgrpr             optional
    */
 
-  if (restart)
-    return handle_restart (argv[1], argv[2]);
-
-  int tcfd = open64 (_PATH_TTY, O_RDONLY, 0600);
-  if (tcfd == -1)
-    {
-      if (errno == ENXIO)
-	FAIL_UNSUPPORTED ("terminal not available, skipping test");
-      FAIL_EXIT1 ("open64 (\"%s\", 0x%x, 0600): %m", _PATH_TTY, O_RDONLY);
-    }
+  int tcfd = xopen (ptmxpath, O_RDONLY, 0600);
 
   /* Check setting the controlling terminal without changing the group.  */
   {
@@ -198,5 +196,47 @@ do_test (int argc, char *argv[])
   return 0;
 }
 
+static int
+do_test (int argc, char *argv[])
+{
+  if (restart)
+    return handle_restart (argv[1], argv[2]);
+
+  pid_t pid = xfork ();
+  if (pid == 0)
+    {
+      /* Create a pseudo-terminal to avoid interfering with the one using by
+	 test itself, creates a new session (so there is no controlling
+	 terminal), and set the pseudo-terminal as the controlling one.  */
+      int ptmx = posix_openpt (0);
+      if (ptmx == -1)
+	{
+	  if (errno == ENXIO)
+	    FAIL_UNSUPPORTED ("terminal not available, skipping test");
+	  FAIL_EXIT1 ("posix_openpt (0): %m");
+	}
+      TEST_VERIFY_EXIT (grantpt (ptmx) == 0);
+      TEST_VERIFY_EXIT (unlockpt (ptmx) == 0);
+
+      TEST_VERIFY_EXIT (setsid () != -1);
+      TEST_VERIFY_EXIT (ioctl (ptmx, TIOCSCTTY, NULL) == 0);
+      while (dup2 (ptmx, STDIN_FILENO) == -1 && errno == EBUSY)
+	;
+      while (dup2 (ptmx, STDOUT_FILENO) == -1 && errno == EBUSY)
+	;
+      while (dup2 (ptmx, STDERR_FILENO) == -1 && errno == EBUSY)
+	;
+      TEST_VERIFY_EXIT (ptsname_r (ptmx, ptmxpath, sizeof ptmxpath) == 0);
+      xclose (ptmx);
+
+      run_test (argc, argv);
+      _exit (0);
+    }
+  int status;
+  xwaitpid (pid, &status, 0);
+  TEST_VERIFY (WIFEXITED (status));
+  exit (0);
+}
+
 #define TEST_FUNCTION_ARGV do_test
 #include <support/test-driver.c>
diff --git a/resolv/Makefile b/resolv/Makefile
index c465479e8..16943e7d9 100644
--- a/resolv/Makefile
+++ b/resolv/Makefile
@@ -40,12 +40,16 @@ routines := \
   inet_pton \
   ns_makecanon \
   ns_name_compress \
+  ns_name_length_uncompressed \
   ns_name_ntop \
   ns_name_pack \
   ns_name_pton \
   ns_name_skip \
   ns_name_uncompress \
   ns_name_unpack \
+  ns_rr_cursor_init \
+  ns_rr_cursor_next \
+  ns_samebinaryname \
   ns_samename \
   nsap_addr \
   nss_dns_functions \
@@ -79,11 +83,6 @@ generate := mtrace-tst-leaks.out tst-leaks.mtrace tst-leaks2.mtrace
 extra-libs := libresolv libnss_dns
 ifeq ($(have-thread-library),yes)
 routines += gai_sigqueue
-endif
-
-ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes)
-# Empty compatibility library for old binaries.
-extra-libs += libanl
 
 tests += \
   tst-bug18665 \
@@ -93,9 +92,12 @@ tests += \
   tst-ns_name_pton \
   tst-res_hconf_reorder \
   tst-res_hnok \
+  tst-resolv-aliases \
   tst-resolv-basic \
   tst-resolv-binary \
+  tst-resolv-byaddr \
   tst-resolv-edns \
+  tst-resolv-invalid-cname \
   tst-resolv-network \
   tst-resolv-nondecimal \
   tst-resolv-res_init-multi \
@@ -107,6 +109,18 @@ tests += \
 tests-internal += tst-resolv-txnid-collision
 tests-static += tst-resolv-txnid-collision
 
+# Likewise for __ns_samebinaryname.
+tests-internal += tst-ns_samebinaryname
+tests-static += tst-ns_samebinaryname
+
+# Likewise for __ns_name_length_uncompressed.
+tests-internal += tst-ns_name_length_uncompressed
+tests-static += tst-ns_name_length_uncompressed
+
+# Likewise for struct ns_rr_cursor and its functions.
+tests-internal += tst-ns_rr_cursor
+tests-static += tst-ns_rr_cursor
+
 # These tests need libdl.
 ifeq (yes,$(build-shared))
 tests += \
@@ -144,7 +158,8 @@ xtests += tst-resolv-qtypes
 
 # This test has dropped packet tests and runs for a long time.
 xtests += tst-resolv-rotate
-endif
+endif # $(have-thread-library)
+
 extra-libs-others = $(extra-libs)
 libresolv-routines := \
   base64 \
@@ -168,6 +183,13 @@ libresolv-routines := \
   resolv-deprecated \
   # libresolv-routines
 
+ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes)
+# Empty compatibility library for old binaries.
+extra-libs += libanl
+libanl-routines += libanl-compat
+libanl-shared-only-routines += libanl-compat
+endif
+
 $(libanl-routines-var) += \
   gai_cancel \
   gai_error \
@@ -177,9 +199,6 @@ $(libanl-routines-var) += \
   getaddrinfo_a \
   # $(libanl-routines-var)
 
-libanl-routines += libanl-compat
-libanl-shared-only-routines += libanl-compat
-
 # Pretend that libanl.so is a linker script, so that the symbolic link
 # is not installed.
 install-lib-ldscripts = libanl.so
@@ -256,8 +275,10 @@ $(objpfx)tst-resolv-ai_idn.out: $(gen-locales)
 $(objpfx)tst-resolv-ai_idn-latin1.out: $(gen-locales)
 $(objpfx)tst-resolv-ai_idn-nolibidn2.out: \
   $(gen-locales) $(objpfx)tst-no-libidn2.so
+$(objpfx)tst-resolv-aliases: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-basic: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-binary: $(objpfx)libresolv.so $(shared-thread-library)
+$(objpfx)tst-resolv-byaddr: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-edns: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-network: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-res_init: $(objpfx)libresolv.so
@@ -265,6 +286,8 @@ $(objpfx)tst-resolv-res_init-multi: $(objpfx)libresolv.so \
   $(shared-thread-library)
 $(objpfx)tst-resolv-res_init-thread: $(objpfx)libresolv.so \
   $(shared-thread-library)
+$(objpfx)tst-resolv-invalid-cname: $(objpfx)libresolv.so \
+  $(shared-thread-library)
 $(objpfx)tst-resolv-nondecimal: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-qtypes: $(objpfx)libresolv.so $(shared-thread-library)
 $(objpfx)tst-resolv-rotate: $(objpfx)libresolv.so $(shared-thread-library)
diff --git a/resolv/README b/resolv/README
index 514e9bb61..2146bc3b2 100644
--- a/resolv/README
+++ b/resolv/README
@@ -146,6 +146,3 @@ res_libc.c is home-brewn, although parts of it are taken from res_data.c.
 
 res_hconf.c and res_hconf.h were contributed by David Mosberger, and
 do not come from BIND.
-
-The files gethnamaddr.c, mapv4v6addr.h and mapv4v6hostent.h are
-leftovers from BIND 4.9.7.
diff --git a/resolv/mapv4v6addr.h b/resolv/mapv4v6addr.h
deleted file mode 100644
index 7f85f7d5e..000000000
--- a/resolv/mapv4v6addr.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * ++Copyright++ 1985, 1988, 1993
- * -
- * Copyright (c) 1985, 1988, 1993
- *    The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * -
- * Portions Copyright (c) 1993 by Digital Equipment Corporation.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies, and that
- * the name of Digital Equipment Corporation not be used in advertising or
- * publicity pertaining to distribution of the document or software without
- * specific, written prior permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL
- * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS.   IN NO EVENT SHALL DIGITAL EQUIPMENT
- * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
- * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
- * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- * -
- * --Copyright--
- */
-
-#include <string.h>
-#include <arpa/nameser.h>
-
-static void
-map_v4v6_address (const char *src, char *dst)
-{
-  u_char *p = (u_char *) dst;
-  int i;
-
-  /* Move the IPv4 part to the right position.  */
-  memcpy (dst + 12, src, INADDRSZ);
-
-  /* Mark this ipv6 addr as a mapped ipv4. */
-  for (i = 0; i < 10; i++)
-    *p++ = 0x00;
-  *p++ = 0xff;
-  *p = 0xff;
-}
diff --git a/resolv/mapv4v6hostent.h b/resolv/mapv4v6hostent.h
deleted file mode 100644
index c11038adf..000000000
--- a/resolv/mapv4v6hostent.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * ++Copyright++ 1985, 1988, 1993
- * -
- * Copyright (c) 1985, 1988, 1993
- *    The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * -
- * Portions Copyright (c) 1993 by Digital Equipment Corporation.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies, and that
- * the name of Digital Equipment Corporation not be used in advertising or
- * publicity pertaining to distribution of the document or software without
- * specific, written prior permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL
- * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS.   IN NO EVENT SHALL DIGITAL EQUIPMENT
- * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
- * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
- * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- * -
- * --Copyright--
- */
-
-#include <arpa/nameser.h>
-#include <sys/socket.h>
-
-typedef union {
-    int32_t al;
-    char ac;
-} align;
-
-static int
-map_v4v6_hostent (struct hostent *hp, char **bpp, int *lenp)
-{
-  char **ap;
-
-  if (hp->h_addrtype != AF_INET || hp->h_length != INADDRSZ)
-    return 0;
-  hp->h_addrtype = AF_INET6;
-  hp->h_length = IN6ADDRSZ;
-  for (ap = hp->h_addr_list; *ap; ap++)
-    {
-      int i = sizeof (align) - ((u_long) *bpp % sizeof (align));
-
-      if (*lenp < (i + IN6ADDRSZ))
-	/* Out of memory.  */
-	return 1;
-      *bpp += i;
-      *lenp -= i;
-      map_v4v6_address (*ap, *bpp);
-      *ap = *bpp;
-      *bpp += IN6ADDRSZ;
-      *lenp -= IN6ADDRSZ;
-    }
-  return 0;
-}
diff --git a/resolv/ns_name_length_uncompressed.c b/resolv/ns_name_length_uncompressed.c
new file mode 100644
index 000000000..51296b47e
--- /dev/null
+++ b/resolv/ns_name_length_uncompressed.c
@@ -0,0 +1,72 @@
+/* Skip over an uncompressed name in wire format.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+
+int
+__ns_name_length_uncompressed (const unsigned char *p,
+                                const unsigned char *eom)
+{
+  const unsigned char *start = p;
+
+  while (true)
+    {
+      if (p == eom)
+        {
+          /* Truncated packet: no room for label length.  */
+          __set_errno (EMSGSIZE);
+          return -1;
+        }
+
+      unsigned char b = *p;
+      ++p;
+      if (b == 0)
+        {
+          /* Root label.  */
+          size_t length = p - start;
+          if (length > NS_MAXCDNAME)
+            {
+              /* Domain name too long.  */
+              __set_errno (EMSGSIZE);
+              return -1;
+            }
+          return length;
+        }
+
+      if (b <= 63)
+        {
+          /* Regular label.  */
+          if (b <= eom - p)
+            p += b;
+          else
+            {
+              /* Truncated packet: label incomplete.  */
+              __set_errno (EMSGSIZE);
+              return -1;
+            }
+        }
+      else
+        {
+          /* Compression reference or corrupted label length.  */
+          __set_errno (EMSGSIZE);
+          return -1;
+        }
+    }
+}
diff --git a/resolv/ns_rr_cursor_init.c b/resolv/ns_rr_cursor_init.c
new file mode 100644
index 000000000..6ee80b30e
--- /dev/null
+++ b/resolv/ns_rr_cursor_init.c
@@ -0,0 +1,62 @@
+/* Initialize a simple DNS packet parser.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+bool
+__ns_rr_cursor_init (struct ns_rr_cursor *c,
+                     const unsigned char *buf, size_t len)
+{
+  c->begin = buf;
+  c->end = buf + len;
+
+  /* Check for header size and 16-bit question count value (it must be 1).  */
+  if (len < 12 || buf[4] != 0 || buf[5] != 1)
+    {
+      __set_errno (EMSGSIZE);
+      c->current = c->end;
+      return false;
+    }
+  c->current = buf + 12;
+
+  int consumed = __ns_name_length_uncompressed (c->current, c->end);
+  if (consumed < 0)
+    {
+      __set_errno (EMSGSIZE);
+      c->current = c->end;
+      c->first_rr = NULL;
+      return false;
+    }
+  c->current += consumed;
+
+  /* Ensure there is room for question type and class.  */
+  if (c->end - c->current < 4)
+    {
+      __set_errno (EMSGSIZE);
+      c->current = c->end;
+      c->first_rr = NULL;
+      return false;
+    }
+  c->current += 4;
+  c->first_rr = c->current;
+
+  return true;
+}
diff --git a/resolv/ns_rr_cursor_next.c b/resolv/ns_rr_cursor_next.c
new file mode 100644
index 000000000..33652fc5d
--- /dev/null
+++ b/resolv/ns_rr_cursor_next.c
@@ -0,0 +1,74 @@
+/* Simple DNS record parser without textual name decoding.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+bool
+__ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr)
+{
+  rr->rdata = NULL;
+
+  /* Extract the record owner name.  */
+  int consumed = __ns_name_unpack (c->begin, c->end, c->current,
+                                   rr->rname, sizeof (rr->rname));
+  if (consumed < 0)
+    {
+      memset (rr, 0, sizeof (*rr));
+      __set_errno (EMSGSIZE);
+      return false;
+    }
+  c->current += consumed;
+
+  /* Extract the metadata.  */
+  struct
+  {
+    uint16_t rtype;
+    uint16_t rclass;
+    uint32_t ttl;
+    uint16_t rdlength;
+  } __attribute__ ((packed)) metadata;
+  _Static_assert (sizeof (metadata) == 10, "sizeof metadata");
+  if (c->end - c->current < sizeof (metadata))
+    {
+      memset (rr, 0, sizeof (*rr));
+      __set_errno (EMSGSIZE);
+      return false;
+    }
+  memcpy (&metadata, c->current, sizeof (metadata));
+  c->current += sizeof (metadata);
+  /* Endianess conversion.  */
+  rr->rtype = ntohs (metadata.rtype);
+  rr->rclass = ntohs (metadata.rclass);
+  rr->ttl = ntohl (metadata.ttl);
+  rr->rdlength = ntohs (metadata.rdlength);
+
+  /* Extract record data.  */
+  if (c->end - c->current < rr->rdlength)
+    {
+      memset (rr, 0, sizeof (*rr));
+      __set_errno (EMSGSIZE);
+      return false;
+    }
+  rr->rdata = c->current;
+  c->current += rr->rdlength;
+
+  return true;
+}
diff --git a/resolv/ns_samebinaryname.c b/resolv/ns_samebinaryname.c
new file mode 100644
index 000000000..9a47d8e97
--- /dev/null
+++ b/resolv/ns_samebinaryname.c
@@ -0,0 +1,55 @@
+/* Compare two binary domain names for quality.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <stdbool.h>
+
+/* Convert ASCII letters to upper case.  */
+static inline int
+ascii_toupper (unsigned char ch)
+{
+  if (ch >= 'a' && ch <= 'z')
+    return ch - 'a' + 'A';
+  else
+    return ch;
+}
+
+bool
+__ns_samebinaryname (const unsigned char *a, const unsigned char *b)
+{
+  while (*a != 0 && *b != 0)
+    {
+      if (*a != *b)
+        /* Different label length.  */
+        return false;
+      int labellen = *a;
+      ++a;
+      ++b;
+      for (int i = 0; i < labellen; ++i)
+        {
+          if (*a != *b && ascii_toupper (*a) != ascii_toupper (*b))
+            /* Different character in label.  */
+            return false;
+          ++a;
+          ++b;
+        }
+    }
+
+  /* Match if both names are at the root label.  */
+  return *a == 0 && *b == 0;
+}
diff --git a/resolv/nss_dns/dns-host.c b/resolv/nss_dns/dns-host.c
index 913a5cb82..5b8e40813 100644
--- a/resolv/nss_dns/dns-host.c
+++ b/resolv/nss_dns/dns-host.c
@@ -69,6 +69,7 @@
  * --Copyright--
  */
 
+#include <alloc_buffer.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
@@ -86,10 +87,6 @@
 #include <resolv/resolv-internal.h>
 #include <resolv/resolv_context.h>
 
-/* Get implementations of some internal functions.  */
-#include <resolv/mapv4v6addr.h>
-#include <resolv/mapv4v6hostent.h>
-
 #define RESOLVSORT
 
 #if PACKETSZ > 65536
@@ -103,25 +100,30 @@
 #endif
 #define MAXHOSTNAMELEN 256
 
-/* We need this time later.  */
-typedef union querybuf
-{
-  HEADER hdr;
-  u_char buf[MAXPACKET];
-} querybuf;
-
-static enum nss_status getanswer_r (struct resolv_context *ctx,
-				    const querybuf *answer, int anslen,
-				    const char *qname, int qtype,
-				    struct hostent *result, char *buffer,
-				    size_t buflen, int *errnop, int *h_errnop,
-				    int map, int32_t *ttlp, char **canonp);
-
-static enum nss_status gaih_getanswer (const querybuf *answer1, int anslen1,
-				       const querybuf *answer2, int anslen2,
-				       const char *qname,
+/* For historic reasons, pointers to IP addresses are char *, so use a
+   single list type for addresses and host names.  */
+#define DYNARRAY_STRUCT ptrlist
+#define DYNARRAY_ELEMENT char *
+#define DYNARRAY_PREFIX ptrlist_
+#include <malloc/dynarray-skeleton.c>
+
+static enum nss_status getanswer_r (unsigned char *packet, size_t packetlen,
+				    uint16_t qtype, struct alloc_buffer *abuf,
+				    struct ptrlist *addresses,
+				    struct ptrlist *aliases,
+				    int *errnop, int *h_errnop, int32_t *ttlp);
+static void addrsort (struct resolv_context *ctx, char **ap, int num);
+static enum nss_status getanswer_ptr (unsigned char *packet, size_t packetlen,
+				      struct alloc_buffer *abuf,
+				      char **hnamep, int *errnop,
+				      int *h_errnop, int32_t *ttlp);
+
+static enum nss_status gaih_getanswer (unsigned char *packet1,
+				       size_t packet1len,
+				       unsigned char *packet2,
+				       size_t packet2len,
+				       struct alloc_buffer *abuf,
 				       struct gaih_addrtuple **pat,
-				       char *buffer, size_t buflen,
 				       int *errnop, int *h_errnop,
 				       int32_t *ttlp);
 
@@ -175,16 +177,9 @@ gethostbyname3_context (struct resolv_context *ctx,
 			char *buffer, size_t buflen, int *errnop,
 			int *h_errnop, int32_t *ttlp, char **canonp)
 {
-  union
-  {
-    querybuf *buf;
-    u_char *ptr;
-  } host_buffer;
-  querybuf *orig_host_buffer;
   char tmp[NS_MAXDNAME];
   int size, type, n;
   const char *cp;
-  int map = 0;
   int olderr = errno;
   enum nss_status status;
 
@@ -215,10 +210,12 @@ gethostbyname3_context (struct resolv_context *ctx,
       && (cp = __res_context_hostalias (ctx, name, tmp, sizeof (tmp))) != NULL)
     name = cp;
 
-  host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024);
+  unsigned char dns_packet_buffer[1024];
+  unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
 
-  n = __res_context_search (ctx, name, C_IN, type, host_buffer.buf->buf,
-			    1024, &host_buffer.ptr, NULL, NULL, NULL, NULL);
+  n = __res_context_search (ctx, name, C_IN, type,
+			    dns_packet_buffer, sizeof (dns_packet_buffer),
+			    &alt_dns_packet_buffer, NULL, NULL, NULL, NULL);
   if (n < 0)
     {
       switch (errno)
@@ -245,34 +242,79 @@ gethostbyname3_context (struct resolv_context *ctx,
 	*errnop = EAGAIN;
       else
 	__set_errno (olderr);
+    }
+  else
+    {
+      struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
 
-      /* If we are looking for an IPv6 address and mapping is enabled
-	 by having the RES_USE_INET6 bit in _res.options set, we try
-	 another lookup.  */
-      if (af == AF_INET6 && res_use_inet6 ())
-	n = __res_context_search (ctx, name, C_IN, T_A, host_buffer.buf->buf,
-				  host_buffer.buf != orig_host_buffer
-				  ? MAXPACKET : 1024, &host_buffer.ptr,
-				  NULL, NULL, NULL, NULL);
+      struct ptrlist addresses;
+      ptrlist_init (&addresses);
+      struct ptrlist aliases;
+      ptrlist_init (&aliases);
 
-      if (n < 0)
+      status = getanswer_r (alt_dns_packet_buffer, n, type,
+			    &abuf, &addresses, &aliases,
+			    errnop, h_errnop, ttlp);
+      if (status == NSS_STATUS_SUCCESS)
 	{
-	  if (host_buffer.buf != orig_host_buffer)
-	    free (host_buffer.buf);
-	  return status;
-	}
+	  if (ptrlist_has_failed (&addresses)
+	      || ptrlist_has_failed (&aliases))
+	    {
+	      /* malloc failure.  Do not retry using the ERANGE protocol.  */
+	      *errnop = ENOMEM;
+	      *h_errnop = NETDB_INTERNAL;
+	      status = NSS_STATUS_UNAVAIL;
+	    }
 
-      map = 1;
+	  /* Reserve the address and alias arrays in the result
+	     buffer.  Both are NULL-terminated, but the first element
+	     of the alias array is stored in h_name, so no extra space
+	     for the NULL terminator is needed there.  */
+	  result->h_addr_list
+	    = alloc_buffer_alloc_array (&abuf, char *,
+					ptrlist_size (&addresses) + 1);
+	  result->h_aliases
+	    = alloc_buffer_alloc_array (&abuf, char *,
+					ptrlist_size (&aliases));
+	  if (alloc_buffer_has_failed (&abuf))
+	    {
+	      /* Retry using the ERANGE protocol.  */
+	      *errnop = ERANGE;
+	      *h_errnop = NETDB_INTERNAL;
+	      status = NSS_STATUS_TRYAGAIN;
+	    }
+	  else
+	    {
+	      /* Copy the address list and NULL-terminate it.  */
+	      memcpy (result->h_addr_list, ptrlist_begin (&addresses),
+		      ptrlist_size (&addresses) * sizeof (char *));
+	      result->h_addr_list[ptrlist_size (&addresses)] = NULL;
+
+	      /* Sort the address list if requested.  */
+	      if (type == T_A && __resolv_context_sort_count (ctx) > 0)
+		addrsort (ctx, result->h_addr_list, ptrlist_size (&addresses));
 
-      result->h_addrtype = AF_INET;
-      result->h_length = INADDRSZ;
+	      /* Copy the aliases,  excluding the last one. */
+	      memcpy (result->h_aliases, ptrlist_begin (&aliases),
+		      (ptrlist_size (&aliases) - 1) * sizeof (char *));
+	      result->h_aliases[ptrlist_size (&aliases) - 1] = NULL;
+
+	      /* The last alias goes into h_name.  */
+	      assert (ptrlist_size (&aliases) >= 1);
+	      result->h_name = ptrlist_end (&aliases)[-1];
+
+	      /* This is also the canonical name.  */
+	      if (canonp != NULL)
+		*canonp = result->h_name;
+	    }
+	}
+
+      ptrlist_free (&aliases);
+      ptrlist_free (&addresses);
     }
 
-  status = getanswer_r
-    (ctx, host_buffer.buf, n, name, type, result, buffer, buflen,
-     errnop, h_errnop, map, ttlp, canonp);
-  if (host_buffer.buf != orig_host_buffer)
-    free (host_buffer.buf);
+  if (alt_dns_packet_buffer != dns_packet_buffer)
+    free (alt_dns_packet_buffer);
   return status;
 }
 
@@ -316,13 +358,8 @@ _nss_dns_gethostbyname_r (const char *name, struct hostent *result,
       *h_errnop = NETDB_INTERNAL;
       return NSS_STATUS_UNAVAIL;
     }
-  status = NSS_STATUS_NOTFOUND;
-  if (res_use_inet6 ())
-    status = gethostbyname3_context (ctx, name, AF_INET6, result, buffer,
-				     buflen, errnop, h_errnop, NULL, NULL);
-  if (status == NSS_STATUS_NOTFOUND)
-    status = gethostbyname3_context (ctx, name, AF_INET, result, buffer,
-				     buflen, errnop, h_errnop, NULL, NULL);
+  status = gethostbyname3_context (ctx, name, AF_INET, result, buffer,
+				   buflen, errnop, h_errnop, NULL, NULL);
   __resolv_context_put (ctx);
   return status;
 }
@@ -357,27 +394,23 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
 	name = cp;
     }
 
-  union
-  {
-    querybuf *buf;
-    u_char *ptr;
-  } host_buffer;
-  querybuf *orig_host_buffer;
-  host_buffer.buf = orig_host_buffer = (querybuf *) alloca (2048);
+  unsigned char dns_packet_buffer[2048];
+  unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
   u_char *ans2p = NULL;
   int nans2p = 0;
   int resplen2 = 0;
   int ans2p_malloced = 0;
+  struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
 
   int olderr = errno;
   int n = __res_context_search (ctx, name, C_IN, T_QUERY_A_AND_AAAA,
-				host_buffer.buf->buf, 2048, &host_buffer.ptr,
-				&ans2p, &nans2p, &resplen2, &ans2p_malloced);
+				dns_packet_buffer, sizeof (dns_packet_buffer),
+				&alt_dns_packet_buffer, &ans2p, &nans2p,
+				&resplen2, &ans2p_malloced);
   if (n >= 0)
     {
-      status = gaih_getanswer (host_buffer.buf, n, (const querybuf *) ans2p,
-			       resplen2, name, pat, buffer, buflen,
-			       errnop, herrnop, ttlp);
+      status = gaih_getanswer (alt_dns_packet_buffer, n, ans2p, resplen2,
+			       &abuf, pat, errnop, herrnop, ttlp);
     }
   else
     {
@@ -408,12 +441,20 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat,
 	__set_errno (olderr);
     }
 
+  /* Implement the buffer resizing protocol.  */
+  if (alloc_buffer_has_failed (&abuf))
+    {
+      *errnop = ERANGE;
+      *herrnop = NETDB_INTERNAL;
+      status = NSS_STATUS_TRYAGAIN;
+    }
+
   /* Check whether ans2p was separately allocated.  */
   if (ans2p_malloced)
     free (ans2p);
 
-  if (host_buffer.buf != orig_host_buffer)
-    free (host_buffer.buf);
+  if (alt_dns_packet_buffer != dns_packet_buffer)
+    free (alt_dns_packet_buffer);
 
   __resolv_context_put (ctx);
   return status;
@@ -429,36 +470,21 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af,
   static const u_char tunnelled[] = { 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
   static const u_char v6local[] = { 0,0, 0,1 };
   const u_char *uaddr = (const u_char *)addr;
-  struct host_data
-  {
-    char *aliases[MAX_NR_ALIASES];
-    unsigned char host_addr[16];	/* IPv4 or IPv6 */
-    char *h_addr_ptrs[MAX_NR_ADDRS + 1];
-    char linebuffer[0];
-  } *host_data = (struct host_data *) buffer;
-  union
-  {
-    querybuf *buf;
-    u_char *ptr;
-  } host_buffer;
-  querybuf *orig_host_buffer;
   char qbuf[MAXDNAME+1], *qp = NULL;
   size_t size;
   int n, status;
   int olderr = errno;
 
- uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data);
- buffer += pad;
- buflen = buflen > pad ? buflen - pad : 0;
-
- if (__glibc_unlikely (buflen < sizeof (struct host_data)))
-   {
-     *errnop = ERANGE;
-     *h_errnop = NETDB_INTERNAL;
-     return NSS_STATUS_TRYAGAIN;
-   }
-
- host_data = (struct host_data *) buffer;
+  /* Prepare the allocation buffer.  Store the pointer array first, to
+     benefit from buffer alignment.  */
+  struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
+  char **address_array = alloc_buffer_alloc_array (&abuf, char *, 2);
+  if (address_array == NULL)
+    {
+      *errnop = ERANGE;
+      *h_errnop = NETDB_INTERNAL;
+      return NSS_STATUS_TRYAGAIN;
+    }
 
   struct resolv_context *ctx = __resolv_context_get ();
   if (ctx == NULL)
@@ -502,8 +528,6 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af,
       return NSS_STATUS_UNAVAIL;
     }
 
-  host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024);
-
   switch (af)
     {
     case AF_INET:
@@ -527,36 +551,52 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af,
       break;
     }
 
-  n = __res_context_query (ctx, qbuf, C_IN, T_PTR, host_buffer.buf->buf,
-			   1024, &host_buffer.ptr, NULL, NULL, NULL, NULL);
+  unsigned char dns_packet_buffer[1024];
+  unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
+  n = __res_context_query (ctx, qbuf, C_IN, T_PTR,
+			   dns_packet_buffer, sizeof (dns_packet_buffer),
+			   &alt_dns_packet_buffer,
+			   NULL, NULL, NULL, NULL);
   if (n < 0)
     {
       *h_errnop = h_errno;
       __set_errno (olderr);
-      if (host_buffer.buf != orig_host_buffer)
-	free (host_buffer.buf);
+      if (alt_dns_packet_buffer != dns_packet_buffer)
+	free (alt_dns_packet_buffer);
       __resolv_context_put (ctx);
       return errno == ECONNREFUSED ? NSS_STATUS_UNAVAIL : NSS_STATUS_NOTFOUND;
     }
 
-  status = getanswer_r
-    (ctx, host_buffer.buf, n, qbuf, T_PTR, result, buffer, buflen,
-     errnop, h_errnop, 0 /* XXX */, ttlp, NULL);
-  if (host_buffer.buf != orig_host_buffer)
-    free (host_buffer.buf);
+  status = getanswer_ptr (alt_dns_packet_buffer, n,
+			  &abuf, &result->h_name, errnop, h_errnop, ttlp);
+
+  if (alt_dns_packet_buffer != dns_packet_buffer)
+    free (alt_dns_packet_buffer);
+  __resolv_context_put (ctx);
+
   if (status != NSS_STATUS_SUCCESS)
-    {
-      __resolv_context_put (ctx);
-      return status;
-    }
+    return status;
 
+  /* result->h_name has already been set by getanswer_ptr.  */
   result->h_addrtype = af;
   result->h_length = len;
-  memcpy (host_data->host_addr, addr, len);
-  host_data->h_addr_ptrs[0] = (char *) host_data->host_addr;
-  host_data->h_addr_ptrs[1] = NULL;
+  /* Increase the alignment to 4, in case there are applications out
+     there that expect at least this level of address alignment.  */
+  address_array[0] = (char *) alloc_buffer_next (&abuf, uint32_t);
+  alloc_buffer_copy_bytes (&abuf, uaddr, len);
+  address_array[1] = NULL;
+
+  /* This check also covers allocation failure in getanswer_ptr.  */
+  if (alloc_buffer_has_failed (&abuf))
+    {
+      *errnop = ERANGE;
+      *h_errnop = NETDB_INTERNAL;
+      return NSS_STATUS_TRYAGAIN;
+    }
+  result->h_addr_list = address_array;
+  result->h_aliases = &address_array[1]; /* Points to NULL.  */
+
   *h_errnop = NETDB_SUCCESS;
-  __resolv_context_put (ctx);
   return NSS_STATUS_SUCCESS;
 }
 libc_hidden_def (_nss_dns_gethostbyaddr2_r)
@@ -618,650 +658,362 @@ addrsort (struct resolv_context *ctx, char **ap, int num)
 	break;
 }
 
-static enum nss_status
-getanswer_r (struct resolv_context *ctx,
-	     const querybuf *answer, int anslen, const char *qname, int qtype,
-	     struct hostent *result, char *buffer, size_t buflen,
-	     int *errnop, int *h_errnop, int map, int32_t *ttlp, char **canonp)
+/* Convert the uncompressed, binary domain name CDNAME into its
+   textual representation and add it to the end of ALIASES, allocating
+   space for a copy of the name from ABUF.  Skip adding the name if it
+   is not a valid host name, and return false in that case, otherwise
+   true.  */
+static bool
+getanswer_r_store_alias (const unsigned char *cdname,
+			 struct alloc_buffer *abuf,
+			 struct ptrlist *aliases)
 {
-  struct host_data
-  {
-    char *aliases[MAX_NR_ALIASES];
-    unsigned char host_addr[16];	/* IPv4 or IPv6 */
-    char *h_addr_ptrs[0];
-  } *host_data;
-  int linebuflen;
-  const HEADER *hp;
-  const u_char *end_of_message, *cp;
-  int n, ancount, qdcount;
-  int haveanswer, had_error;
-  char *bp, **ap, **hap;
-  char tbuf[MAXDNAME];
-  const char *tname;
-  int (*name_ok) (const char *);
-  u_char packtmp[NS_MAXCDNAME];
-  int have_to_map = 0;
-  uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data);
-  buffer += pad;
-  buflen = buflen > pad ? buflen - pad : 0;
-  if (__glibc_unlikely (buflen < sizeof (struct host_data)))
-    {
-      /* The buffer is too small.  */
-    too_small:
-      *errnop = ERANGE;
-      *h_errnop = NETDB_INTERNAL;
-      return NSS_STATUS_TRYAGAIN;
-    }
-  host_data = (struct host_data *) buffer;
-  linebuflen = buflen - sizeof (struct host_data);
-  if (buflen - sizeof (struct host_data) != linebuflen)
-    linebuflen = INT_MAX;
-
-  tname = qname;
-  result->h_name = NULL;
-  end_of_message = answer->buf + anslen;
-  switch (qtype)
-    {
-    case T_A:
-    case T_AAAA:
-      name_ok = __libc_res_hnok;
-      break;
-    case T_PTR:
-      name_ok = __libc_res_dnok;
-      break;
-    default:
-      *errnop = ENOENT;
-      return NSS_STATUS_UNAVAIL;  /* XXX should be abort(); */
-    }
+  /* Filter out domain names that are not host names.  */
+  if (!__res_binary_hnok (cdname))
+    return false;
+
+  /* Note: Not NS_MAXCDNAME, so that __ns_name_ntop implicitly checks
+     for length.  */
+  char dname[MAXHOSTNAMELEN + 1];
+  if (__ns_name_ntop (cdname, dname, sizeof (dname)) < 0)
+    return false;
+  /* Do not report an error on allocation failure, instead store NULL
+     or do nothing.  getanswer_r's caller will see NSS_STATUS_SUCCESS
+     and detect the memory allocation failure or buffer space
+     exhaustion, and report it accordingly.  */
+  ptrlist_add (aliases, alloc_buffer_copy_string (abuf, dname));
+  return true;
+}
 
-  /*
-   * find first satisfactory answer
-   */
-  hp = &answer->hdr;
-  ancount = ntohs (hp->ancount);
-  qdcount = ntohs (hp->qdcount);
-  cp = answer->buf + HFIXEDSZ;
-  if (__glibc_unlikely (qdcount != 1))
+static enum nss_status __attribute__ ((noinline))
+getanswer_r (unsigned char *packet, size_t packetlen, uint16_t qtype,
+	     struct alloc_buffer *abuf,
+	     struct ptrlist *addresses, struct ptrlist *aliases,
+	     int *errnop, int *h_errnop, int32_t *ttlp)
+{
+  struct ns_rr_cursor c;
+  if (!__ns_rr_cursor_init (&c, packet, packetlen))
     {
+      /* This should not happen because __res_context_query already
+	 perfroms response validation.  */
       *h_errnop = NO_RECOVERY;
       return NSS_STATUS_UNAVAIL;
     }
-  if (sizeof (struct host_data) + (ancount + 1) * sizeof (char *) >= buflen)
-    goto too_small;
-  bp = (char *) &host_data->h_addr_ptrs[ancount + 1];
-  linebuflen -= (ancount + 1) * sizeof (char *);
-
-  n = __ns_name_unpack (answer->buf, end_of_message, cp,
-			packtmp, sizeof packtmp);
-  if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
-    {
-      if (__glibc_unlikely (errno == EMSGSIZE))
-	goto too_small;
 
-      n = -1;
-    }
-
-  if (__glibc_unlikely (n < 0))
+  /* Treat the QNAME just like an alias.  Error out if it is not a
+     valid host name.  */
+  if (ns_rr_cursor_rcode (&c) == NXDOMAIN
+      || !getanswer_r_store_alias (ns_rr_cursor_qname (&c), abuf, aliases))
     {
-      *errnop = errno;
-      *h_errnop = NO_RECOVERY;
-      return NSS_STATUS_UNAVAIL;
-    }
-  if (__glibc_unlikely (name_ok (bp) == 0))
-    {
-      errno = EBADMSG;
-      *errnop = EBADMSG;
-      *h_errnop = NO_RECOVERY;
-      return NSS_STATUS_UNAVAIL;
+      if (ttlp != NULL)
+	/* No negative caching.  */
+	*ttlp = 0;
+      *h_errnop = HOST_NOT_FOUND;
+      *errnop = ENOENT;
+      return NSS_STATUS_NOTFOUND;
     }
-  cp += n + QFIXEDSZ;
 
-  if (qtype == T_A || qtype == T_AAAA)
+  int ancount = ns_rr_cursor_ancount (&c);
+  const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+  /* expected_name may be updated to point into this buffer.  */
+  unsigned char name_buffer[NS_MAXCDNAME];
+
+  for (; ancount > 0; --ancount)
     {
-      /* res_send() has already verified that the query name is the
-       * same as the one we sent; this just gets the expanded name
-       * (i.e., with the succeeding search-domain tacked on).
-       */
-      n = strlen (bp) + 1;             /* for the \0 */
-      if (n >= MAXHOSTNAMELEN)
+      struct ns_rr_wire rr;
+      if (!__ns_rr_cursor_next (&c, &rr))
 	{
 	  *h_errnop = NO_RECOVERY;
-	  *errnop = ENOENT;
-	  return NSS_STATUS_TRYAGAIN;
+	  return NSS_STATUS_UNAVAIL;
 	}
-      result->h_name = bp;
-      bp += n;
-      linebuflen -= n;
-      if (linebuflen < 0)
-	goto too_small;
-      /* The qname can be abbreviated, but h_name is now absolute. */
-      qname = result->h_name;
-    }
 
-  ap = host_data->aliases;
-  *ap = NULL;
-  result->h_aliases = host_data->aliases;
-  hap = host_data->h_addr_ptrs;
-  *hap = NULL;
-  result->h_addr_list = host_data->h_addr_ptrs;
-  haveanswer = 0;
-  had_error = 0;
+      /* Skip over records with the wrong class.  */
+      if (rr.rclass != C_IN)
+	continue;
 
-  while (ancount-- > 0 && cp < end_of_message && had_error == 0)
-    {
-      int type, class;
+      /* Update TTL for recognized record types.  */
+      if ((rr.rtype == T_CNAME || rr.rtype == qtype)
+	  && ttlp != NULL && *ttlp > rr.ttl)
+	*ttlp = rr.ttl;
 
-      n = __ns_name_unpack (answer->buf, end_of_message, cp,
-			    packtmp, sizeof packtmp);
-      if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
+      if (rr.rtype == T_CNAME)
 	{
-	  if (__glibc_unlikely (errno == EMSGSIZE))
-	    goto too_small;
-
-	  n = -1;
+	  /* NB: No check for owner name match, based on historic
+	     precedent.  Record the CNAME target as the new expected
+	     name.  */
+	  int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+				    name_buffer, sizeof (name_buffer));
+	  if (n < 0)
+	    {
+	      *h_errnop = NO_RECOVERY;
+	      return NSS_STATUS_UNAVAIL;
+	    }
+	  /* And store the new name as an alias.  */
+	  getanswer_r_store_alias (name_buffer, abuf, aliases);
+	  expected_name = name_buffer;
 	}
-
-      if (__glibc_unlikely (n < 0 || (*name_ok) (bp) == 0))
+      else if (rr.rtype == qtype
+	       && __ns_samebinaryname (rr.rname, expected_name)
+	       && rr.rdlength == rrtype_to_rdata_length (qtype))
 	{
-	  ++had_error;
-	  continue;
+	  /* Make a copy of the address and store it.  Increase the
+	     alignment to 4, in case there are applications out there
+	     that expect at least this level of address alignment.  */
+	  ptrlist_add (addresses, (char *) alloc_buffer_next (abuf, uint32_t));
+	  alloc_buffer_copy_bytes (abuf, rr.rdata, rr.rdlength);
 	}
-      cp += n;				/* name */
+    }
 
-      if (__glibc_unlikely (cp + 10 > end_of_message))
-	{
-	  ++had_error;
-	  continue;
-	}
+  if (ptrlist_size (addresses) == 0)
+    {
+      /* No address record found.  */
+      if (ttlp != NULL)
+	/* No caching of negative responses.  */
+	*ttlp = 0;
 
-      NS_GET16 (type, cp);
-      NS_GET16 (class, cp);
-      int32_t ttl;
-      NS_GET32 (ttl, cp);
-      NS_GET16 (n, cp);		/* RDATA length.  */
+      *h_errnop = NO_RECOVERY;
+      *errnop = ENOENT;
+      return NSS_STATUS_TRYAGAIN;
+    }
+  else
+    {
+      *h_errnop = NETDB_SUCCESS;
+      return NSS_STATUS_SUCCESS;
+    }
+}
 
-      if (end_of_message - cp < n)
-	{
-	  /* RDATA extends beyond the end of the packet.  */
-	  ++had_error;
-	  continue;
-	}
+static enum nss_status
+getanswer_ptr (unsigned char *packet, size_t packetlen,
+	       struct alloc_buffer *abuf, char **hnamep,
+	       int *errnop, int *h_errnop, int32_t *ttlp)
+{
+  struct ns_rr_cursor c;
+  if (!__ns_rr_cursor_init (&c, packet, packetlen))
+    {
+      /* This should not happen because __res_context_query already
+	 perfroms response validation.  */
+      *h_errnop = NO_RECOVERY;
+      return NSS_STATUS_UNAVAIL;
+    }
+  int ancount = ns_rr_cursor_ancount (&c);
+  const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+  /* expected_name may be updated to point into this buffer.  */
+  unsigned char name_buffer[NS_MAXCDNAME];
 
-      if (__glibc_unlikely (class != C_IN))
+  while (ancount > 0)
+    {
+      struct ns_rr_wire rr;
+      if (!__ns_rr_cursor_next (&c, &rr))
 	{
-	  /* XXX - debug? syslog? */
-	  cp += n;
-	  continue;			/* XXX - had_error++ ? */
+	  *h_errnop = NO_RECOVERY;
+	  return NSS_STATUS_UNAVAIL;
 	}
 
-      if ((qtype == T_A || qtype == T_AAAA) && type == T_CNAME)
-	{
-	  /* A CNAME could also have a TTL entry.  */
-	  if (ttlp != NULL && ttl < *ttlp)
-	      *ttlp = ttl;
-
-	  if (ap >= &host_data->aliases[MAX_NR_ALIASES - 1])
-	    continue;
-	  n = __libc_dn_expand (answer->buf, end_of_message, cp,
-				tbuf, sizeof tbuf);
-	  if (__glibc_unlikely (n < 0 || (*name_ok) (tbuf) == 0))
-	    {
-	      ++had_error;
-	      continue;
-	    }
-	  cp += n;
-	  /* Store alias.  */
-	  *ap++ = bp;
-	  n = strlen (bp) + 1;		/* For the \0.  */
-	  if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
-	    {
-	      ++had_error;
-	      continue;
-	    }
-	  bp += n;
-	  linebuflen -= n;
-	  /* Get canonical name.  */
-	  n = strlen (tbuf) + 1;	/* For the \0.  */
-	  if (__glibc_unlikely (n > linebuflen))
-	    goto too_small;
-	  if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
-	    {
-	      ++had_error;
-	      continue;
-	    }
-	  result->h_name = bp;
-	  bp = __mempcpy (bp, tbuf, n);	/* Cannot overflow.  */
-	  linebuflen -= n;
-	  continue;
-	}
+      /* Skip over records with the wrong class.  */
+      if (rr.rclass != C_IN)
+	continue;
 
-      if (qtype == T_PTR && type == T_CNAME)
-	{
-	  /* A CNAME could also have a TTL entry.  */
-	  if (ttlp != NULL && ttl < *ttlp)
-	      *ttlp = ttl;
+      /* Update TTL for known record types.  */
+      if ((rr.rtype == T_CNAME || rr.rtype == T_PTR)
+	  && ttlp != NULL && *ttlp > rr.ttl)
+	*ttlp = rr.ttl;
 
-	  n = __libc_dn_expand (answer->buf, end_of_message, cp,
-				tbuf, sizeof tbuf);
-	  if (__glibc_unlikely (n < 0 || __libc_res_dnok (tbuf) == 0))
-	    {
-	      ++had_error;
-	      continue;
-	    }
-	  cp += n;
-	  /* Get canonical name.  */
-	  n = strlen (tbuf) + 1;   /* For the \0.  */
-	  if (__glibc_unlikely (n > linebuflen))
-	    goto too_small;
-	  if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
+      if (rr.rtype == T_CNAME)
+	{
+	  /* NB: No check for owner name match, based on historic
+	     precedent.  Record the CNAME target as the new expected
+	     name.  */
+	  int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+				    name_buffer, sizeof (name_buffer));
+	  if (n < 0)
 	    {
-	      ++had_error;
-	      continue;
+	      *h_errnop = NO_RECOVERY;
+	      return NSS_STATUS_UNAVAIL;
 	    }
-	  tname = bp;
-	  bp = __mempcpy (bp, tbuf, n);	/* Cannot overflow.  */
-	  linebuflen -= n;
-	  continue;
+	  expected_name = name_buffer;
 	}
-
-      if (type == T_A && qtype == T_AAAA && map)
-	have_to_map = 1;
-      else if (__glibc_unlikely (type != qtype))
+      else if (rr.rtype == T_PTR
+	       && __ns_samebinaryname (rr.rname, expected_name))
 	{
-	  cp += n;
-	  continue;			/* XXX - had_error++ ? */
-	}
-
-      switch (type)
-	{
-	case T_PTR:
-	  if (__glibc_unlikely (__strcasecmp (tname, bp) != 0))
+	  /* Decompress the target of the PTR record.  This is the
+	     host name we are looking for.  We can only use it if it
+	     is syntactically valid.  Historically, only one host name
+	     is returned here.  If the recursive resolver performs DNS
+	     record rotation, the returned host name is essentially
+	     random, which is why multiple PTR records are rarely
+	     used.  Use MAXHOSTNAMELEN instead of NS_MAXCDNAME for
+	     additional length checking.  */
+	  char hname[MAXHOSTNAMELEN + 1];
+	  if (__ns_name_unpack (c.begin, c.end, rr.rdata,
+				name_buffer, sizeof (name_buffer)) < 0
+	      || !__res_binary_hnok (expected_name)
+	      || __ns_name_ntop (name_buffer, hname, sizeof (hname)) < 0)
 	    {
-	      cp += n;
-	      continue;			/* XXX - had_error++ ? */
+	      *h_errnop = NO_RECOVERY;
+	      return NSS_STATUS_UNAVAIL;
 	    }
-
-	  n = __ns_name_unpack (answer->buf, end_of_message, cp,
-				packtmp, sizeof packtmp);
-	  if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
-	    {
-	      if (__glibc_unlikely (errno == EMSGSIZE))
-		goto too_small;
-
-	      n = -1;
-	    }
-
-	  if (__glibc_unlikely (n < 0 || __libc_res_hnok (bp) == 0))
-	    {
-	      ++had_error;
-	      break;
-	    }
-	  if (ttlp != NULL && ttl < *ttlp)
-	      *ttlp = ttl;
-	  /* bind would put multiple PTR records as aliases, but we don't do
-	     that.  */
-	  result->h_name = bp;
-	  *h_errnop = NETDB_SUCCESS;
+	  /* Successful allocation is checked by the caller.  */
+	  *hnamep = alloc_buffer_copy_string (abuf, hname);
 	  return NSS_STATUS_SUCCESS;
-	case T_A:
-	case T_AAAA:
-	  if (__glibc_unlikely (__strcasecmp (result->h_name, bp) != 0))
-	    {
-	      cp += n;
-	      continue;			/* XXX - had_error++ ? */
-	    }
-
-	  /* Stop parsing at a record whose length is incorrect.  */
-	  if (n != rrtype_to_rdata_length (type))
-	    {
-	      ++had_error;
-	      break;
-	    }
-
-	  /* Skip records of the wrong type.  */
-	  if (n != result->h_length)
-	    {
-	      cp += n;
-	      continue;
-	    }
-	  if (!haveanswer)
-	    {
-	      int nn;
-
-	      /* We compose a single hostent out of the entire chain of
-	         entries, so the TTL of the hostent is essentially the lowest
-		 TTL in the chain.  */
-	      if (ttlp != NULL && ttl < *ttlp)
-		*ttlp = ttl;
-	      if (canonp != NULL)
-		*canonp = bp;
-	      result->h_name = bp;
-	      nn = strlen (bp) + 1;	/* for the \0 */
-	      bp += nn;
-	      linebuflen -= nn;
-	    }
-
-	  /* Provide sufficient alignment for both address
-	     families.  */
-	  enum { align = 4 };
-	  _Static_assert ((align % __alignof__ (struct in_addr)) == 0,
-			  "struct in_addr alignment");
-	  _Static_assert ((align % __alignof__ (struct in6_addr)) == 0,
-			  "struct in6_addr alignment");
-	  {
-	    char *new_bp = PTR_ALIGN_UP (bp, align);
-	    linebuflen -= new_bp - bp;
-	    bp = new_bp;
-	  }
-
-	  if (__glibc_unlikely (n > linebuflen))
-	    goto too_small;
-	  bp = __mempcpy (*hap++ = bp, cp, n);
-	  cp += n;
-	  linebuflen -= n;
-	  break;
-	default:
-	  abort ();
 	}
-      if (had_error == 0)
-	++haveanswer;
     }
 
-  if (haveanswer > 0)
-    {
-      *ap = NULL;
-      *hap = NULL;
-      /*
-       * Note: we sort even if host can take only one address
-       * in its return structures - should give it the "best"
-       * address in that case, not some random one
-       */
-      if (haveanswer > 1 && qtype == T_A
-	  && __resolv_context_sort_count (ctx) > 0)
-	addrsort (ctx, host_data->h_addr_ptrs, haveanswer);
-
-      if (result->h_name == NULL)
-	{
-	  n = strlen (qname) + 1;	/* For the \0.  */
-	  if (n > linebuflen)
-	    goto too_small;
-	  if (n >= MAXHOSTNAMELEN)
-	    goto no_recovery;
-	  result->h_name = bp;
-	  bp = __mempcpy (bp, qname, n);	/* Cannot overflow.  */
-	  linebuflen -= n;
-	}
+  /* No PTR record found.  */
+  if (ttlp != NULL)
+    /* No caching of negative responses.  */
+    *ttlp = 0;
 
-      if (have_to_map)
-	if (map_v4v6_hostent (result, &bp, &linebuflen))
-	  goto too_small;
-      *h_errnop = NETDB_SUCCESS;
-      return NSS_STATUS_SUCCESS;
-    }
- no_recovery:
   *h_errnop = NO_RECOVERY;
   *errnop = ENOENT;
-  /* Special case here: if the resolver sent a result but it only
-     contains a CNAME while we are looking for a T_A or T_AAAA record,
-     we fail with NOTFOUND instead of TRYAGAIN.  */
-  return ((qtype == T_A || qtype == T_AAAA) && ap != host_data->aliases
-	   ? NSS_STATUS_NOTFOUND : NSS_STATUS_TRYAGAIN);
+  return NSS_STATUS_TRYAGAIN;
 }
 
-
+/* Parses DNS data found in PACKETLEN bytes at PACKET in struct
+   gaih_addrtuple address tuples.  The new address tuples are linked
+   from **TAILP, with backing store allocated from ABUF, and *TAILP is
+   updated to point where the next tuple pointer should be stored.  If
+   TTLP is not null, *TTLP is updated to reflect the minimum TTL.  If
+   STORE_CANON is true, the canonical name is stored as part of the
+   first address tuple being written.  */
 static enum nss_status
-gaih_getanswer_slice (const querybuf *answer, int anslen, const char *qname,
-		      struct gaih_addrtuple ***patp,
-		      char **bufferp, size_t *buflenp,
-		      int *errnop, int *h_errnop, int32_t *ttlp, int *firstp)
+gaih_getanswer_slice (unsigned char *packet, size_t packetlen,
+		      struct alloc_buffer *abuf,
+		      struct gaih_addrtuple ***tailp,
+		      int *errnop, int *h_errnop, int32_t *ttlp,
+		      bool store_canon)
 {
-  char *buffer = *bufferp;
-  size_t buflen = *buflenp;
-
-  struct gaih_addrtuple **pat = *patp;
-  const HEADER *hp = &answer->hdr;
-  int ancount = ntohs (hp->ancount);
-  int qdcount = ntohs (hp->qdcount);
-  const u_char *cp = answer->buf + HFIXEDSZ;
-  const u_char *end_of_message = answer->buf + anslen;
-  if (__glibc_unlikely (qdcount != 1))
-    {
-      *h_errnop = NO_RECOVERY;
-      return NSS_STATUS_UNAVAIL;
-    }
-
-  u_char packtmp[NS_MAXCDNAME];
-  int n = __ns_name_unpack (answer->buf, end_of_message, cp,
-			    packtmp, sizeof packtmp);
-  /* We unpack the name to check it for validity.  But we do not need
-     it later.  */
-  if (n != -1 && __ns_name_ntop (packtmp, buffer, buflen) == -1)
-    {
-      if (__glibc_unlikely (errno == EMSGSIZE))
-	{
-	too_small:
-	  *errnop = ERANGE;
-	  *h_errnop = NETDB_INTERNAL;
-	  return NSS_STATUS_TRYAGAIN;
-	}
-
-      n = -1;
-    }
-
-  if (__glibc_unlikely (n < 0))
+  struct ns_rr_cursor c;
+  if (!__ns_rr_cursor_init (&c, packet, packetlen))
     {
-      *errnop = errno;
+      /* This should not happen because __res_context_query already
+	 perfroms response validation.  */
       *h_errnop = NO_RECOVERY;
       return NSS_STATUS_UNAVAIL;
     }
-  if (__glibc_unlikely (__libc_res_hnok (buffer) == 0))
-    {
-      errno = EBADMSG;
-      *errnop = EBADMSG;
-      *h_errnop = NO_RECOVERY;
-      return NSS_STATUS_UNAVAIL;
-    }
-  cp += n + QFIXEDSZ;
-
-  int haveanswer = 0;
-  int had_error = 0;
-  char *canon = NULL;
-  char *h_name = NULL;
-  int h_namelen = 0;
-
-  if (ancount == 0)
+  bool haveanswer = false; /* Set to true if at least one address.  */
+  uint16_t qtype = ns_rr_cursor_qtype (&c);
+  int ancount = ns_rr_cursor_ancount (&c);
+  const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+  /* expected_name may be updated to point into this buffer.  */
+  unsigned char name_buffer[NS_MAXCDNAME];
+
+  /* This is a pointer to a possibly-compressed name in the packet.
+     Eventually it is equivalent to the canonical name.  If needed, it
+     is uncompressed and translated to text form when the first
+     address tuple is encountered.  */
+  const unsigned char *compressed_alias_name = expected_name;
+
+  if (ancount == 0 || !__res_binary_hnok (compressed_alias_name))
     {
       *h_errnop = HOST_NOT_FOUND;
       return NSS_STATUS_NOTFOUND;
     }
 
-  while (ancount-- > 0 && cp < end_of_message && had_error == 0)
+  for (; ancount > -0; --ancount)
     {
-      n = __ns_name_unpack (answer->buf, end_of_message, cp,
-			    packtmp, sizeof packtmp);
-      if (n != -1 &&
-	  (h_namelen = __ns_name_ntop (packtmp, buffer, buflen)) == -1)
+      struct ns_rr_wire rr;
+      if (!__ns_rr_cursor_next (&c, &rr))
 	{
-	  if (__glibc_unlikely (errno == EMSGSIZE))
-	    goto too_small;
-
-	  n = -1;
-	}
-      if (__glibc_unlikely (n < 0 || __libc_res_hnok (buffer) == 0))
-	{
-	  ++had_error;
-	  continue;
-	}
-      if (*firstp && canon == NULL)
-	{
-	  h_name = buffer;
-	  buffer += h_namelen;
-	  buflen -= h_namelen;
-	}
-
-      cp += n;				/* name */
-
-      if (__glibc_unlikely (cp + 10 > end_of_message))
-	{
-	  ++had_error;
-	  continue;
-	}
-
-      uint16_t type;
-      NS_GET16 (type, cp);
-      uint16_t class;
-      NS_GET16 (class, cp);
-      int32_t ttl;
-      NS_GET32 (ttl, cp);
-      NS_GET16 (n, cp);		/* RDATA length.  */
-
-      if (end_of_message - cp < n)
-	{
-	  /* RDATA extends beyond the end of the packet.  */
-	  ++had_error;
-	  continue;
+	  *h_errnop = NO_RECOVERY;
+	  return NSS_STATUS_UNAVAIL;
 	}
 
-      if (class != C_IN)
-	{
-	  cp += n;
-	  continue;
-	}
+      /* Update TTL for known record types.  */
+      if ((rr.rtype == T_CNAME || rr.rtype == qtype)
+	  && ttlp != NULL && *ttlp > rr.ttl)
+	*ttlp = rr.ttl;
 
-      if (type == T_CNAME)
+      if (rr.rtype == T_CNAME)
 	{
-	  char tbuf[MAXDNAME];
-
-	  /* A CNAME could also have a TTL entry.  */
-	  if (ttlp != NULL && ttl < *ttlp)
-	      *ttlp = ttl;
-
-	  n = __libc_dn_expand (answer->buf, end_of_message, cp,
-				tbuf, sizeof tbuf);
-	  if (__glibc_unlikely (n < 0 || __libc_res_hnok (tbuf) == 0))
-	    {
-	      ++had_error;
-	      continue;
-	    }
-	  cp += n;
-
-	  if (*firstp)
+	  /* NB: No check for owner name match, based on historic
+	     precedent.  Record the CNAME target as the new expected
+	     name.  */
+	  int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+				    name_buffer, sizeof (name_buffer));
+	  if (n < 0)
 	    {
-	      /* Reclaim buffer space.  */
-	      if (h_name + h_namelen == buffer)
-		{
-		  buffer = h_name;
-		  buflen += h_namelen;
-		}
-
-	      n = strlen (tbuf) + 1;
-	      if (__glibc_unlikely (n > buflen))
-		goto too_small;
-	      if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
-		{
-		  ++had_error;
-		  continue;
-		}
-
-	      canon = buffer;
-	      buffer = __mempcpy (buffer, tbuf, n);
-	      buflen -= n;
-	      h_namelen = 0;
+	      *h_errnop = NO_RECOVERY;
+	      return NSS_STATUS_UNAVAIL;
 	    }
-	  continue;
+	  expected_name = name_buffer;
+	  if (store_canon && __res_binary_hnok (name_buffer))
+	    /* This name can be used as a canonical name.  Do not
+	       translate to text form here to conserve buffer space.
+	       Point to the compressed name because name_buffer can be
+	       overwritten with an unusable name later.  */
+	    compressed_alias_name = rr.rdata;
 	}
-
-      /* Stop parsing if we encounter a record with incorrect RDATA
-	 length.  */
-      if (type == T_A || type == T_AAAA)
+      else if (rr.rtype == qtype
+	       && __ns_samebinaryname (rr.rname, expected_name)
+	       && rr.rdlength == rrtype_to_rdata_length (qtype))
 	{
-	  if (n != rrtype_to_rdata_length (type))
+	  struct gaih_addrtuple *ntup
+	    = alloc_buffer_alloc (abuf, struct gaih_addrtuple);
+	  /* Delay error reporting to the callers (they implement the
+	     ERANGE buffer resizing handshake).  */
+	  if (ntup != NULL)
 	    {
-	      ++had_error;
-	      continue;
+	      ntup->next = NULL;
+	      if (store_canon && compressed_alias_name != NULL)
+		{
+		  /* This assumes that all the CNAME records come
+		     first.  Use MAXHOSTNAMELEN instead of
+		     NS_MAXCDNAME for additional length checking.
+		     However, these checks are not expected to fail
+		     because all size NS_MAXCDNAME names should into
+		     the hname buffer because no escaping is
+		     needed.  */
+		  char unsigned nbuf[NS_MAXCDNAME];
+		  char hname[MAXHOSTNAMELEN + 1];
+		  if (__ns_name_unpack (c.begin, c.end,
+					compressed_alias_name,
+					nbuf, sizeof (nbuf)) >= 0
+		      && __ns_name_ntop (nbuf, hname, sizeof (hname)) >= 0)
+		    /* Space checking is performed by the callers.  */
+		    ntup->name = alloc_buffer_copy_string (abuf, hname);
+		  store_canon = false;
+		}
+	      else
+		ntup->name = NULL;
+	      if (rr.rdlength == 4)
+		ntup->family = AF_INET;
+	      else
+		ntup->family = AF_INET6;
+	      memcpy (ntup->addr, rr.rdata, rr.rdlength);
+	      ntup->scopeid = 0;
+
+	      /* Link in the new tuple, and update the tail pointer to
+		 point to its next field.  */
+	      **tailp = ntup;
+	      *tailp = &ntup->next;
+
+	      haveanswer = true;
 	    }
 	}
-      else
-	{
-	  /* Skip unknown records.  */
-	  cp += n;
-	  continue;
-	}
-
-      assert (type == T_A || type == T_AAAA);
-      if (*pat == NULL)
-	{
-	  uintptr_t pad = (-(uintptr_t) buffer
-			   % __alignof__ (struct gaih_addrtuple));
-	  buffer += pad;
-	  buflen = buflen > pad ? buflen - pad : 0;
-
-	  if (__glibc_unlikely (buflen < sizeof (struct gaih_addrtuple)))
-	    goto too_small;
-
-	  *pat = (struct gaih_addrtuple *) buffer;
-	  buffer += sizeof (struct gaih_addrtuple);
-	  buflen -= sizeof (struct gaih_addrtuple);
-	}
-
-      (*pat)->name = NULL;
-      (*pat)->next = NULL;
-
-      if (*firstp)
-	{
-	  /* We compose a single hostent out of the entire chain of
-	     entries, so the TTL of the hostent is essentially the lowest
-	     TTL in the chain.  */
-	  if (ttlp != NULL && ttl < *ttlp)
-	    *ttlp = ttl;
-
-	  (*pat)->name = canon ?: h_name;
-
-	  *firstp = 0;
-	}
-
-      (*pat)->family = type == T_A ? AF_INET : AF_INET6;
-      memcpy ((*pat)->addr, cp, n);
-      cp += n;
-      (*pat)->scopeid = 0;
-
-      pat = &((*pat)->next);
-
-      haveanswer = 1;
     }
 
   if (haveanswer)
     {
-      *patp = pat;
-      *bufferp = buffer;
-      *buflenp = buflen;
-
       *h_errnop = NETDB_SUCCESS;
       return NSS_STATUS_SUCCESS;
     }
-
-  /* Special case here: if the resolver sent a result but it only
-     contains a CNAME while we are looking for a T_A or T_AAAA record,
-     we fail with NOTFOUND instead of TRYAGAIN.  */
-  if (canon != NULL)
+  else
     {
+      /* Special case here: if the resolver sent a result but it only
+	 contains a CNAME while we are looking for a T_A or T_AAAA
+	 record, we fail with NOTFOUND.  */
       *h_errnop = HOST_NOT_FOUND;
       return NSS_STATUS_NOTFOUND;
     }
-
-  *h_errnop = NETDB_INTERNAL;
-  return NSS_STATUS_TRYAGAIN;
 }
 
 
 static enum nss_status
-gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2,
-		int anslen2, const char *qname,
-		struct gaih_addrtuple **pat, char *buffer, size_t buflen,
+gaih_getanswer (unsigned char *packet1, size_t packet1len,
+		unsigned char *packet2, size_t packet2len,
+		struct alloc_buffer *abuf, struct gaih_addrtuple **pat,
 		int *errnop, int *h_errnop, int32_t *ttlp)
 {
-  int first = 1;
-
   enum nss_status status = NSS_STATUS_NOTFOUND;
 
   /* Combining the NSS status of two distinct queries requires some
@@ -1273,7 +1025,10 @@ gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2,
      between TRYAGAIN (recoverable) and TRYAGAIN' (not-recoverable).
      A recoverable TRYAGAIN is almost always due to buffer size issues
      and returns ERANGE in errno and the caller is expected to retry
-     with a larger buffer.
+     with a larger buffer.  (The caller, _nss_dns_gethostbyname4_r,
+     ignores the return status if it detects that the result buffer
+     has been exhausted and generates a TRYAGAIN failure with an
+     ERANGE code.)
 
      Lastly, you may be tempted to make significant changes to the
      conditions in this code to bring about symmetry between responses.
@@ -1353,36 +1108,30 @@ gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2,
 	 is a recoverable error we now return TRYAGIN even if the first
 	 response was SUCCESS.  */
 
-  if (anslen1 > 0)
-    status = gaih_getanswer_slice(answer1, anslen1, qname,
-				  &pat, &buffer, &buflen,
-				  errnop, h_errnop, ttlp,
-				  &first);
-
-  if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND
-       || (status == NSS_STATUS_TRYAGAIN
-	   /* We want to look at the second answer in case of an
-	      NSS_STATUS_TRYAGAIN only if the error is non-recoverable, i.e.
-	      *h_errnop is NO_RECOVERY. If not, and if the failure was due to
-	      an insufficient buffer (ERANGE), then we need to drop the results
-	      and pass on the NSS_STATUS_TRYAGAIN to the caller so that it can
-	      repeat the query with a larger buffer.  */
-	   && (*errnop != ERANGE || *h_errnop == NO_RECOVERY)))
-      && answer2 != NULL && anslen2 > 0)
+  if (packet1len > 0)
+    {
+      status = gaih_getanswer_slice (packet1, packet1len,
+				     abuf, &pat, errnop, h_errnop, ttlp, true);
+      if (alloc_buffer_has_failed (abuf))
+	/* Do not try parsing the second packet if a larger result
+	   buffer is needed.  The caller implements the resizing
+	   protocol because *abuf has been exhausted.  */
+	return NSS_STATUS_TRYAGAIN; /* Ignored by the caller.  */
+    }
+
+  if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND)
+      && packet2 != NULL && packet2len > 0)
     {
-      enum nss_status status2 = gaih_getanswer_slice(answer2, anslen2, qname,
-						     &pat, &buffer, &buflen,
-						     errnop, h_errnop, ttlp,
-						     &first);
+      enum nss_status status2
+	= gaih_getanswer_slice (packet2, packet2len,
+				abuf, &pat, errnop, h_errnop, ttlp,
+				/* Success means that data with a
+				   canonical name has already been
+				   stored.  Do not store the name again.  */
+				status != NSS_STATUS_SUCCESS);
       /* Use the second response status in some cases.  */
       if (status != NSS_STATUS_SUCCESS && status2 != NSS_STATUS_NOTFOUND)
 	status = status2;
-      /* Do not return a truncated second response (unless it was
-	 unavoidable e.g. unrecoverable TRYAGAIN).  */
-      if (status == NSS_STATUS_SUCCESS
-	  && (status2 == NSS_STATUS_TRYAGAIN
-	      && *errnop == ERANGE && *h_errnop != NO_RECOVERY))
-	status = NSS_STATUS_TRYAGAIN;
     }
 
   return status;
diff --git a/resolv/res-name-checking.c b/resolv/res-name-checking.c
index 07a412d8f..213edceaf 100644
--- a/resolv/res-name-checking.c
+++ b/resolv/res-name-checking.c
@@ -138,6 +138,12 @@ binary_leading_dash (const unsigned char *dn)
   return dn[0] > 0 && dn[1] == '-';
 }
 
+bool
+__res_binary_hnok (const unsigned char *dn)
+{
+  return !binary_leading_dash (dn) && binary_hnok (dn);
+}
+
 /* Return 1 if res_hnok is a valid host name.  Labels must only
    contain [0-9a-zA-Z_-] characters, and the name must not start with
    a '-'.  The latter is to avoid confusion with program options.  */
@@ -145,11 +151,9 @@ int
 ___res_hnok (const char *dn)
 {
   unsigned char buf[NS_MAXCDNAME];
-  if (!printable_string (dn)
-      || __ns_name_pton (dn, buf, sizeof (buf)) < 0
-      || binary_leading_dash (buf))
-    return 0;
-  return binary_hnok (buf);
+  return (printable_string (dn)
+	  && __ns_name_pton (dn, buf, sizeof (buf)) >= 0
+	  && __res_binary_hnok (buf));
 }
 versioned_symbol (libc, ___res_hnok, res_hnok, GLIBC_2_34);
 versioned_symbol (libc, ___res_hnok, __libc_res_hnok, GLIBC_PRIVATE);
diff --git a/resolv/tst-ns_name_length_uncompressed.c b/resolv/tst-ns_name_length_uncompressed.c
new file mode 100644
index 000000000..c4a2904db
--- /dev/null
+++ b/resolv/tst-ns_name_length_uncompressed.c
@@ -0,0 +1,135 @@
+/* Test __ns_name_length_uncompressed.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <array_length.h>
+#include <errno.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/next_to_fault.h>
+
+/* Reference implementation based on other building blocks.  */
+static int
+reference_length (const unsigned char *p, const unsigned char *eom)
+{
+  unsigned char buf[NS_MAXCDNAME];
+  int n = __ns_name_unpack (p, eom, p, buf, sizeof (buf));
+  if (n < 0)
+    return n;
+  const unsigned char *q = buf;
+  if (__ns_name_skip (&q, array_end (buf)) < 0)
+    return -1;
+  if (q - buf != n)
+    /* Compressed name.  */
+    return -1;
+  return n;
+}
+
+static int
+do_test (void)
+{
+  {
+    unsigned char buf[] = { 3, 'w', 'w', 'w', 0, 0, 0 };
+    TEST_COMPARE (reference_length (buf, array_end (buf)), sizeof (buf) - 2);
+    TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)),
+                  sizeof (buf) - 2);
+    TEST_COMPARE (reference_length (array_end (buf) - 1, array_end (buf)), 1);
+    TEST_COMPARE (__ns_name_length_uncompressed (array_end (buf) - 1,
+                                                 array_end (buf)), 1);
+    buf[4]  = 0xc0;             /* Forward compression reference.  */
+    buf[5]  = 0x06;
+    TEST_COMPARE (reference_length (buf, array_end (buf)), -1);
+    TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)), -1);
+  }
+
+  struct support_next_to_fault ntf = support_next_to_fault_allocate (300);
+
+  /* Buffer region with all possible bytes at start and end.  */
+  for (int length = 1; length <= 300; ++length)
+    {
+      unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+      unsigned char *start = end - length;
+      memset (start, 'X', length);
+      for (int first = 0; first <= 255; ++first)
+        {
+          *start = first;
+          for (int last = 0; last <= 255; ++last)
+            {
+              start[length - 1] = last;
+              TEST_COMPARE (reference_length (start, end),
+                            __ns_name_length_uncompressed (start, end));
+            }
+        }
+    }
+
+  /* Poor man's fuzz testing: patch two bytes.   */
+  {
+    unsigned char ref[] =
+      {
+        7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 3, 'n', 'e', 't', 0, 0, 0
+      };
+    TEST_COMPARE (reference_length (ref, array_end (ref)), 13);
+    TEST_COMPARE (__ns_name_length_uncompressed (ref, array_end (ref)), 13);
+
+    int good = 0;
+    int bad = 0;
+    for (int length = 1; length <= sizeof (ref); ++length)
+      {
+        unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+        unsigned char *start = end - length;
+        memcpy (start, ref, length);
+
+        for (int patch1_pos = 0; patch1_pos < length; ++patch1_pos)
+          {
+            for (int patch1_value = 0; patch1_value <= 255; ++patch1_value)
+              {
+                start[patch1_pos] = patch1_value;
+                for (int patch2_pos = 0; patch2_pos < length; ++patch2_pos)
+                  {
+                    for (int patch2_value = 0; patch2_value <= 255;
+                         ++patch2_value)
+                      {
+                        start[patch2_pos] = patch2_value;
+                        int expected = reference_length (start, end);
+                        errno = EINVAL;
+                        int actual
+                          =  __ns_name_length_uncompressed (start, end);
+                        if (actual > 0)
+                          ++good;
+                        else
+                          {
+                            TEST_COMPARE (errno, EMSGSIZE);
+                            ++bad;
+                          }
+                        TEST_COMPARE (expected, actual);
+                      }
+                    start[patch2_pos] = ref[patch2_pos];
+                  }
+              }
+            start[patch1_pos] = ref[patch1_pos];
+          }
+      }
+    printf ("info: patched inputs with success: %d\n", good);
+    printf ("info: patched inputs with failure: %d\n", bad);
+  }
+
+  support_next_to_fault_free (&ntf);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-ns_rr_cursor.c b/resolv/tst-ns_rr_cursor.c
new file mode 100644
index 000000000..c3c090890
--- /dev/null
+++ b/resolv/tst-ns_rr_cursor.c
@@ -0,0 +1,227 @@
+/* Tests for resource record parsing.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/next_to_fault.h>
+
+/* Reference packet for packet parsing.  */
+static const unsigned char valid_packet[] =
+  { 0x11, 0x12, 0x13, 0x14,
+    0x00, 0x01,               /* Question count.  */
+    0x00, 0x02,               /* Answer count.  */
+    0x21, 0x22, 0x23, 0x24,   /* Other counts (not actually in packet).  */
+    3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0,
+    0x00, 0x1c,               /* Question type: AAAA.  */
+    0x00, 0x01,               /* Question class: IN.  */
+    0xc0, 0x0c,               /* Compression reference to QNAME.  */
+    0x00, 0x1c,               /* Record type: AAAA.  */
+    0x00, 0x01,               /* Record class: IN.  */
+    0x12, 0x34, 0x56, 0x78,   /* Record TTL.  */
+    0x00, 0x10,               /* Record data length (16 bytes).  */
+    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* IPv6 address.  */
+    0xc0, 0x0c,               /* Compression reference to QNAME.  */
+    0x00, 0x1c,               /* Record type: AAAA.  */
+    0x00, 0x01,               /* Record class: IN.  */
+    0x11, 0x33, 0x55, 0x77,   /* Record TTL.  */
+    0x00, 0x10,               /* Record data length (16 bytes).  */
+    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* IPv6 address.  */
+  };
+
+/* Special offsets in valid_packet.  */
+enum
+  {
+    offset_of_first_record = 29,
+    offset_of_second_record = 57,
+  };
+
+/* Check that parsing valid_packet succeeds.  */
+static void
+test_valid (void)
+{
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, valid_packet,
+                                         sizeof (valid_packet)));
+  TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+  TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+  TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+  TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+  TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+  TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+  TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+  TEST_COMPARE (c.current - valid_packet, offset_of_first_record);
+
+  struct ns_rr_wire r;
+  TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+  TEST_COMPARE (r.rtype, T_AAAA);
+  TEST_COMPARE (r.rclass, C_IN);
+  TEST_COMPARE (r.ttl, 0x12345678);
+  TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+                     "\x90\x91\x92\x93\x94\x95\x96\x97"
+                     "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16);
+  TEST_COMPARE (c.current - valid_packet, offset_of_second_record);
+  TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+  TEST_COMPARE (r.rtype, T_AAAA);
+  TEST_COMPARE (r.rclass, C_IN);
+  TEST_COMPARE (r.ttl, 0x11335577);
+  TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+                     "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+                     "\xa8\xa9\xaa\xab\xac\xad\xae\xaf", 16);
+  TEST_VERIFY (c.current == c.end);
+}
+
+/* Check that trying to parse a packet with a compressed QNAME fails.  */
+static void
+test_compressed_qname (void)
+{
+  static const unsigned char packet[] =
+    { 0x11, 0x12, 0x13, 0x14,
+      0x00, 0x01,               /* Question count.  */
+      0x00, 0x00,               /* Answer count.  */
+      0x00, 0x00, 0x00, 0x00,   /* Other counts.  */
+      3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+      0x00, 0x01,               /* Question type: A.  */
+      0x00, 0x01,               /* Question class: IN.  */
+    };
+
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet)));
+}
+
+/* Check that trying to parse a packet with two questions fails.  */
+static void
+test_two_questions (void)
+{
+  static const unsigned char packet[] =
+    { 0x11, 0x12, 0x13, 0x14,
+      0x00, 0x02,               /* Question count.  */
+      0x00, 0x00,               /* Answer count.  */
+      0x00, 0x00, 0x00, 0x00,   /* Other counts.  */
+      3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+      0x00, 0x01,               /* Question type: A.  */
+      0x00, 0x01,               /* Question class: IN.  */
+      3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+      0x00, 0x1c,               /* Question type: AAAA.  */
+      0x00, 0x01,               /* Question class: IN.  */
+    };
+
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet)));
+}
+
+/* Used to check that parsing truncated packets does not over-read.  */
+static struct support_next_to_fault ntf;
+
+/* Truncated packet in the second resource record.  */
+static void
+test_truncated_one_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer - ntf.length;
+  unsigned char *start = end - length;
+
+  /* Produce the truncated packet.  */
+  memcpy (start, valid_packet, length);
+
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length));
+  TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+  TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+  TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+  TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+  TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+  TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+  TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+  TEST_COMPARE (c.current - start, offset_of_first_record);
+
+  struct ns_rr_wire r;
+  TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+  TEST_COMPARE (r.rtype, T_AAAA);
+  TEST_COMPARE (r.rclass, C_IN);
+  TEST_COMPARE (r.ttl, 0x12345678);
+  TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+                     "\x90\x91\x92\x93\x94\x95\x96\x97"
+                     "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16);
+  TEST_COMPARE (c.current - start, offset_of_second_record);
+  TEST_VERIFY (!__ns_rr_cursor_next (&c, &r));
+}
+
+/* Truncated packet in the first resource record.  */
+static void
+test_truncated_no_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer - ntf.length;
+  unsigned char *start = end - length;
+
+  /* Produce the truncated packet.  */
+  memcpy (start, valid_packet, length);
+
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length));
+  TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+  TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+  TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+  TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+  TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+  TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+  TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+  TEST_COMPARE (c.current - start, offset_of_first_record);
+
+  struct ns_rr_wire r;
+  TEST_VERIFY (!__ns_rr_cursor_next (&c, &r));
+}
+
+/* Truncated packet before first resource record.  */
+static void
+test_truncated_before_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer - ntf.length;
+  unsigned char *start = end - length;
+
+  /* Produce the truncated packet.  */
+  memcpy (start, valid_packet, length);
+
+  struct ns_rr_cursor c;
+  TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, start, length));
+}
+
+static int
+do_test (void)
+{
+  ntf = support_next_to_fault_allocate (sizeof (valid_packet));
+
+  test_valid ();
+  test_compressed_qname ();
+  test_two_questions ();
+
+  for (int length = offset_of_second_record; length < sizeof (valid_packet);
+       ++length)
+    test_truncated_one_rr (length);
+  for (int length = offset_of_first_record; length < offset_of_second_record;
+       ++length)
+    test_truncated_no_rr (length);
+  for (int length = 0; length < offset_of_first_record; ++length)
+    test_truncated_before_rr (length);
+
+  support_next_to_fault_free (&ntf);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-ns_samebinaryname.c b/resolv/tst-ns_samebinaryname.c
new file mode 100644
index 000000000..b06ac610b
--- /dev/null
+++ b/resolv/tst-ns_samebinaryname.c
@@ -0,0 +1,62 @@
+/* Test the __ns_samebinaryname function.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/nameser.h>
+#include <array_length.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <support/check.h>
+
+/* First character denotes the comparison group: All names with the
+   same first character are expected to compare equal.  */
+static const char *const cases[] =
+  {
+    " ",
+    "1\001a", "1\001A",
+    "2\002ab", "2\002aB", "2\002Ab", "2\002AB",
+    "3\001a\002ab", "3\001A\002ab",
+    "w\003www\007example\003com", "w\003Www\007Example\003Com",
+    "w\003WWW\007EXAMPLE\003COM",
+    "W\003WWW", "W\003www",
+  };
+
+static int
+do_test (void)
+{
+  for (int i = 0; i < array_length (cases); ++i)
+    for (int j = 0; j < array_length (cases); ++j)
+      {
+        unsigned char *a = (unsigned char *) &cases[i][1];
+        unsigned char *b = (unsigned char *) &cases[j][1];
+        bool actual = __ns_samebinaryname (a, b);
+        bool expected = cases[i][0] == cases[j][0];
+        if (actual != expected)
+          {
+            char a1[NS_MAXDNAME];
+            TEST_VERIFY (ns_name_ntop (a, a1, sizeof (a1)) > 0);
+            char b1[NS_MAXDNAME];
+            TEST_VERIFY (ns_name_ntop (b, b1, sizeof (b1)) > 0);
+            printf ("error: \"%s\" \"%s\": expected %s\n",
+                    a1, b1, expected ? "equal" : "unqueal");
+            support_record_failure ();
+          }
+      }
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-resolv-aliases.c b/resolv/tst-resolv-aliases.c
new file mode 100644
index 000000000..b212823aa
--- /dev/null
+++ b/resolv/tst-resolv-aliases.c
@@ -0,0 +1,254 @@
+/* Test alias handling (mainly for gethostbyname).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+
+#include "tst-resolv-maybe_insert_sig.h"
+
+/* QNAME format:
+
+   aADDRESSES-cCNAMES.example.net
+
+   CNAMES is the length of the CNAME chain, ADDRESSES is the number of
+   addresses in the response.  The special value 255 means that there
+   are no addresses, and the RCODE is NXDOMAIN.  */
+static void
+response (const struct resolv_response_context *ctx,
+          struct resolv_response_builder *b,
+          const char *qname, uint16_t qclass, uint16_t qtype)
+{
+  TEST_COMPARE (qclass, C_IN);
+  if (qtype != T_A)
+    TEST_COMPARE (qtype, T_AAAA);
+
+  unsigned int addresses, cnames;
+  char *tail;
+  if (sscanf (qname, "a%u-c%u%ms", &addresses, &cnames, &tail) == 3)
+    {
+      if (strcmp (tail, ".example.com") == 0
+          || strcmp (tail, ".example.net.example.net") == 0
+          || strcmp (tail, ".example.net.example.com") == 0)
+        /* These only happen after NXDOMAIN.  */
+        TEST_VERIFY (addresses == 255);
+      else if (strcmp (tail, ".example.net") != 0)
+        FAIL_EXIT1 ("invalid QNAME: %s", qname);
+    }
+  free (tail);
+
+  int rcode;
+  if (addresses == 255)
+    {
+      /* Special case: Use no addresses with NXDOMAIN response.  */
+      rcode = ns_r_nxdomain;
+      addresses = 0;
+    }
+  else
+    rcode = 0;
+
+  struct resolv_response_flags flags = { .rcode = rcode };
+  resolv_response_init (b, flags);
+  resolv_response_add_question (b, qname, qclass, qtype);
+  resolv_response_section (b, ns_s_an);
+  maybe_insert_sig (b, qname);
+
+  /* Provide the requested number of CNAME records.  */
+  char *previous_name = (char *) qname;
+  for (int unique = 0; unique < cnames; ++unique)
+    {
+      resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+      char *new_name = xasprintf ("%d.alias.example", unique);
+      resolv_response_add_name (b, new_name);
+      resolv_response_close_record (b);
+
+      maybe_insert_sig (b, qname);
+
+      if (previous_name != qname)
+        free (previous_name);
+      previous_name = new_name;
+    }
+
+  for (int unique = 0; unique < addresses; ++unique)
+    {
+      resolv_response_open_record (b, previous_name, qclass, qtype, 60);
+
+      if (qtype == T_A)
+        {
+          char ipv4[4] = {192, 0, 2, 1 + unique};
+          resolv_response_add_data (b, &ipv4, sizeof (ipv4));
+        }
+      else if (qtype == T_AAAA)
+        {
+          char ipv6[16] =
+            {
+              0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+              1 + unique
+            };
+          resolv_response_add_data (b, &ipv6, sizeof (ipv6));
+        }
+      resolv_response_close_record (b);
+    }
+
+  if (previous_name != qname)
+    free (previous_name);
+}
+
+static char *
+make_qname (bool do_search, int cnames, int addresses)
+{
+  return xasprintf ("a%d-c%d%s",
+                    addresses, cnames, do_search ? "" : ".example.net");
+}
+
+static void
+check_cnames_failure (int af, bool do_search, int cnames, int addresses)
+{
+  char *qname = make_qname (do_search, cnames, addresses);
+
+  struct hostent *e;
+  if (af == AF_UNSPEC)
+    e = gethostbyname (qname);
+  else
+    e = gethostbyname2 (qname, af);
+
+  if (addresses == 0)
+    check_hostent (qname, e, "error: NO_RECOVERY\n");
+  else
+    check_hostent (qname, e, "error: HOST_NOT_FOUND\n");
+
+  free (qname);
+}
+
+static void
+check (int af, bool do_search, int cnames, int addresses)
+{
+  char *qname = make_qname (do_search, cnames, addresses);
+  char *fqdn = make_qname (false, cnames, addresses);
+
+  struct hostent *e;
+  if (af == AF_UNSPEC)
+    e = gethostbyname (qname);
+  else
+    e = gethostbyname2 (qname, af);
+  if (e == NULL)
+    FAIL_EXIT1 ("unexpected failure for %d, %d, %d", af, cnames, addresses);
+
+  if (af == AF_UNSPEC || af == AF_INET)
+    {
+      TEST_COMPARE (e->h_addrtype, AF_INET);
+      TEST_COMPARE (e->h_length, 4);
+    }
+  else
+    {
+      TEST_COMPARE (e->h_addrtype, AF_INET6);
+      TEST_COMPARE (e->h_length, 16);
+    }
+
+  for (int i = 0; i < addresses; ++i)
+    {
+      char ipv4[4] = {192, 0, 2, 1 + i};
+      char ipv6[16] =
+        { 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + i };
+      char *expected = e->h_addrtype == AF_INET ? ipv4 : ipv6;
+      TEST_COMPARE_BLOB (e->h_addr_list[i], e->h_length,
+                         expected, e->h_length);
+    }
+  TEST_VERIFY (e->h_addr_list[addresses] == NULL);
+
+
+  if (cnames == 0)
+    {
+      /* QNAME is fully qualified.  */
+      TEST_COMPARE_STRING (e->h_name, fqdn);
+      TEST_VERIFY (e->h_aliases[0] == NULL);
+    }
+  else
+   {
+     /* Fully-qualified QNAME is demoted to an aliases.  */
+     TEST_COMPARE_STRING (e->h_aliases[0], fqdn);
+
+     for (int i = 1; i <= cnames; ++i)
+       {
+         char *expected = xasprintf ("%d.alias.example", i - 1);
+         if (i == cnames)
+           TEST_COMPARE_STRING (e->h_name, expected);
+         else
+           TEST_COMPARE_STRING (e->h_aliases[i], expected);
+         free (expected);
+       }
+     TEST_VERIFY (e->h_aliases[cnames] == NULL);
+   }
+
+  free (fqdn);
+  free (qname);
+}
+
+static int
+do_test (void)
+{
+  struct resolv_test *obj = resolv_test_start
+    ((struct resolv_redirect_config)
+     {
+       .response_callback = response,
+       .search = { "example.net", "example.com" },
+     });
+
+  static const int families[] = { AF_UNSPEC, AF_INET, AF_INET6 };
+
+  for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig)
+    {
+      insert_sig = do_insert_sig;
+
+      /* If do_search is true, a bare host name (for example, a1-c1)
+         is used.  This exercises search path processing and FQDN
+         qualification.  */
+      for (int do_search = 0; do_search < 2; ++do_search)
+        for (const int *paf = families; paf != array_end (families); ++paf)
+          {
+            for (int cnames = 0; cnames <= 100; ++cnames)
+              {
+                check_cnames_failure (*paf, do_search, cnames, 0);
+                /* Now with NXDOMAIN responses.  */
+                check_cnames_failure (*paf, do_search, cnames, 255);
+              }
+
+            for (int cnames = 0; cnames <= 10; ++cnames)
+              for (int addresses = 1; addresses <= 10; ++addresses)
+                check (*paf, do_search, cnames, addresses);
+
+            /* The current implementation is limited to 47 aliases.
+               Addresses do not have such a limit.  */
+            check (*paf, do_search, 47, 60);
+          }
+    }
+
+  resolv_test_end (obj);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-resolv-byaddr.c b/resolv/tst-resolv-byaddr.c
new file mode 100644
index 000000000..6299e8983
--- /dev/null
+++ b/resolv/tst-resolv-byaddr.c
@@ -0,0 +1,326 @@
+/* Test reverse DNS lookup.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/next_to_fault.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+
+#include "tst-resolv-maybe_insert_sig.h"
+
+/* QNAME format:
+
+   ADDRESSES.CNAMES...(lots of 0s)...8.b.d.0.1.0.0.2.ip6.arpa.
+   CNAMES|ADDRESSES.2.0.192.in-addr-arpa.
+
+   For the IPv4 reverse lookup, the address count is in the lower
+   bits.
+
+   CNAMES is the length of the CNAME chain, ADDRESSES is the number of
+   addresses in the response.  The special value 15 means that there
+   are no addresses, and the RCODE is NXDOMAIN.  */
+static void
+response (const struct resolv_response_context *ctx,
+          struct resolv_response_builder *b,
+          const char *qname, uint16_t qclass, uint16_t qtype)
+{
+  TEST_COMPARE (qclass, C_IN);
+  TEST_COMPARE (qtype, T_PTR);
+
+  unsigned int addresses, cnames, bits;
+  char *tail;
+  if (strstr (qname, "ip6.arpa") != NULL
+      && sscanf (qname, "%x.%x.%ms", &addresses, &cnames, &tail) == 3)
+    TEST_COMPARE_STRING (tail, "\
+0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa");
+  else if (sscanf (qname, "%u.%ms", &bits, &tail) == 2)
+    {
+      TEST_COMPARE_STRING (tail, "2.0.192.in-addr.arpa");
+      addresses = bits & 0x0f;
+      cnames = bits >> 4;
+    }
+  else
+    FAIL_EXIT1 ("invalid QNAME: %s", qname);
+  free (tail);
+
+  int rcode;
+  if (addresses == 15)
+    {
+      /* Special case: Use no addresses with NXDOMAIN response.  */
+      rcode = ns_r_nxdomain;
+      addresses = 0;
+    }
+  else
+    rcode = 0;
+
+  struct resolv_response_flags flags = { .rcode = rcode };
+  resolv_response_init (b, flags);
+  resolv_response_add_question (b, qname, qclass, qtype);
+  resolv_response_section (b, ns_s_an);
+  maybe_insert_sig (b, qname);
+
+  /* Provide the requested number of CNAME records.  */
+  char *previous_name = (char *) qname;
+  for (int unique = 0; unique < cnames; ++unique)
+    {
+      resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+      char *new_name = xasprintf ("%d.alias.example", unique);
+      resolv_response_add_name (b, new_name);
+      resolv_response_close_record (b);
+
+      maybe_insert_sig (b, qname);
+
+      if (previous_name != qname)
+        free (previous_name);
+      previous_name = new_name;
+    }
+
+  for (int unique = 0; unique < addresses; ++unique)
+    {
+      resolv_response_open_record (b, previous_name, qclass, T_PTR, 60);
+      char *ptr = xasprintf ("unique-%d.cnames-%u.addresses-%u.example",
+                             unique, cnames, addresses);
+      resolv_response_add_name (b, ptr);
+      free (ptr);
+      resolv_response_close_record (b);
+    }
+
+  if (previous_name != qname)
+    free (previous_name);
+}
+
+/* Used to check that gethostbyaddr_r does not write past the buffer
+   end.  */
+static struct support_next_to_fault ntf;
+
+/* Perform a gethostbyaddr call and check the result.  */
+static void
+check_gethostbyaddr (const char *address, const char *expected)
+{
+  unsigned char bytes[16];
+  unsigned int byteslen;
+  int family;
+  if (strchr (address, ':') != NULL)
+    {
+      family = AF_INET6;
+      byteslen = 16;
+    }
+  else
+    {
+      family = AF_INET;
+      byteslen = 4;
+    }
+  TEST_COMPARE (inet_pton (family, address, bytes), 1);
+
+  struct hostent *e = gethostbyaddr (bytes, byteslen, family);
+  check_hostent (address, e, expected);
+
+  if (e == NULL)
+    return;
+
+  /* Try gethostbyaddr_r with increasing sizes until success.  First
+     compute a reasonable minimum buffer size, to avoid many pointless
+     attempts.  */
+  size_t minimum_size = strlen (e->h_name);
+  for (int i = 0; e->h_addr_list[i] != NULL; ++i)
+    minimum_size += e->h_length + sizeof (char *);
+  for (int i = 0; e->h_aliases[i] != NULL; ++i)
+    minimum_size += strlen (e->h_aliases[i]) + 1 + sizeof (char *);
+
+  /* Gradually increase the size until success.  */
+  for (size_t size = minimum_size; size < ntf.length; ++size)
+    {
+      struct hostent result;
+      int herrno;
+      int ret = gethostbyaddr_r (bytes, byteslen, family, &result,
+                                 ntf.buffer + ntf.length - size, size,
+                                 &e, &herrno);
+      if (ret == ERANGE)
+        /* Retry with larger size.  */
+        TEST_COMPARE (herrno, NETDB_INTERNAL);
+      else if (ret == 0)
+        {
+         TEST_VERIFY (size > minimum_size);
+         check_hostent (address, e, expected);
+         return;
+        }
+      else
+        FAIL_EXIT1 ("Unexpected gethostbyaddr_r failure: %d", ret);
+    }
+
+  FAIL_EXIT1 ("gethostbyaddr_r always failed for: %s", address);
+}
+
+/* Perform a getnameinfo call and check the result.  */
+static void
+check_getnameinfo (const char *address, const char *expected)
+{
+  struct sockaddr_in sin = { };
+  struct sockaddr_in6 sin6 = { };
+  void *sa;
+  socklen_t salen;
+  if (strchr (address, ':') != NULL)
+    {
+      sin6.sin6_family = AF_INET6;
+      TEST_COMPARE (inet_pton (AF_INET6, address, &sin6.sin6_addr), 1);
+      sin6.sin6_port = htons (80);
+      sa = &sin6;
+      salen = sizeof (sin6);
+    }
+  else
+    {
+      sin.sin_family = AF_INET;
+      TEST_COMPARE (inet_pton (AF_INET, address, &sin.sin_addr), 1);
+      sin.sin_port = htons (80);
+      sa = &sin;
+      salen = sizeof (sin);
+    }
+
+  char host[64];
+  char service[64];
+  int ret = getnameinfo (sa, salen, host,
+                         sizeof (host), service, sizeof (service),
+                         NI_NAMEREQD | NI_NUMERICSERV);
+  switch (ret)
+    {
+    case 0:
+      TEST_COMPARE_STRING (host, expected);
+      TEST_COMPARE_STRING (service, "80");
+      break;
+    case EAI_SYSTEM:
+      TEST_COMPARE_STRING (strerror (errno), expected);
+      break;
+    default:
+      TEST_COMPARE_STRING (gai_strerror (ret), expected);
+    }
+}
+
+static int
+do_test (void)
+{
+  /* Some reasonably upper bound for the maximum response size.  */
+  ntf = support_next_to_fault_allocate (4096);
+
+  struct resolv_test *obj = resolv_test_start
+    ((struct resolv_redirect_config)
+     {
+       .response_callback = response
+     });
+
+  for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig)
+    {
+      insert_sig = do_insert_sig;
+
+      /* No PTR record, RCODE=0.  */
+      check_gethostbyaddr ("192.0.2.0", "error: NO_RECOVERY\n");
+      check_getnameinfo ("192.0.2.0", "Name or service not known");
+      check_gethostbyaddr ("192.0.2.16", "error: NO_RECOVERY\n");
+      check_getnameinfo ("192.0.2.16", "Name or service not known");
+      check_gethostbyaddr ("192.0.2.32", "error: NO_RECOVERY\n");
+      check_getnameinfo ("192.0.2.32", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::", "error: NO_RECOVERY\n");
+      check_getnameinfo ("2001:db8::", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::10", "error: NO_RECOVERY\n");
+      check_getnameinfo ("2001:db8::10", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::20", "error: NO_RECOVERY\n");
+      check_getnameinfo ("2001:db8::20", "Name or service not known");
+
+      /* No PTR record, NXDOMAIN.  */
+      check_gethostbyaddr ("192.0.2.15", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("192.0.2.15", "Name or service not known");
+      check_gethostbyaddr ("192.0.2.31", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("192.0.2.31", "Name or service not known");
+      check_gethostbyaddr ("192.0.2.47", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("192.0.2.47", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::f", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("2001:db8::f", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::1f", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("2001:db8::1f", "Name or service not known");
+      check_gethostbyaddr ("2001:db8::2f", "error: HOST_NOT_FOUND\n");
+      check_getnameinfo ("2001:db8::2f", "Name or service not known");
+
+      /* Actual response data.  Only the first PTR record is returned.  */
+      check_gethostbyaddr ("192.0.2.1",
+                           "name: unique-0.cnames-0.addresses-1.example\n"
+                           "address: 192.0.2.1\n");
+      check_getnameinfo ("192.0.2.1",
+                         "unique-0.cnames-0.addresses-1.example");
+      check_gethostbyaddr ("192.0.2.17",
+                           "name: unique-0.cnames-1.addresses-1.example\n"
+                           "address: 192.0.2.17\n");
+      check_getnameinfo ("192.0.2.17",
+                         "unique-0.cnames-1.addresses-1.example");
+      check_gethostbyaddr ("192.0.2.18",
+                           "name: unique-0.cnames-1.addresses-2.example\n"
+                           "address: 192.0.2.18\n");
+      check_getnameinfo ("192.0.2.18",
+                         "unique-0.cnames-1.addresses-2.example");
+      check_gethostbyaddr ("192.0.2.33",
+                           "name: unique-0.cnames-2.addresses-1.example\n"
+                           "address: 192.0.2.33\n");
+      check_getnameinfo ("192.0.2.33",
+                         "unique-0.cnames-2.addresses-1.example");
+      check_gethostbyaddr ("192.0.2.34",
+                           "name: unique-0.cnames-2.addresses-2.example\n"
+                           "address: 192.0.2.34\n");
+      check_getnameinfo ("192.0.2.34",
+                         "unique-0.cnames-2.addresses-2.example");
+
+      /* Same for IPv6 addresses.  */
+      check_gethostbyaddr ("2001:db8::1",
+                           "name: unique-0.cnames-0.addresses-1.example\n"
+                           "address: 2001:db8::1\n");
+      check_getnameinfo ("2001:db8::1",
+                         "unique-0.cnames-0.addresses-1.example");
+      check_gethostbyaddr ("2001:db8::11",
+                           "name: unique-0.cnames-1.addresses-1.example\n"
+                           "address: 2001:db8::11\n");
+      check_getnameinfo ("2001:db8::11",
+                         "unique-0.cnames-1.addresses-1.example");
+      check_gethostbyaddr ("2001:db8::12",
+                           "name: unique-0.cnames-1.addresses-2.example\n"
+                           "address: 2001:db8::12\n");
+      check_getnameinfo ("2001:db8::12",
+                         "unique-0.cnames-1.addresses-2.example");
+      check_gethostbyaddr ("2001:db8::21",
+                           "name: unique-0.cnames-2.addresses-1.example\n"
+                           "address: 2001:db8::21\n");
+      check_getnameinfo ("2001:db8::21",
+                         "unique-0.cnames-2.addresses-1.example");
+      check_gethostbyaddr ("2001:db8::22",
+                           "name: unique-0.cnames-2.addresses-2.example\n"
+                           "address: 2001:db8::22\n");
+      check_getnameinfo ("2001:db8::22",
+                         "unique-0.cnames-2.addresses-2.example");
+    }
+
+  resolv_test_end (obj);
+
+  support_next_to_fault_free (&ntf);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-resolv-invalid-cname.c b/resolv/tst-resolv-invalid-cname.c
new file mode 100644
index 000000000..63dac90e0
--- /dev/null
+++ b/resolv/tst-resolv-invalid-cname.c
@@ -0,0 +1,406 @@
+/* Test handling of CNAMEs with non-host domain names (bug 12154).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <netdb.h>
+#include <resolv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+#include <support/xmemstream.h>
+
+/* Query strings describe the CNAME chain in the response.  They have
+   the format "bitsBITS.countCOUNT.example.", where BITS and COUNT are
+   replaced by unsigned decimal numbers.  COUNT is the number of CNAME
+   records in the response.  BITS has two bits for each CNAME record,
+   describing a special prefix that is added to that CNAME.
+
+   0: No special leading label.
+   1: Starting with "*.".
+   2: Starting with "-x.".
+   3: Starting with "star.*.".
+
+   The first CNAME in the response using the two least significant
+   bits.
+
+   For PTR queries, the QNAME format is different, it is either
+   COUNT.BITS.168.192.in-addr.arpa. (with BITS and COUNT still
+   decimal), or:
+
+COUNT.BITS0.BITS1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa.
+
+   where BITS and COUNT are hexadecimal.  */
+
+static void
+response (const struct resolv_response_context *ctx,
+          struct resolv_response_builder *b,
+          const char *qname, uint16_t qclass, uint16_t qtype)
+{
+  TEST_COMPARE (qclass, C_IN);
+
+  /* The only other query type besides A is PTR.  */
+  if (qtype != T_A && qtype != T_AAAA)
+    TEST_COMPARE (qtype, T_PTR);
+
+  unsigned int bits, bits1, count;
+  char *tail = NULL;
+  if (sscanf (qname, "bits%u.count%u.%ms", &bits, &count, &tail) == 3)
+    TEST_COMPARE_STRING (tail, "example");
+  else if (strstr (qname, "in-addr.arpa") != NULL
+           && sscanf (qname, "%u.%u.%ms", &bits, &count, &tail) == 3)
+    TEST_COMPARE_STRING (tail, "168.192.in-addr.arpa");
+  else if (sscanf (qname, "%x.%x.%x.%ms", &bits, &bits1, &count, &tail) == 4)
+    {
+      TEST_COMPARE_STRING (tail, "\
+0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa");
+      bits |= bits1 << 4;
+    }
+  else
+    FAIL_EXIT1 ("invalid QNAME: %s\n", qname);
+  free (tail);
+
+  struct resolv_response_flags flags = {};
+  resolv_response_init (b, flags);
+  resolv_response_add_question (b, qname, qclass, qtype);
+  resolv_response_section (b, ns_s_an);
+
+  /* Provide the requested number of CNAME records.  */
+  char *previous_name = (char *) qname;
+  unsigned int original_bits = bits;
+  for (int unique = 0; unique < count; ++unique)
+    {
+      resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+
+      static const char bits_to_prefix[4][8] = { "", "*.", "-x.", "star.*." };
+      char *new_name = xasprintf ("%sunique%d.example",
+                                  bits_to_prefix[bits & 3], unique);
+      bits >>= 2;
+      resolv_response_add_name (b, new_name);
+      resolv_response_close_record (b);
+
+      if (previous_name != qname)
+        free (previous_name);
+      previous_name = new_name;
+    }
+
+  /* Actual answer record.  */
+  resolv_response_open_record (b, previous_name, qclass, qtype, 60);
+  switch (qtype)
+    {
+    case T_A:
+      {
+        char ipv4[4] = {192, 168, count, original_bits};
+        resolv_response_add_data (b, &ipv4, sizeof (ipv4));
+      }
+      break;
+    case T_AAAA:
+      {
+        char ipv6[16] =
+          {
+            0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            count, original_bits
+          };
+        resolv_response_add_data (b, &ipv6, sizeof (ipv6));
+      }
+      break;
+
+    case T_PTR:
+      {
+        char *name = xasprintf ("bits%u.count%u.example",
+                                original_bits, count);
+        resolv_response_add_name (b, name);
+        free (name);
+      }
+      break;
+    }
+  resolv_response_close_record (b);
+
+  if (previous_name != qname)
+    free (previous_name);
+}
+
+/* Controls which name resolution function is invoked.  */
+enum test_mode
+  {
+    byname,                     /* gethostbyname.  */
+    byname2,                    /* gethostbyname2.  */
+    gai,                        /* getaddrinfo without AI_CANONNAME.  */
+    gai_canon,                  /* getaddrinfo with AI_CANONNAME.  */
+
+    test_mode_num               /* Number of enum values.  */
+  };
+
+static const char *
+test_mode_to_string (enum test_mode mode)
+{
+  switch (mode)
+    {
+    case byname:
+      return "byname";
+    case byname2:
+      return "byname2";
+    case gai:
+      return "gai";
+    case gai_canon:
+      return "gai_canon";
+    case test_mode_num:
+      break;                    /* Report error below.  */
+    }
+  FAIL_EXIT1 ("invalid test_mode: %d", mode);
+}
+
+/* Append the name and aliases to OUT.  */
+static void
+append_names (FILE *out, const char *qname, int bits, int count,
+              enum test_mode mode)
+{
+  /* Largest valid index which has a corresponding zero in bits
+     (meaning a syntactically valid CNAME).  */
+  int last_valid_cname = -1;
+
+  for (int i = 0; i < count; ++i)
+    if ((bits & (3 << (i * 2))) == 0)
+      last_valid_cname = i;
+
+  if (mode != gai)
+    {
+      const char *label;
+      if (mode == gai_canon)
+        label = "canonname";
+      else
+        label = "name";
+      if (last_valid_cname >= 0)
+        fprintf (out, "%s: unique%d.example\n", label, last_valid_cname);
+      else
+        fprintf (out, "%s: %s\n", label, qname);
+    }
+
+  if (mode == byname || mode == byname2)
+    {
+      if (last_valid_cname >= 0)
+        fprintf (out, "alias: %s\n", qname);
+      for (int i = 0; i < count; ++i)
+        {
+          if ((bits & (3 << (i * 2))) == 0 && i != last_valid_cname)
+            fprintf (out, "alias: unique%d.example\n", i);
+        }
+    }
+}
+
+/* Append the address information to OUT.  */
+static void
+append_addresses (FILE *out, int af, int bits, int count, enum test_mode mode)
+{
+  int last = count * 256 + bits;
+  if (mode == gai || mode == gai_canon)
+    {
+      if (af == AF_INET || af == AF_UNSPEC)
+        fprintf (out, "address: STREAM/TCP 192.168.%d.%d 80\n", count, bits);
+      if (af == AF_INET6 || af == AF_UNSPEC)
+        {
+          if (last == 0)
+            fprintf (out, "address: STREAM/TCP 2001:db8:: 80\n");
+          else
+            fprintf (out, "address: STREAM/TCP 2001:db8::%x 80\n", last);
+        }
+    }
+  else
+    {
+      TEST_VERIFY (af != AF_UNSPEC);
+      if (af == AF_INET)
+        fprintf (out, "address: 192.168.%d.%d\n", count, bits);
+      if (af == AF_INET6)
+        {
+          if (last == 0)
+            fprintf (out, "address: 2001:db8::\n");
+          else
+            fprintf (out, "address: 2001:db8::%x\n", last);
+        }
+    }
+}
+
+/* Perform one test using a forward lookup.  */
+static void
+check_forward (int af, int bits, int count, enum test_mode mode)
+{
+  char *qname = xasprintf ("bits%d.count%d.example", bits, count);
+  char *label = xasprintf ("af=%d bits=%d count=%d mode=%s qname=%s",
+                           af, bits, count, test_mode_to_string (mode), qname);
+
+  struct xmemstream expected;
+  xopen_memstream (&expected);
+  if (mode == gai_canon)
+    fprintf (expected.out, "flags: AI_CANONNAME\n");
+  append_names (expected.out, qname, bits, count, mode);
+  append_addresses (expected.out, af, bits, count, mode);
+  xfclose_memstream (&expected);
+
+  if (mode == gai || mode == gai_canon)
+    {
+      struct addrinfo *ai;
+      struct addrinfo hints =
+        {
+          .ai_family = af,
+          .ai_socktype = SOCK_STREAM,
+        };
+      if (mode == gai_canon)
+        hints.ai_flags |= AI_CANONNAME;
+      int ret = getaddrinfo (qname, "80", &hints, &ai);
+      check_addrinfo (label, ai, ret, expected.buffer);
+      if (ret == 0)
+        freeaddrinfo (ai);
+    }
+  else
+    {
+      struct hostent *e;
+      if (mode == gai)
+        {
+          TEST_COMPARE (af, AF_INET);
+          e = gethostbyname (qname);
+        }
+      else
+        {
+          if (af != AF_INET)
+            TEST_COMPARE (af, AF_INET6);
+          e = gethostbyname2 (qname, af);
+        }
+      check_hostent (label, e, expected.buffer);
+    }
+
+  free (expected.buffer);
+  free (label);
+  free (qname);
+}
+
+/* Perform one check using a reverse lookup.  */
+
+static void
+check_reverse (int af, int bits, int count)
+{
+  TEST_VERIFY (af == AF_INET || af == AF_INET6);
+
+  char *label = xasprintf ("af=%d bits=%d count=%d", af, bits, count);
+  char *fqdn = xasprintf ("bits%d.count%d.example", bits, count);
+
+  struct xmemstream expected;
+  xopen_memstream (&expected);
+  fprintf (expected.out, "name: %s\n", fqdn);
+  append_addresses (expected.out, af, bits, count, byname);
+  xfclose_memstream (&expected);
+
+  char addr[16] = { 0 };
+  socklen_t addrlen;
+  if (af == AF_INET)
+    {
+      addr[0] = 192;
+      addr[1] = 168;
+      addr[2] = count;
+      addr[3] = bits;
+      addrlen = 4;
+    }
+  else
+    {
+      addr[0] = 0x20;
+      addr[1] = 0x01;
+      addr[2] = 0x0d;
+      addr[3] = 0xb8;
+      addr[14] = count;
+      addr[15] = bits;
+      addrlen = 16;
+    }
+
+  struct hostent *e = gethostbyaddr (addr, addrlen, af);
+  check_hostent (label, e, expected.buffer);
+
+  /* getnameinfo check is different.  There is no generic check_*
+     function for it.  */
+  {
+    struct sockaddr_in sin = { };
+    struct sockaddr_in6 sin6 = { };
+    void *sa;
+    socklen_t salen;
+    if (af == AF_INET)
+      {
+        sin.sin_family = AF_INET;
+        memcpy (&sin.sin_addr, addr, addrlen);
+        sin.sin_port = htons (80);
+        sa = &sin;
+        salen = sizeof (sin);
+      }
+    else
+      {
+        sin6.sin6_family = AF_INET6;
+        memcpy (&sin6.sin6_addr, addr, addrlen);
+        sin6.sin6_port = htons (80);
+        sa = &sin6;
+        salen = sizeof (sin6);
+      }
+
+    char host[64];
+    char service[64];
+    int ret = getnameinfo (sa, salen, host,
+                           sizeof (host), service, sizeof (service),
+                           NI_NAMEREQD | NI_NUMERICSERV);
+    TEST_COMPARE (ret, 0);
+    TEST_COMPARE_STRING (host, fqdn);
+    TEST_COMPARE_STRING (service, "80");
+  }
+
+  free (expected.buffer);
+  free (fqdn);
+  free (label);
+}
+
+static int
+do_test (void)
+{
+  struct resolv_test *obj = resolv_test_start
+    ((struct resolv_redirect_config)
+     {
+       .response_callback = response
+     });
+
+  for (int count = 0; count <= 3; ++count)
+    for (int bits = 0; bits <= 1 << (count * 2); ++bits)
+      {
+        if (count > 0 && bits == count)
+          /* The last bits value is only checked if count == 0.  */
+          continue;
+
+        for (enum test_mode mode = 0; mode < test_mode_num; ++mode)
+          {
+            check_forward (AF_INET, bits, count, mode);
+            if (mode != byname)
+              check_forward (AF_INET6, bits, count, mode);
+            if (mode == gai || mode == gai_canon)
+              check_forward (AF_UNSPEC, bits, count, mode);
+          }
+
+        check_reverse (AF_INET, bits, count);
+        check_reverse (AF_INET6, bits, count);
+      }
+
+  resolv_test_end (obj);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/resolv/tst-resolv-maybe_insert_sig.h b/resolv/tst-resolv-maybe_insert_sig.h
new file mode 100644
index 000000000..05725225a
--- /dev/null
+++ b/resolv/tst-resolv-maybe_insert_sig.h
@@ -0,0 +1,32 @@
+/* Code snippet for optionally inserting ignored SIG records in resolver tests.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Set to true for an alternative pass that inserts (ignored) SIG
+   records.  This does not alter the response, so this property is not
+   encoded in the QNAME.  The variable needs to be volatile because
+   leaf attributes tell GCC that the response function is not
+   called.  */
+static volatile bool insert_sig;
+
+static void
+maybe_insert_sig (struct resolv_response_builder *b, const char *owner)
+{
+  resolv_response_open_record (b, owner, C_IN, T_SIG, 60);
+  resolv_response_add_data (b, "", 1);
+  resolv_response_close_record (b);
+}
diff --git a/scripts/dso-ordering-test.py b/scripts/dso-ordering-test.py
index 0b526aff4..b479ee391 100644
--- a/scripts/dso-ordering-test.py
+++ b/scripts/dso-ordering-test.py
@@ -707,13 +707,12 @@ def process_testcase(t):
                 "\t$(compile.c) $(OUTPUT_OPTION)\n")
         makefile.write (rule)
 
-        not_depended_objs = find_objs_not_depended_on(test_descr)
-        if not_depended_objs:
-            depstr = ""
-            for dep in not_depended_objs:
-                depstr += (" $(objpfx)" + test_subdir + "/"
-                           + test_name + "-" + dep + ".so")
-            makefile.write("$(objpfx)%s.out:%s\n" % (base_test_name, depstr))
+        # Ensure that all shared objects are built before running the
+        # test, whether there link-time dependencies or not.
+        depobjs = ["$(objpfx){}/{}-{}.so".format(test_subdir, test_name, dep)
+                   for dep in test_descr.objs]
+        makefile.write("$(objpfx){}.out: {}\n".format(
+            base_test_name, " ".join(depobjs)))
 
         # Add main executable to test-srcs
         makefile.write("test-srcs += %s/%s\n" % (test_subdir, test_name))
diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py
new file mode 100644
index 000000000..da0d5380f
--- /dev/null
+++ b/scripts/glibcelf.py
@@ -0,0 +1,1141 @@
+#!/usr/bin/python3
+# ELF support functionality for Python.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Basic ELF parser.
+
+Use Image.readfile(path) to read an ELF file into memory and begin
+parsing it.
+
+"""
+
+import collections
+import enum
+import struct
+
+if not hasattr(enum, 'IntFlag'):
+    import sys
+    sys.stdout.write(
+        'warning: glibcelf.py needs Python 3.6 for enum support\n')
+    sys.exit(77)
+
+class _OpenIntEnum(enum.IntEnum):
+    """Integer enumeration that supports arbitrary int values."""
+    @classmethod
+    def _missing_(cls, value):
+        # See enum.IntFlag._create_pseudo_member_.  This allows
+        # creating of enum constants with arbitrary integer values.
+        pseudo_member = int.__new__(cls, value)
+        pseudo_member._name_ = None
+        pseudo_member._value_ = value
+        return pseudo_member
+
+    def __repr__(self):
+        name = self._name_
+        if name is not None:
+            # The names have prefixes like SHT_, implying their type.
+            return name
+        return '{}({})'.format(self.__class__.__name__, self._value_)
+
+    def __str__(self):
+        name = self._name_
+        if name is not None:
+            return name
+        return str(self._value_)
+
+class ElfClass(_OpenIntEnum):
+    """ELF word size.  Type of EI_CLASS values."""
+    ELFCLASSNONE = 0
+    ELFCLASS32 = 1
+    ELFCLASS64 = 2
+
+class ElfData(_OpenIntEnum):
+    """ELF endianess.  Type of EI_DATA values."""
+    ELFDATANONE = 0
+    ELFDATA2LSB = 1
+    ELFDATA2MSB = 2
+
+class Machine(_OpenIntEnum):
+    """ELF machine type.  Type of values in Ehdr.e_machine field."""
+    EM_NONE = 0
+    EM_M32 = 1
+    EM_SPARC = 2
+    EM_386 = 3
+    EM_68K = 4
+    EM_88K = 5
+    EM_IAMCU = 6
+    EM_860 = 7
+    EM_MIPS = 8
+    EM_S370 = 9
+    EM_MIPS_RS3_LE = 10
+    EM_PARISC = 15
+    EM_VPP500 = 17
+    EM_SPARC32PLUS = 18
+    EM_960 = 19
+    EM_PPC = 20
+    EM_PPC64 = 21
+    EM_S390 = 22
+    EM_SPU = 23
+    EM_V800 = 36
+    EM_FR20 = 37
+    EM_RH32 = 38
+    EM_RCE = 39
+    EM_ARM = 40
+    EM_FAKE_ALPHA = 41
+    EM_SH = 42
+    EM_SPARCV9 = 43
+    EM_TRICORE = 44
+    EM_ARC = 45
+    EM_H8_300 = 46
+    EM_H8_300H = 47
+    EM_H8S = 48
+    EM_H8_500 = 49
+    EM_IA_64 = 50
+    EM_MIPS_X = 51
+    EM_COLDFIRE = 52
+    EM_68HC12 = 53
+    EM_MMA = 54
+    EM_PCP = 55
+    EM_NCPU = 56
+    EM_NDR1 = 57
+    EM_STARCORE = 58
+    EM_ME16 = 59
+    EM_ST100 = 60
+    EM_TINYJ = 61
+    EM_X86_64 = 62
+    EM_PDSP = 63
+    EM_PDP10 = 64
+    EM_PDP11 = 65
+    EM_FX66 = 66
+    EM_ST9PLUS = 67
+    EM_ST7 = 68
+    EM_68HC16 = 69
+    EM_68HC11 = 70
+    EM_68HC08 = 71
+    EM_68HC05 = 72
+    EM_SVX = 73
+    EM_ST19 = 74
+    EM_VAX = 75
+    EM_CRIS = 76
+    EM_JAVELIN = 77
+    EM_FIREPATH = 78
+    EM_ZSP = 79
+    EM_MMIX = 80
+    EM_HUANY = 81
+    EM_PRISM = 82
+    EM_AVR = 83
+    EM_FR30 = 84
+    EM_D10V = 85
+    EM_D30V = 86
+    EM_V850 = 87
+    EM_M32R = 88
+    EM_MN10300 = 89
+    EM_MN10200 = 90
+    EM_PJ = 91
+    EM_OPENRISC = 92
+    EM_ARC_COMPACT = 93
+    EM_XTENSA = 94
+    EM_VIDEOCORE = 95
+    EM_TMM_GPP = 96
+    EM_NS32K = 97
+    EM_TPC = 98
+    EM_SNP1K = 99
+    EM_ST200 = 100
+    EM_IP2K = 101
+    EM_MAX = 102
+    EM_CR = 103
+    EM_F2MC16 = 104
+    EM_MSP430 = 105
+    EM_BLACKFIN = 106
+    EM_SE_C33 = 107
+    EM_SEP = 108
+    EM_ARCA = 109
+    EM_UNICORE = 110
+    EM_EXCESS = 111
+    EM_DXP = 112
+    EM_ALTERA_NIOS2 = 113
+    EM_CRX = 114
+    EM_XGATE = 115
+    EM_C166 = 116
+    EM_M16C = 117
+    EM_DSPIC30F = 118
+    EM_CE = 119
+    EM_M32C = 120
+    EM_TSK3000 = 131
+    EM_RS08 = 132
+    EM_SHARC = 133
+    EM_ECOG2 = 134
+    EM_SCORE7 = 135
+    EM_DSP24 = 136
+    EM_VIDEOCORE3 = 137
+    EM_LATTICEMICO32 = 138
+    EM_SE_C17 = 139
+    EM_TI_C6000 = 140
+    EM_TI_C2000 = 141
+    EM_TI_C5500 = 142
+    EM_TI_ARP32 = 143
+    EM_TI_PRU = 144
+    EM_MMDSP_PLUS = 160
+    EM_CYPRESS_M8C = 161
+    EM_R32C = 162
+    EM_TRIMEDIA = 163
+    EM_QDSP6 = 164
+    EM_8051 = 165
+    EM_STXP7X = 166
+    EM_NDS32 = 167
+    EM_ECOG1X = 168
+    EM_MAXQ30 = 169
+    EM_XIMO16 = 170
+    EM_MANIK = 171
+    EM_CRAYNV2 = 172
+    EM_RX = 173
+    EM_METAG = 174
+    EM_MCST_ELBRUS = 175
+    EM_ECOG16 = 176
+    EM_CR16 = 177
+    EM_ETPU = 178
+    EM_SLE9X = 179
+    EM_L10M = 180
+    EM_K10M = 181
+    EM_AARCH64 = 183
+    EM_AVR32 = 185
+    EM_STM8 = 186
+    EM_TILE64 = 187
+    EM_TILEPRO = 188
+    EM_MICROBLAZE = 189
+    EM_CUDA = 190
+    EM_TILEGX = 191
+    EM_CLOUDSHIELD = 192
+    EM_COREA_1ST = 193
+    EM_COREA_2ND = 194
+    EM_ARCV2 = 195
+    EM_OPEN8 = 196
+    EM_RL78 = 197
+    EM_VIDEOCORE5 = 198
+    EM_78KOR = 199
+    EM_56800EX = 200
+    EM_BA1 = 201
+    EM_BA2 = 202
+    EM_XCORE = 203
+    EM_MCHP_PIC = 204
+    EM_INTELGT = 205
+    EM_KM32 = 210
+    EM_KMX32 = 211
+    EM_EMX16 = 212
+    EM_EMX8 = 213
+    EM_KVARC = 214
+    EM_CDP = 215
+    EM_COGE = 216
+    EM_COOL = 217
+    EM_NORC = 218
+    EM_CSR_KALIMBA = 219
+    EM_Z80 = 220
+    EM_VISIUM = 221
+    EM_FT32 = 222
+    EM_MOXIE = 223
+    EM_AMDGPU = 224
+    EM_RISCV = 243
+    EM_BPF = 247
+    EM_CSKY = 252
+    EM_NUM = 253
+    EM_ALPHA = 0x9026
+
+class Et(_OpenIntEnum):
+    """ELF file type.  Type of ET_* values and the Ehdr.e_type field."""
+    ET_NONE = 0
+    ET_REL = 1
+    ET_EXEC = 2
+    ET_DYN = 3
+    ET_CORE = 4
+
+class Shn(_OpenIntEnum):
+    """ELF reserved section indices."""
+    SHN_UNDEF = 0
+    SHN_BEFORE = 0xff00
+    SHN_AFTER = 0xff01
+    SHN_ABS = 0xfff1
+    SHN_COMMON = 0xfff2
+    SHN_XINDEX = 0xffff
+
+class ShnMIPS(enum.Enum):
+    """Supplemental SHN_* constants for EM_MIPS."""
+    SHN_MIPS_ACOMMON = 0xff00
+    SHN_MIPS_TEXT = 0xff01
+    SHN_MIPS_DATA = 0xff02
+    SHN_MIPS_SCOMMON = 0xff03
+    SHN_MIPS_SUNDEFINED = 0xff04
+
+class ShnPARISC(enum.Enum):
+    """Supplemental SHN_* constants for EM_PARISC."""
+    SHN_PARISC_ANSI_COMMON = 0xff00
+    SHN_PARISC_HUGE_COMMON = 0xff01
+
+class Sht(_OpenIntEnum):
+    """ELF section types.  Type of SHT_* values."""
+    SHT_NULL = 0
+    SHT_PROGBITS = 1
+    SHT_SYMTAB = 2
+    SHT_STRTAB = 3
+    SHT_RELA = 4
+    SHT_HASH = 5
+    SHT_DYNAMIC = 6
+    SHT_NOTE = 7
+    SHT_NOBITS = 8
+    SHT_REL = 9
+    SHT_SHLIB = 10
+    SHT_DYNSYM = 11
+    SHT_INIT_ARRAY = 14
+    SHT_FINI_ARRAY = 15
+    SHT_PREINIT_ARRAY = 16
+    SHT_GROUP = 17
+    SHT_SYMTAB_SHNDX = 18
+    SHT_GNU_ATTRIBUTES = 0x6ffffff5
+    SHT_GNU_HASH = 0x6ffffff6
+    SHT_GNU_LIBLIST = 0x6ffffff7
+    SHT_CHECKSUM = 0x6ffffff8
+    SHT_SUNW_move = 0x6ffffffa
+    SHT_SUNW_COMDAT = 0x6ffffffb
+    SHT_SUNW_syminfo = 0x6ffffffc
+    SHT_GNU_verdef = 0x6ffffffd
+    SHT_GNU_verneed = 0x6ffffffe
+    SHT_GNU_versym = 0x6fffffff
+
+class ShtALPHA(enum.Enum):
+    """Supplemental SHT_* constants for EM_ALPHA."""
+    SHT_ALPHA_DEBUG = 0x70000001
+    SHT_ALPHA_REGINFO = 0x70000002
+
+class ShtARM(enum.Enum):
+    """Supplemental SHT_* constants for EM_ARM."""
+    SHT_ARM_EXIDX = 0x70000001
+    SHT_ARM_PREEMPTMAP = 0x70000002
+    SHT_ARM_ATTRIBUTES = 0x70000003
+
+class ShtCSKY(enum.Enum):
+    """Supplemental SHT_* constants for EM_CSKY."""
+    SHT_CSKY_ATTRIBUTES = 0x70000001
+
+class ShtIA_64(enum.Enum):
+    """Supplemental SHT_* constants for EM_IA_64."""
+    SHT_IA_64_EXT = 0x70000000
+    SHT_IA_64_UNWIND = 0x70000001
+
+class ShtMIPS(enum.Enum):
+    """Supplemental SHT_* constants for EM_MIPS."""
+    SHT_MIPS_LIBLIST = 0x70000000
+    SHT_MIPS_MSYM = 0x70000001
+    SHT_MIPS_CONFLICT = 0x70000002
+    SHT_MIPS_GPTAB = 0x70000003
+    SHT_MIPS_UCODE = 0x70000004
+    SHT_MIPS_DEBUG = 0x70000005
+    SHT_MIPS_REGINFO = 0x70000006
+    SHT_MIPS_PACKAGE = 0x70000007
+    SHT_MIPS_PACKSYM = 0x70000008
+    SHT_MIPS_RELD = 0x70000009
+    SHT_MIPS_IFACE = 0x7000000b
+    SHT_MIPS_CONTENT = 0x7000000c
+    SHT_MIPS_OPTIONS = 0x7000000d
+    SHT_MIPS_SHDR = 0x70000010
+    SHT_MIPS_FDESC = 0x70000011
+    SHT_MIPS_EXTSYM = 0x70000012
+    SHT_MIPS_DENSE = 0x70000013
+    SHT_MIPS_PDESC = 0x70000014
+    SHT_MIPS_LOCSYM = 0x70000015
+    SHT_MIPS_AUXSYM = 0x70000016
+    SHT_MIPS_OPTSYM = 0x70000017
+    SHT_MIPS_LOCSTR = 0x70000018
+    SHT_MIPS_LINE = 0x70000019
+    SHT_MIPS_RFDESC = 0x7000001a
+    SHT_MIPS_DELTASYM = 0x7000001b
+    SHT_MIPS_DELTAINST = 0x7000001c
+    SHT_MIPS_DELTACLASS = 0x7000001d
+    SHT_MIPS_DWARF = 0x7000001e
+    SHT_MIPS_DELTADECL = 0x7000001f
+    SHT_MIPS_SYMBOL_LIB = 0x70000020
+    SHT_MIPS_EVENTS = 0x70000021
+    SHT_MIPS_TRANSLATE = 0x70000022
+    SHT_MIPS_PIXIE = 0x70000023
+    SHT_MIPS_XLATE = 0x70000024
+    SHT_MIPS_XLATE_DEBUG = 0x70000025
+    SHT_MIPS_WHIRL = 0x70000026
+    SHT_MIPS_EH_REGION = 0x70000027
+    SHT_MIPS_XLATE_OLD = 0x70000028
+    SHT_MIPS_PDR_EXCEPTION = 0x70000029
+    SHT_MIPS_XHASH = 0x7000002b
+
+class ShtPARISC(enum.Enum):
+    """Supplemental SHT_* constants for EM_PARISC."""
+    SHT_PARISC_EXT = 0x70000000
+    SHT_PARISC_UNWIND = 0x70000001
+    SHT_PARISC_DOC = 0x70000002
+
+class Pf(enum.IntFlag):
+    """Program header flags.  Type of Phdr.p_flags values."""
+    PF_X = 1
+    PF_W = 2
+    PF_R = 4
+
+class PfARM(enum.IntFlag):
+    """Supplemental PF_* flags for EM_ARM."""
+    PF_ARM_SB = 0x10000000
+    PF_ARM_PI = 0x20000000
+    PF_ARM_ABS = 0x40000000
+
+class PfPARISC(enum.IntFlag):
+    """Supplemental PF_* flags for EM_PARISC."""
+    PF_HP_PAGE_SIZE = 0x00100000
+    PF_HP_FAR_SHARED = 0x00200000
+    PF_HP_NEAR_SHARED = 0x00400000
+    PF_HP_CODE = 0x01000000
+    PF_HP_MODIFY = 0x02000000
+    PF_HP_LAZYSWAP = 0x04000000
+    PF_HP_SBP = 0x08000000
+
+class PfIA_64(enum.IntFlag):
+    """Supplemental PF_* flags for EM_IA_64."""
+    PF_IA_64_NORECOV = 0x80000000
+
+class PfMIPS(enum.IntFlag):
+    """Supplemental PF_* flags for EM_MIPS."""
+    PF_MIPS_LOCAL = 0x10000000
+
+class Shf(enum.IntFlag):
+    """Section flags.  Type of Shdr.sh_type values."""
+    SHF_WRITE = 1 << 0
+    SHF_ALLOC = 1 << 1
+    SHF_EXECINSTR = 1 << 2
+    SHF_MERGE = 1 << 4
+    SHF_STRINGS = 1 << 5
+    SHF_INFO_LINK = 1 << 6
+    SHF_LINK_ORDER = 1 << 7
+    SHF_OS_NONCONFORMING = 256
+    SHF_GROUP = 1 << 9
+    SHF_TLS = 1 << 10
+    SHF_COMPRESSED = 1 << 11
+    SHF_GNU_RETAIN = 1 << 21
+    SHF_ORDERED = 1 << 30
+    SHF_EXCLUDE = 1 << 31
+
+class ShfALPHA(enum.IntFlag):
+    """Supplemental SHF_* constants for EM_ALPHA."""
+    SHF_ALPHA_GPREL = 0x10000000
+
+class ShfARM(enum.IntFlag):
+    """Supplemental SHF_* constants for EM_ARM."""
+    SHF_ARM_ENTRYSECT = 0x10000000
+    SHF_ARM_COMDEF = 0x80000000
+
+class ShfIA_64(enum.IntFlag):
+    """Supplemental SHF_* constants for EM_IA_64."""
+    SHF_IA_64_SHORT  = 0x10000000
+    SHF_IA_64_NORECOV = 0x20000000
+
+class ShfMIPS(enum.IntFlag):
+    """Supplemental SHF_* constants for EM_MIPS."""
+    SHF_MIPS_GPREL = 0x10000000
+    SHF_MIPS_MERGE = 0x20000000
+    SHF_MIPS_ADDR = 0x40000000
+    SHF_MIPS_STRINGS = 0x80000000
+    SHF_MIPS_NOSTRIP = 0x08000000
+    SHF_MIPS_LOCAL = 0x04000000
+    SHF_MIPS_NAMES = 0x02000000
+    SHF_MIPS_NODUPE = 0x01000000
+
+class ShfPARISC(enum.IntFlag):
+    """Supplemental SHF_* constants for EM_PARISC."""
+    SHF_PARISC_SHORT = 0x20000000
+    SHF_PARISC_HUGE = 0x40000000
+    SHF_PARISC_SBP = 0x80000000
+
+class Stb(_OpenIntEnum):
+    """ELF symbol binding type."""
+    STB_LOCAL = 0
+    STB_GLOBAL = 1
+    STB_WEAK = 2
+    STB_GNU_UNIQUE = 10
+    STB_MIPS_SPLIT_COMMON = 13
+
+class Stt(_OpenIntEnum):
+    """ELF symbol type."""
+    STT_NOTYPE = 0
+    STT_OBJECT = 1
+    STT_FUNC = 2
+    STT_SECTION = 3
+    STT_FILE = 4
+    STT_COMMON = 5
+    STT_TLS = 6
+    STT_GNU_IFUNC = 10
+
+class SttARM(enum.Enum):
+    """Supplemental STT_* constants for EM_ARM."""
+    STT_ARM_TFUNC = 13
+    STT_ARM_16BIT = 15
+
+class SttPARISC(enum.Enum):
+    """Supplemental STT_* constants for EM_PARISC."""
+    STT_HP_OPAQUE = 11
+    STT_HP_STUB = 12
+    STT_PARISC_MILLICODE = 13
+
+class SttSPARC(enum.Enum):
+    """Supplemental STT_* constants for EM_SPARC."""
+    STT_SPARC_REGISTER = 13
+
+class SttX86_64(enum.Enum):
+    """Supplemental STT_* constants for EM_X86_64."""
+    SHT_X86_64_UNWIND = 0x70000001
+
+class Pt(_OpenIntEnum):
+    """ELF program header types.  Type of Phdr.p_type."""
+    PT_NULL = 0
+    PT_LOAD = 1
+    PT_DYNAMIC = 2
+    PT_INTERP = 3
+    PT_NOTE = 4
+    PT_SHLIB = 5
+    PT_PHDR = 6
+    PT_TLS = 7
+    PT_NUM = 8
+    PT_GNU_EH_FRAME = 0x6474e550
+    PT_GNU_STACK = 0x6474e551
+    PT_GNU_RELRO = 0x6474e552
+    PT_GNU_PROPERTY = 0x6474e553
+    PT_SUNWBSS = 0x6ffffffa
+    PT_SUNWSTACK = 0x6ffffffb
+
+class PtARM(enum.Enum):
+    """Supplemental PT_* constants for EM_ARM."""
+    PT_ARM_EXIDX = 0x70000001
+
+class PtIA_64(enum.Enum):
+    """Supplemental PT_* constants for EM_IA_64."""
+    PT_IA_64_HP_OPT_ANOT = 0x60000012
+    PT_IA_64_HP_HSL_ANOT = 0x60000013
+    PT_IA_64_HP_STACK = 0x60000014
+    PT_IA_64_ARCHEXT = 0x70000000
+    PT_IA_64_UNWIND = 0x70000001
+
+class PtMIPS(enum.Enum):
+    """Supplemental PT_* constants for EM_MIPS."""
+    PT_MIPS_REGINFO = 0x70000000
+    PT_MIPS_RTPROC = 0x70000001
+    PT_MIPS_OPTIONS = 0x70000002
+    PT_MIPS_ABIFLAGS = 0x70000003
+
+class PtPARISC(enum.Enum):
+    """Supplemental PT_* constants for EM_PARISC."""
+    PT_HP_TLS = 0x60000000
+    PT_HP_CORE_NONE = 0x60000001
+    PT_HP_CORE_VERSION = 0x60000002
+    PT_HP_CORE_KERNEL = 0x60000003
+    PT_HP_CORE_COMM = 0x60000004
+    PT_HP_CORE_PROC = 0x60000005
+    PT_HP_CORE_LOADABLE = 0x60000006
+    PT_HP_CORE_STACK = 0x60000007
+    PT_HP_CORE_SHM = 0x60000008
+    PT_HP_CORE_MMF = 0x60000009
+    PT_HP_PARALLEL = 0x60000010
+    PT_HP_FASTBIND = 0x60000011
+    PT_HP_OPT_ANNOT = 0x60000012
+    PT_HP_HSL_ANNOT = 0x60000013
+    PT_HP_STACK = 0x60000014
+    PT_PARISC_ARCHEXT = 0x70000000
+    PT_PARISC_UNWIND = 0x70000001
+
+class Dt(_OpenIntEnum):
+    """ELF dynamic segment tags.  Type of Dyn.d_val."""
+    DT_NULL = 0
+    DT_NEEDED = 1
+    DT_PLTRELSZ = 2
+    DT_PLTGOT = 3
+    DT_HASH = 4
+    DT_STRTAB = 5
+    DT_SYMTAB = 6
+    DT_RELA = 7
+    DT_RELASZ = 8
+    DT_RELAENT = 9
+    DT_STRSZ = 10
+    DT_SYMENT = 11
+    DT_INIT = 12
+    DT_FINI = 13
+    DT_SONAME = 14
+    DT_RPATH = 15
+    DT_SYMBOLIC = 16
+    DT_REL = 17
+    DT_RELSZ = 18
+    DT_RELENT = 19
+    DT_PLTREL = 20
+    DT_DEBUG = 21
+    DT_TEXTREL = 22
+    DT_JMPREL = 23
+    DT_BIND_NOW = 24
+    DT_INIT_ARRAY = 25
+    DT_FINI_ARRAY = 26
+    DT_INIT_ARRAYSZ = 27
+    DT_FINI_ARRAYSZ = 28
+    DT_RUNPATH = 29
+    DT_FLAGS = 30
+    DT_PREINIT_ARRAY = 32
+    DT_PREINIT_ARRAYSZ = 33
+    DT_SYMTAB_SHNDX = 34
+    DT_GNU_PRELINKED = 0x6ffffdf5
+    DT_GNU_CONFLICTSZ = 0x6ffffdf6
+    DT_GNU_LIBLISTSZ = 0x6ffffdf7
+    DT_CHECKSUM = 0x6ffffdf8
+    DT_PLTPADSZ = 0x6ffffdf9
+    DT_MOVEENT = 0x6ffffdfa
+    DT_MOVESZ = 0x6ffffdfb
+    DT_FEATURE_1 = 0x6ffffdfc
+    DT_POSFLAG_1 = 0x6ffffdfd
+    DT_SYMINSZ = 0x6ffffdfe
+    DT_SYMINENT = 0x6ffffdff
+    DT_GNU_HASH = 0x6ffffef5
+    DT_TLSDESC_PLT = 0x6ffffef6
+    DT_TLSDESC_GOT = 0x6ffffef7
+    DT_GNU_CONFLICT = 0x6ffffef8
+    DT_GNU_LIBLIST = 0x6ffffef9
+    DT_CONFIG = 0x6ffffefa
+    DT_DEPAUDIT = 0x6ffffefb
+    DT_AUDIT = 0x6ffffefc
+    DT_PLTPAD = 0x6ffffefd
+    DT_MOVETAB = 0x6ffffefe
+    DT_SYMINFO = 0x6ffffeff
+    DT_VERSYM = 0x6ffffff0
+    DT_RELACOUNT = 0x6ffffff9
+    DT_RELCOUNT = 0x6ffffffa
+    DT_FLAGS_1 = 0x6ffffffb
+    DT_VERDEF = 0x6ffffffc
+    DT_VERDEFNUM = 0x6ffffffd
+    DT_VERNEED = 0x6ffffffe
+    DT_VERNEEDNUM = 0x6fffffff
+    DT_AUXILIARY = 0x7ffffffd
+    DT_FILTER = 0x7fffffff
+
+class DtAARCH64(enum.Enum):
+    """Supplemental DT_* constants for EM_AARCH64."""
+    DT_AARCH64_BTI_PLT = 0x70000001
+    DT_AARCH64_PAC_PLT = 0x70000003
+    DT_AARCH64_VARIANT_PCS = 0x70000005
+
+class DtALPHA(enum.Enum):
+    """Supplemental DT_* constants for EM_ALPHA."""
+    DT_ALPHA_PLTRO = 0x70000000
+
+class DtALTERA_NIOS2(enum.Enum):
+    """Supplemental DT_* constants for EM_ALTERA_NIOS2."""
+    DT_NIOS2_GP = 0x70000002
+
+class DtIA_64(enum.Enum):
+    """Supplemental DT_* constants for EM_IA_64."""
+    DT_IA_64_PLT_RESERVE = 0x70000000
+
+class DtMIPS(enum.Enum):
+    """Supplemental DT_* constants for EM_MIPS."""
+    DT_MIPS_RLD_VERSION = 0x70000001
+    DT_MIPS_TIME_STAMP = 0x70000002
+    DT_MIPS_ICHECKSUM = 0x70000003
+    DT_MIPS_IVERSION = 0x70000004
+    DT_MIPS_FLAGS = 0x70000005
+    DT_MIPS_BASE_ADDRESS = 0x70000006
+    DT_MIPS_MSYM = 0x70000007
+    DT_MIPS_CONFLICT = 0x70000008
+    DT_MIPS_LIBLIST = 0x70000009
+    DT_MIPS_LOCAL_GOTNO = 0x7000000a
+    DT_MIPS_CONFLICTNO = 0x7000000b
+    DT_MIPS_LIBLISTNO = 0x70000010
+    DT_MIPS_SYMTABNO = 0x70000011
+    DT_MIPS_UNREFEXTNO = 0x70000012
+    DT_MIPS_GOTSYM = 0x70000013
+    DT_MIPS_HIPAGENO = 0x70000014
+    DT_MIPS_RLD_MAP = 0x70000016
+    DT_MIPS_DELTA_CLASS = 0x70000017
+    DT_MIPS_DELTA_CLASS_NO = 0x70000018
+    DT_MIPS_DELTA_INSTANCE = 0x70000019
+    DT_MIPS_DELTA_INSTANCE_NO = 0x7000001a
+    DT_MIPS_DELTA_RELOC = 0x7000001b
+    DT_MIPS_DELTA_RELOC_NO = 0x7000001c
+    DT_MIPS_DELTA_SYM = 0x7000001d
+    DT_MIPS_DELTA_SYM_NO = 0x7000001e
+    DT_MIPS_DELTA_CLASSSYM = 0x70000020
+    DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021
+    DT_MIPS_CXX_FLAGS = 0x70000022
+    DT_MIPS_PIXIE_INIT = 0x70000023
+    DT_MIPS_SYMBOL_LIB = 0x70000024
+    DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025
+    DT_MIPS_LOCAL_GOTIDX = 0x70000026
+    DT_MIPS_HIDDEN_GOTIDX = 0x70000027
+    DT_MIPS_PROTECTED_GOTIDX = 0x70000028
+    DT_MIPS_OPTIONS = 0x70000029
+    DT_MIPS_INTERFACE = 0x7000002a
+    DT_MIPS_DYNSTR_ALIGN = 0x7000002b
+    DT_MIPS_INTERFACE_SIZE = 0x7000002c
+    DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002d
+    DT_MIPS_PERF_SUFFIX = 0x7000002e
+    DT_MIPS_COMPACT_SIZE = 0x7000002f
+    DT_MIPS_GP_VALUE = 0x70000030
+    DT_MIPS_AUX_DYNAMIC = 0x70000031
+    DT_MIPS_PLTGOT = 0x70000032
+    DT_MIPS_RWPLT = 0x70000034
+    DT_MIPS_RLD_MAP_REL = 0x70000035
+    DT_MIPS_XHASH = 0x70000036
+
+class DtPPC(enum.Enum):
+    """Supplemental DT_* constants for EM_PPC."""
+    DT_PPC_GOT = 0x70000000
+    DT_PPC_OPT = 0x70000001
+
+class DtPPC64(enum.Enum):
+    """Supplemental DT_* constants for EM_PPC64."""
+    DT_PPC64_GLINK = 0x70000000
+    DT_PPC64_OPD = 0x70000001
+    DT_PPC64_OPDSZ = 0x70000002
+    DT_PPC64_OPT = 0x70000003
+
+class DtSPARC(enum.Enum):
+    """Supplemental DT_* constants for EM_SPARC."""
+    DT_SPARC_REGISTER = 0x70000001
+
+class StInfo:
+    """ELF symbol binding and type.  Type of the Sym.st_info field."""
+    def __init__(self, arg0, arg1=None):
+        if isinstance(arg0, int) and arg1 is None:
+            self.bind = Stb(arg0 >> 4)
+            self.type = Stt(arg0 & 15)
+        else:
+            self.bind = Stb(arg0)
+            self.type = Stt(arg1)
+
+    def value(self):
+        """Returns the raw value for the bind/type combination."""
+        return (self.bind.value() << 4) | (self.type.value())
+
+# Type in an ELF file.  Used for deserialization.
+_Layout = collections.namedtuple('_Layout', 'unpack size')
+
+def _define_layouts(baseclass: type, layout32: str, layout64: str,
+                    types=None, fields32=None):
+    """Assign variants dict to baseclass.
+
+    The variants dict is indexed by (ElfClass, ElfData) pairs, and its
+    values are _Layout instances.
+
+    """
+    struct32 = struct.Struct(layout32)
+    struct64 = struct.Struct(layout64)
+
+    # Check that the struct formats yield the right number of components.
+    for s in (struct32, struct64):
+        example = s.unpack(b' ' * s.size)
+        if len(example) != len(baseclass._fields):
+            raise ValueError('{!r} yields wrong field count: {} != {}'.format(
+                s.format, len(example),  len(baseclass._fields)))
+
+    # Check that field names in types are correct.
+    if types is None:
+        types = ()
+    for n in types:
+        if n not in baseclass._fields:
+            raise ValueError('{} does not have field {!r}'.format(
+                baseclass.__name__, n))
+
+    if fields32 is not None \
+       and set(fields32) != set(baseclass._fields):
+        raise ValueError('{!r} is not a permutation of the fields {!r}'.format(
+            fields32, baseclass._fields))
+
+    def unique_name(name, used_names = (set((baseclass.__name__,))
+                                        | set(baseclass._fields)
+                                        | {n.__name__
+                                           for n in (types or {}).values()})):
+        """Find a name that is not used for a class or field name."""
+        candidate = name
+        n = 0
+        while candidate in used_names:
+            n += 1
+            candidate = '{}{}'.format(name, n)
+        used_names.add(candidate)
+        return candidate
+
+    blob_name = unique_name('blob')
+    struct_unpack_name = unique_name('struct_unpack')
+    comps_name = unique_name('comps')
+
+    layouts = {}
+    for (bits, elfclass, layout, fields) in (
+            (32, ElfClass.ELFCLASS32, layout32, fields32),
+            (64, ElfClass.ELFCLASS64, layout64, None),
+    ):
+        for (elfdata, structprefix, funcsuffix) in (
+                (ElfData.ELFDATA2LSB, '<', 'LE'),
+                (ElfData.ELFDATA2MSB, '>', 'BE'),
+        ):
+            env = {
+                baseclass.__name__: baseclass,
+                struct_unpack_name: struct.unpack,
+            }
+
+            # Add the type converters.
+            if types:
+                for cls in types.values():
+                    env[cls.__name__] = cls
+
+            funcname = ''.join(
+                ('unpack_', baseclass.__name__, str(bits), funcsuffix))
+
+            code = '''
+def {funcname}({blob_name}):
+'''.format(funcname=funcname, blob_name=blob_name)
+
+            indent = ' ' * 4
+            unpack_call = '{}({!r}, {})'.format(
+                struct_unpack_name, structprefix + layout, blob_name)
+            field_names = ', '.join(baseclass._fields)
+            if types is None and fields is None:
+                code += '{}return {}({})\n'.format(
+                    indent, baseclass.__name__, unpack_call)
+            else:
+                # Destructuring tuple assignment.
+                if fields is None:
+                    code += '{}{} = {}\n'.format(
+                        indent, field_names, unpack_call)
+                else:
+                    # Use custom field order.
+                    code += '{}{} = {}\n'.format(
+                        indent, ', '.join(fields), unpack_call)
+
+                # Perform the type conversions.
+                for n in baseclass._fields:
+                    if n in types:
+                        code += '{}{} = {}({})\n'.format(
+                            indent, n, types[n].__name__, n)
+                # Create the named tuple.
+                code += '{}return {}({})\n'.format(
+                    indent, baseclass.__name__, field_names)
+
+            exec(code, env)
+            layouts[(elfclass, elfdata)] = _Layout(
+                env[funcname], struct.calcsize(layout))
+    baseclass.layouts = layouts
+
+
+# Corresponds to EI_* indices into Elf*_Ehdr.e_indent.
+class Ident(collections.namedtuple('Ident',
+    'ei_mag ei_class ei_data ei_version ei_osabi ei_abiversion ei_pad')):
+
+    def __new__(cls, *args):
+        """Construct an object from a blob or its constituent fields."""
+        if len(args) == 1:
+            return cls.unpack(args[0])
+        return cls.__base__.__new__(cls, *args)
+
+    @staticmethod
+    def unpack(blob: memoryview) -> 'Ident':
+        """Parse raws data into a tuple."""
+        ei_mag, ei_class, ei_data, ei_version, ei_osabi, ei_abiversion, \
+            ei_pad = struct.unpack('4s5B7s', blob)
+        return Ident(ei_mag, ElfClass(ei_class), ElfData(ei_data),
+                     ei_version, ei_osabi, ei_abiversion, ei_pad)
+    size = 16
+
+# Corresponds to Elf32_Ehdr and Elf64_Ehdr.
+Ehdr = collections.namedtuple('Ehdr',
+   'e_ident e_type e_machine e_version e_entry e_phoff e_shoff e_flags'
+    + ' e_ehsize e_phentsize e_phnum e_shentsize e_shnum e_shstrndx')
+_define_layouts(Ehdr,
+                layout32='16s2H5I6H',
+                layout64='16s2HI3QI6H',
+                types=dict(e_ident=Ident,
+                           e_machine=Machine,
+                           e_type=Et,
+                           e_shstrndx=Shn))
+
+# Corresponds to Elf32_Phdr and Elf64_Pdhr.  Order follows the latter.
+Phdr = collections.namedtuple('Phdr',
+    'p_type p_flags p_offset p_vaddr p_paddr p_filesz p_memsz p_align')
+_define_layouts(Phdr,
+                layout32='8I',
+                fields32=('p_type', 'p_offset', 'p_vaddr', 'p_paddr',
+                          'p_filesz', 'p_memsz', 'p_flags', 'p_align'),
+                layout64='2I6Q',
+            types=dict(p_type=Pt, p_flags=Pf))
+
+
+# Corresponds to Elf32_Shdr and Elf64_Shdr.
+class Shdr(collections.namedtuple('Shdr',
+    'sh_name sh_type sh_flags sh_addr sh_offset sh_size sh_link sh_info'
+    + ' sh_addralign sh_entsize')):
+    def resolve(self, strtab: 'StringTable') -> 'Shdr':
+        """Resolve sh_name using a string table."""
+        return self.__class__(strtab.get(self[0]), *self[1:])
+_define_layouts(Shdr,
+                layout32='10I',
+                layout64='2I4Q2I2Q',
+                types=dict(sh_type=Sht,
+                           sh_flags=Shf,
+                           sh_link=Shn))
+
+# Corresponds to Elf32_Dyn and Elf64_Dyn.  The nesting through the
+# d_un union is skipped, and d_ptr is missing (its representation in
+# Python would be identical to d_val).
+Dyn = collections.namedtuple('Dyn', 'd_tag d_val')
+_define_layouts(Dyn,
+                layout32='2i',
+                layout64='2q',
+                types=dict(d_tag=Dt))
+
+# Corresponds to Elf32_Sym and Elf64_Sym.
+class Sym(collections.namedtuple('Sym',
+    'st_name st_info st_other st_shndx st_value st_size')):
+    def resolve(self, strtab: 'StringTable') -> 'Sym':
+        """Resolve st_name using a string table."""
+        return self.__class__(strtab.get(self[0]), *self[1:])
+_define_layouts(Sym,
+                layout32='3I2BH',
+                layout64='I2BH2Q',
+                fields32=('st_name', 'st_value', 'st_size', 'st_info',
+                          'st_other', 'st_shndx'),
+                types=dict(st_shndx=Shn,
+                           st_info=StInfo))
+
+# Corresponds to Elf32_Rel and Elf64_Rel.
+Rel = collections.namedtuple('Rel', 'r_offset r_info')
+_define_layouts(Rel,
+                layout32='2I',
+                layout64='2Q')
+
+# Corresponds to Elf32_Rel and Elf64_Rel.
+Rela = collections.namedtuple('Rela', 'r_offset r_info r_addend')
+_define_layouts(Rela,
+                layout32='3I',
+                layout64='3Q')
+
+class StringTable:
+    """ELF string table."""
+    def __init__(self, blob):
+        """Create a new string table backed by the data in the blob.
+
+        blob: a memoryview-like object
+
+        """
+        self.blob = blob
+
+    def get(self, index) -> bytes:
+        """Returns the null-terminated byte string at the index."""
+        blob = self.blob
+        endindex = index
+        while True:
+            if blob[endindex] == 0:
+                return bytes(blob[index:endindex])
+            endindex += 1
+
+class Image:
+    """ELF image parser."""
+    def __init__(self, image):
+        """Create an ELF image from binary image data.
+
+        image: a memoryview-like object that supports efficient range
+        subscripting.
+
+        """
+        self.image = image
+        ident = self.read(Ident, 0)
+        classdata = (ident.ei_class, ident.ei_data)
+        # Set self.Ehdr etc. to the subtypes with the right parsers.
+        for typ in (Ehdr, Phdr, Shdr, Dyn, Sym, Rel, Rela):
+            setattr(self, typ.__name__, typ.layouts.get(classdata, None))
+
+        if self.Ehdr is not None:
+            self.ehdr = self.read(self.Ehdr, 0)
+            self._shdr_num = self._compute_shdr_num()
+        else:
+            self.ehdr = None
+            self._shdr_num = 0
+
+        self._section = {}
+        self._stringtab = {}
+
+        if self._shdr_num > 0:
+            self._shdr_strtab = self._find_shdr_strtab()
+        else:
+            self._shdr_strtab = None
+
+    @staticmethod
+    def readfile(path: str) -> 'Image':
+        """Reads the ELF file at the specified path."""
+        with open(path, 'rb') as inp:
+            return Image(memoryview(inp.read()))
+
+    def _compute_shdr_num(self) -> int:
+        """Computes the actual number of section headers."""
+        shnum = self.ehdr.e_shnum
+        if shnum == 0:
+            if self.ehdr.e_shoff == 0 or self.ehdr.e_shentsize == 0:
+                # No section headers.
+                return 0
+            # Otherwise the extension mechanism is used (which may be
+            # needed because e_shnum is just 16 bits).
+            return self.read(self.Shdr, self.ehdr.e_shoff).sh_size
+        return shnum
+
+    def _find_shdr_strtab(self) -> StringTable:
+        """Finds the section header string table (maybe via extensions)."""
+        shstrndx = self.ehdr.e_shstrndx
+        if shstrndx == Shn.SHN_XINDEX:
+            shstrndx = self.read(self.Shdr, self.ehdr.e_shoff).sh_link
+        return self._find_stringtab(shstrndx)
+
+    def read(self, typ: type, offset:int ):
+        """Reads an object at a specific offset.
+
+        The type must have been enhanced using _define_variants.
+
+        """
+        return typ.unpack(self.image[offset: offset + typ.size])
+
+    def phdrs(self) -> Phdr:
+        """Generator iterating over the program headers."""
+        if self.ehdr is None:
+            return
+        size = self.ehdr.e_phentsize
+        if size != self.Phdr.size:
+            raise ValueError('Unexpected Phdr size in ELF header: {} != {}'
+                             .format(size, self.Phdr.size))
+
+        offset = self.ehdr.e_phoff
+        for _ in range(self.ehdr.e_phnum):
+            yield self.read(self.Phdr, offset)
+            offset += size
+
+    def shdrs(self, resolve: bool=True) -> Shdr:
+        """Generator iterating over the section headers.
+
+        If resolve, section names are automatically translated
+        using the section header string table.
+
+        """
+        if self._shdr_num == 0:
+            return
+
+        size = self.ehdr.e_shentsize
+        if size != self.Shdr.size:
+            raise ValueError('Unexpected Shdr size in ELF header: {} != {}'
+                             .format(size, self.Shdr.size))
+
+        offset = self.ehdr.e_shoff
+        for _ in range(self._shdr_num):
+            shdr = self.read(self.Shdr, offset)
+            if resolve:
+                shdr = shdr.resolve(self._shdr_strtab)
+            yield shdr
+            offset += size
+
+    def dynamic(self) -> Dyn:
+        """Generator iterating over the dynamic segment."""
+        for phdr in self.phdrs():
+            if phdr.p_type == Pt.PT_DYNAMIC:
+                # Pick the first dynamic segment, like the loader.
+                if phdr.p_filesz == 0:
+                    # Probably separated debuginfo.
+                    return
+                offset = phdr.p_offset
+                end = offset + phdr.p_memsz
+                size = self.Dyn.size
+                while True:
+                    next_offset = offset + size
+                    if next_offset > end:
+                        raise ValueError(
+                            'Dynamic segment size {} is not a multiple of Dyn size {}'.format(
+                                phdr.p_memsz, size))
+                    yield self.read(self.Dyn, offset)
+                    if next_offset == end:
+                        return
+                    offset = next_offset
+
+    def syms(self, shdr: Shdr, resolve: bool=True) -> Sym:
+        """A generator iterating over a symbol table.
+
+        If resolve, symbol names are automatically translated using
+        the string table for the symbol table.
+
+        """
+        assert shdr.sh_type == Sht.SHT_SYMTAB
+        size = shdr.sh_entsize
+        if size != self.Sym.size:
+            raise ValueError('Invalid symbol table entry size {}'.format(size))
+        offset = shdr.sh_offset
+        end = shdr.sh_offset + shdr.sh_size
+        if resolve:
+            strtab = self._find_stringtab(shdr.sh_link)
+        while offset < end:
+            sym = self.read(self.Sym, offset)
+            if resolve:
+                sym = sym.resolve(strtab)
+            yield sym
+            offset += size
+        if offset != end:
+            raise ValueError('Symbol table is not a multiple of entry size')
+
+    def lookup_string(self, strtab_index: int, strtab_offset: int) -> bytes:
+        """Looks up a string in a string table identified by its link index."""
+        try:
+            strtab = self._stringtab[strtab_index]
+        except KeyError:
+            strtab = self._find_stringtab(strtab_index)
+        return strtab.get(strtab_offset)
+
+    def find_section(self, shndx: Shn) -> Shdr:
+        """Returns the section header for the indexed section.
+
+        The section name is not resolved.
+        """
+        try:
+            return self._section[shndx]
+        except KeyError:
+            pass
+        if shndx in Shn:
+            raise ValueError('Reserved section index {}'.format(shndx))
+        idx = shndx.value
+        if idx < 0 or idx > self._shdr_num:
+            raise ValueError('Section index {} out of range [0, {})'.format(
+                idx, self._shdr_num))
+        shdr = self.read(
+            self.Shdr, self.ehdr.e_shoff + idx * self.Shdr.size)
+        self._section[shndx] = shdr
+        return shdr
+
+    def _find_stringtab(self, sh_link: int) -> StringTable:
+        if sh_link in self._stringtab:
+            return self._stringtab
+        if sh_link < 0 or sh_link >= self._shdr_num:
+            raise ValueError('Section index {} out of range [0, {})'.format(
+                sh_link, self._shdr_num))
+        shdr = self.read(
+            self.Shdr, self.ehdr.e_shoff + sh_link * self.Shdr.size)
+        if shdr.sh_type != Sht.SHT_STRTAB:
+            raise ValueError(
+                'Section {} is not a string table: {}'.format(
+                    sh_link, shdr.sh_type))
+        strtab = StringTable(
+            self.image[shdr.sh_offset:shdr.sh_offset + shdr.sh_size])
+        # This could retrain essentially arbitrary amounts of data,
+        # but caching string tables seems important for performance.
+        self._stringtab[sh_link] = strtab
+        return strtab
+
+
+__all__ = [name for name in dir() if name[0].isupper()]
diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py
index a514179bb..0e19ce1e7 100644
--- a/scripts/tst-elf-edit.py
+++ b/scripts/tst-elf-edit.py
@@ -43,9 +43,11 @@ EI_DATA=5
 ELFDATA2LSB=b'\x01'
 ELFDATA2MSB=b'\x02'
 
+ET_EXEC=2
 ET_DYN=3
 
 PT_LOAD=1
+PT_TLS=7
 
 def elf_types_fmts(e_ident):
     endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>'
@@ -146,8 +148,15 @@ def elf_edit_align(phdr, align):
     else:
         phdr.p_align = int(align)
 
+def elf_edit_maximize_tls_size(phdr, elfclass):
+    if elfclass == ELFCLASS32:
+        # It is possible that the kernel can allocate half of the
+        # address space, so use something larger.
+        phdr.p_memsz = 0xfff00000
+    else:
+        phdr.p_memsz = 1 << 63
 
-def elf_edit(f, align):
+def elf_edit(f, opts):
     ei_nident_fmt = 'c' * EI_NIDENT
     ei_nident_len = struct.calcsize(ei_nident_fmt)
 
@@ -172,24 +181,35 @@ def elf_edit(f, align):
 
     ehdr = Elf_Ehdr(e_ident)
     ehdr.read(f)
-    if ehdr.e_type != ET_DYN:
-       error('{}: not a shared library'.format(f.name))
+    if ehdr.e_type not in (ET_EXEC, ET_DYN):
+       error('{}: not an executable or shared library'.format(f.name))
 
     phdr = Elf_Phdr(e_ident)
+    maximize_tls_size_done = False
     for i in range(0, ehdr.e_phnum):
         f.seek(ehdr.e_phoff + i * phdr.len)
         phdr.read(f)
-        if phdr.p_type == PT_LOAD:
-            elf_edit_align(phdr, align)
+        if phdr.p_type == PT_LOAD and opts.align is not None:
+            elf_edit_align(phdr, opts.align)
+            f.seek(ehdr.e_phoff + i * phdr.len)
+            phdr.write(f)
+            break
+        if phdr.p_type == PT_TLS and opts.maximize_tls_size:
+            elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS])
             f.seek(ehdr.e_phoff + i * phdr.len)
             phdr.write(f)
+            maximize_tls_size_done = True
             break
 
+    if opts.maximize_tls_size and not maximize_tls_size_done:
+        error('{}: TLS maximum size was not updated'.format(f.name))
 
 def get_parser():
     parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument('-a', dest='align', required=True,
+    parser.add_argument('-a', dest='align',
                         help='How to set the LOAD alignment')
+    parser.add_argument('--maximize-tls-size', action='store_true',
+                        help='Set maximum PT_TLS size')
     parser.add_argument('output',
                         help='ELF file to edit')
     return parser
@@ -199,7 +219,7 @@ def main(argv):
     parser = get_parser()
     opts = parser.parse_args(argv)
     with open(opts.output, 'r+b') as fout:
-       elf_edit(fout, opts.align)
+       elf_edit(fout, opts)
 
 
 if __name__ == '__main__':
diff --git a/scripts/tst-ld-trace.py b/scripts/tst-ld-trace.py
new file mode 100755
index 000000000..f5a402800
--- /dev/null
+++ b/scripts/tst-ld-trace.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python3
+# Dump the output of LD_TRACE_LOADED_OBJECTS in architecture neutral format.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# Copyright The GNU Toolchain Authors.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+import argparse
+import os
+import subprocess
+import sys
+
+try:
+    subprocess.run
+except:
+    class _CompletedProcess:
+        def __init__(self, args, returncode, stdout=None, stderr=None):
+            self.args = args
+            self.returncode = returncode
+            self.stdout = stdout
+            self.stderr = stderr
+
+    def _run(*popenargs, input=None, timeout=None, check=False, **kwargs):
+        assert(timeout is None)
+        with subprocess.Popen(*popenargs, **kwargs) as process:
+            try:
+                stdout, stderr = process.communicate(input)
+            except:
+                process.kill()
+                process.wait()
+                raise
+            returncode = process.poll()
+            if check and returncode:
+                raise subprocess.CalledProcessError(returncode, popenargs)
+        return _CompletedProcess(popenargs, returncode, stdout, stderr)
+
+    subprocess.run = _run
+
+def is_vdso(lib):
+    return lib.startswith('linux-gate') or lib.startswith('linux-vdso')
+
+
+def parse_trace(cmd, fref):
+    new_env = os.environ.copy()
+    new_env['LD_TRACE_LOADED_OBJECTS'] = '1'
+    trace_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=True,
+                               universal_newlines=True, env=new_env).stdout
+    trace = []
+    for line in trace_out.splitlines():
+        line = line.strip()
+        if is_vdso(line):
+            continue
+        fields = line.split('=>' if '=>' in line else ' ')
+        lib = os.path.basename(fields[0].strip())
+        if lib.startswith('ld'):
+            lib = 'ld'
+        elif lib.startswith('libc'):
+            lib = 'libc'
+        found = 1 if fields[1].strip() != 'not found' else 0
+        trace += ['{} {}'.format(lib, found)]
+    trace = sorted(trace)
+
+    reference = sorted(line.replace('\n','') for line in fref.readlines())
+
+    ret = 0 if trace == reference else 1
+    if ret != 0:
+        for i in reference:
+            if i not in trace:
+                print("Only in {}: {}".format(fref.name, i))
+        for i in trace:
+            if i not in reference:
+                print("Only in trace: {}".format(i))
+
+    sys.exit(ret)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('command',
+                        help='comand to run')
+    parser.add_argument('reference',
+                        help='reference file to compare')
+    return parser
+
+
+def main(argv):
+    parser = get_parser()
+    opts = parser.parse_args(argv)
+    with open(opts.reference, 'r') as fref:
+        # Remove the initial 'env' command.
+        parse_trace(opts.command.split()[1:], fref)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/shlib-versions b/shlib-versions
index df6603e69..b87ab50c5 100644
--- a/shlib-versions
+++ b/shlib-versions
@@ -47,11 +47,6 @@ libnss_ldap=2
 libnss_hesiod=2
 libnss_db=2
 
-# Tests for NSS.  They must have the same NSS_SHLIB_REVISION number as
-# the rest.
-libnss_test1=2
-libnss_test2=2
-
 # Version for libnsl with YP and NIS+ functions.
 libnsl=1
 
diff --git a/socket/Makefile b/socket/Makefile
index 156eec6c8..2bde78387 100644
--- a/socket/Makefile
+++ b/socket/Makefile
@@ -34,6 +34,7 @@ routines := accept bind connect getpeername getsockname getsockopt	\
 tests := \
   tst-accept4 \
   tst-sockopt \
+  tst-cmsghdr \
   # tests
 
 tests-internal := \
diff --git a/socket/sys/socket.h b/socket/sys/socket.h
index 7d5b21a2c..0abfb5dd0 100644
--- a/socket/sys/socket.h
+++ b/socket/sys/socket.h
@@ -181,7 +181,7 @@ extern ssize_t __REDIRECT (sendmsg, (int __fd, const struct msghdr *__message,
 # else
 extern ssize_t __sendmsg64 (int __fd, const struct msghdr *__message,
 			    int __flags);
-#  defien sendmsg __sendmsg64
+#  define sendmsg __sendmsg64
 # endif
 #endif
 
diff --git a/socket/tst-cmsghdr-skeleton.c b/socket/tst-cmsghdr-skeleton.c
new file mode 100644
index 000000000..4c6898569
--- /dev/null
+++ b/socket/tst-cmsghdr-skeleton.c
@@ -0,0 +1,92 @@
+/* Test ancillary data header creation.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* We use the preprocessor to generate the function/macro tests instead of
+   using indirection because having all the macro expansions alongside
+   each other lets the compiler warn us about suspicious pointer
+   arithmetic across subsequent CMSG_{FIRST,NXT}HDR expansions.  */
+
+#include <stdint.h>
+
+#define RUN_TEST_CONCAT(suffix) run_test_##suffix
+#define RUN_TEST_FUNCNAME(suffix) RUN_TEST_CONCAT (suffix)
+
+static void
+RUN_TEST_FUNCNAME (CMSG_NXTHDR_IMPL) (void)
+{
+  struct msghdr m = {0};
+  struct cmsghdr *cmsg;
+  char cmsgbuf[3 * CMSG_SPACE (sizeof (PAYLOAD))] = {0};
+
+  m.msg_control = cmsgbuf;
+  m.msg_controllen = sizeof (cmsgbuf);
+
+  /* First header should point to the start of the buffer.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+
+  /* If the first header length consumes the entire buffer, there is no
+     space remaining for additional headers.  */
+  cmsg->cmsg_len = sizeof (cmsgbuf);
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg == NULL);
+
+  /* The first header length is so big, using it would cause an overflow.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+  cmsg->cmsg_len = SIZE_MAX;
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg == NULL);
+
+  /* The first header leaves just enough space to hold another header.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+  cmsg->cmsg_len = sizeof (cmsgbuf) - sizeof (struct cmsghdr);
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg != NULL);
+
+  /* The first header leaves space but not enough for another header.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+  cmsg->cmsg_len ++;
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg == NULL);
+
+  /* The second header leaves just enough space to hold another header.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+  cmsg->cmsg_len = CMSG_LEN (sizeof (PAYLOAD));
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg != NULL);
+  cmsg->cmsg_len = sizeof (cmsgbuf)
+                   - CMSG_SPACE (sizeof (PAYLOAD)) /* First header.  */
+                   - sizeof (struct cmsghdr);
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg != NULL);
+
+  /* The second header leaves space but not enough for another header.  */
+  cmsg = CMSG_FIRSTHDR (&m);
+  TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg != NULL);
+  cmsg->cmsg_len ++;
+  cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+  TEST_VERIFY_EXIT (cmsg == NULL);
+
+  return;
+}
diff --git a/socket/tst-cmsghdr.c b/socket/tst-cmsghdr.c
new file mode 100644
index 000000000..68c96d3c9
--- /dev/null
+++ b/socket/tst-cmsghdr.c
@@ -0,0 +1,56 @@
+/* Test ancillary data header creation.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sys/socket.h>
+#include <gnu/lib-names.h>
+#include <support/xdlfcn.h>
+#include <support/check.h>
+
+#define PAYLOAD "Hello, World!"
+
+/* CMSG_NXTHDR is a macro that calls an inline function defined in
+   bits/socket.h.  In case the function cannot be inlined, libc.so carries
+   a copy.  Both versions need to be tested.  */
+
+#define CMSG_NXTHDR_IMPL CMSG_NXTHDR
+#include "tst-cmsghdr-skeleton.c"
+#undef CMSG_NXTHDR_IMPL
+
+static struct cmsghdr * (* cmsg_nxthdr) (struct msghdr *, struct cmsghdr *);
+
+#define CMSG_NXTHDR_IMPL cmsg_nxthdr
+#include "tst-cmsghdr-skeleton.c"
+#undef CMSG_NXTHDR_IMPL
+
+static int
+do_test (void)
+{
+  static void *handle;
+
+  run_test_CMSG_NXTHDR ();
+
+  handle = xdlopen (LIBC_SO, RTLD_LAZY);
+  cmsg_nxthdr = (struct cmsghdr * (*) (struct msghdr *, struct cmsghdr *))
+                  xdlsym (handle, "__cmsg_nxthdr");
+
+  run_test_cmsg_nxthdr ();
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 823674198..164dd8909 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -217,6 +217,9 @@ CFLAGS-tst-qsort.c += $(stack-align-test-flags)
 CFLAGS-tst-makecontext.c += -funwind-tables
 CFLAGS-tst-makecontext2.c += $(stack-align-test-flags)
 
+CFLAGS-testmb.c += -D_FORTIFY_SOURCE=2 -Wall -Werror
+
+
 # Run a test on the header files we use.
 tests-special += $(objpfx)isomac.out
 
diff --git a/stdlib/bits/stdlib.h b/stdlib/bits/stdlib.h
index 277d099e2..de1c3b20f 100644
--- a/stdlib/bits/stdlib.h
+++ b/stdlib/bits/stdlib.h
@@ -96,6 +96,11 @@ extern size_t __mbstowcs_chk (wchar_t *__restrict __dst,
 			      const char *__restrict __src,
 			      size_t __len, size_t __dstlen) __THROW
     __attr_access ((__write_only__, 1, 3)) __attr_access ((__read_only__, 2));
+extern size_t __REDIRECT_NTH (__mbstowcs_nulldst,
+			      (wchar_t *__restrict __dst,
+			       const char *__restrict __src,
+			       size_t __len), mbstowcs)
+    __attr_access ((__read_only__, 2));
 extern size_t __REDIRECT_NTH (__mbstowcs_alias,
 			      (wchar_t *__restrict __dst,
 			       const char *__restrict __src,
@@ -112,12 +117,13 @@ __fortify_function size_t
 __NTH (mbstowcs (wchar_t *__restrict __dst, const char *__restrict __src,
 		 size_t __len))
 {
-  return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t),
-			    __glibc_objsize (__dst),
-			    __dst, __src, __len);
+  if (__builtin_constant_p (__dst == NULL) && __dst == NULL)
+    return __mbstowcs_nulldst (__dst, __src, __len);
+  else
+    return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t),
+			      __glibc_objsize (__dst), __dst, __src, __len);
 }
 
-
 extern size_t __wcstombs_chk (char *__restrict __dst,
 			      const wchar_t *__restrict __src,
 			      size_t __len, size_t __dstlen) __THROW
diff --git a/stdlib/testmb.c b/stdlib/testmb.c
index 45dae7db6..6ac4dfd21 100644
--- a/stdlib/testmb.c
+++ b/stdlib/testmb.c
@@ -16,6 +16,13 @@ main (int argc, char *argv[])
       lose = 1;
     }
 
+  i = mbstowcs (NULL, "bar", 4);
+  if (!(i == 3 && w[1] == 'a'))
+    {
+      puts ("mbstowcs FAILED2!");
+      lose = 1;
+    }
+
   mbstowcs (w, "blah", 5);
   i = wcstombs (c, w, 10);
   if (i != 4)
diff --git a/string/bits/string_fortified.h b/string/bits/string_fortified.h
index f4a5dfc2e..149ebbb08 100644
--- a/string/bits/string_fortified.h
+++ b/string/bits/string_fortified.h
@@ -107,7 +107,7 @@ __NTH (stpncpy (char *__dest, const char *__src, size_t __n))
 # else
 extern char *__stpncpy_chk (char *__dest, const char *__src, size_t __n,
 			    size_t __destlen) __THROW
-  __fortified_attr_access ((__write_only__, 1, 3))
+  __fortified_attr_access (__write_only__, 1, 3)
   __attr_access ((__read_only__, 2));
 extern char *__REDIRECT_NTH (__stpncpy_alias, (char *__dest, const char *__src,
 					       size_t __n), stpncpy);
diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
index cafb75298..703e8ec27 100644
--- a/string/test-rawmemchr.c
+++ b/string/test-rawmemchr.c
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <assert.h>
+#include <support/xunistd.h>
 
 #define TEST_MAIN
 #define TEST_NAME "rawmemchr"
@@ -50,13 +51,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
     }
 }
 
+static void
+do_test_bz29234 (void)
+{
+  size_t i, j;
+  char *ptr_start;
+  char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS, -1);
+
+  memset (buf, -1, 8192);
+
+  ptr_start = buf + 4096 - 8;
+
+  /* Out of range matches before the start of a page. */
+  memset (ptr_start - 8, 0x1, 8);
+
+  for (j = 0; j < 8; ++j)
+    {
+      for (i = 0; i < 128; ++i)
+	{
+	  ptr_start[i + j] = 0x1;
+
+	  FOR_EACH_IMPL (impl, 0)
+	    do_one_test (impl, (char *) (ptr_start + j), 0x1,
+			 ptr_start + i + j);
+
+	  ptr_start[i + j] = 0xff;
+	}
+    }
+
+  xmunmap (buf, 8192);
+}
+
 static void
 do_test (size_t align, size_t pos, size_t len, int seek_char)
 {
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -114,6 +147,13 @@ do_random_tests (void)
 	    }
 	}
 
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
+
       assert (pos < len);
       size_t r = random ();
       if ((r & 31) == 0)
@@ -129,6 +169,13 @@ do_random_tests (void)
 		   result, p);
 	    ret = 1;
 	  }
+
+      if (align)
+	{
+	  p[align - 1] = seek_char;
+	  if (align > 4)
+	    p[align - 4] = seek_char;
+	}
     }
 }
 
@@ -150,14 +197,22 @@ test_main (void)
       do_test (i, 64, 256, 23);
       do_test (0, 16 << i, 2048, 0);
       do_test (i, 64, 256, 0);
+
+      do_test (getpagesize () - i, 64, 256, 23);
+      do_test (getpagesize () - i, 64, 256, 0);
     }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
+
+      do_test (getpagesize () - 7, i, i + 1, 23);
+      do_test (getpagesize () - i / 2, i, i + 1, 23);
+      do_test (getpagesize () - i, i, i + 1, 23);
     }
 
   do_random_tests ();
+  do_test_bz29234 ();
   return ret;
 }
 
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index e7d5edea3..358f40eb5 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -434,6 +434,28 @@ check3 (void)
 	}
 }
 
+static void
+check4 (void)
+{
+  /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
+     the end of the page. 2) For there to be no mismatch/null byte before the
+     first page cross. 3) For length (`n`) to be large enough for one string to
+     cross the page. And 4) for there to be either mismatch/null bytes before
+     the start of the strings.  */
+
+  size_t size = 10;
+  size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
+  CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
+  CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
+  int exp_result;
+
+  STRCPY (s1, L ("tst-tlsmod%"));
+  STRCPY (s2, L ("tst-tls-manydynamic73mod"));
+  exp_result = SIMPLE_STRNCMP (s1, s2, size);
+  FOR_EACH_IMPL (impl, 0)
+  check_result (impl, s1, s2, size, exp_result);
+}
+
 int
 test_main (void)
 {
@@ -444,6 +466,7 @@ test_main (void)
   check1 ();
   check2 ();
   check3 ();
+  check4 ();
 
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 2ebe7901c..4a5e7b63d 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -1121,9 +1121,11 @@ extern void _dl_init (struct link_map *main_map, int argc, char **argv,
    initializer functions have completed.  */
 extern void _dl_fini (void) attribute_hidden;
 
-/* Sort array MAPS according to dependencies of the contained objects.  */
+/* Sort array MAPS according to dependencies of the contained objects.
+   If FORCE_FIRST, MAPS[0] keeps its place even if the dependencies
+   say otherwise.  */
 extern void _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
-			   unsigned int skip, bool for_fini) attribute_hidden;
+			   bool force_first, bool for_fini) attribute_hidden;
 
 /* The dynamic linker calls this function before and having changing
    any shared object mappings.  The `r_state' member of `struct r_debug'
@@ -1254,6 +1256,11 @@ extern struct link_map * _dl_get_dl_main_map (void)
 # endif
 #endif
 
+/* Perform early memory allocation, avoding a TCB dependency.
+   Terminate the process if allocation fails.  May attempt to use
+   brk.  */
+void *_dl_early_allocate (size_t size) attribute_hidden;
+
 /* Initialize the DSO sort algorithm to use.  */
 #if !HAVE_TUNABLES
 static inline void
diff --git a/sysdeps/generic/libc-lock-arch.h b/sysdeps/generic/libc-lock-arch.h
new file mode 100644
index 000000000..4713b30a8
--- /dev/null
+++ b/sysdeps/generic/libc-lock-arch.h
@@ -0,0 +1,25 @@
+/* Private libc-internal arch-specific definitions.  Generic version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _LIBC_LOCK_ARCH_H
+#define _LIBC_LOCK_ARCH_H
+
+/* The default definition uses the natural alignment from the lock type.  */
+#define __LIBC_LOCK_ALIGNMENT
+
+#endif
diff --git a/sysdeps/generic/startup.h b/sysdeps/generic/startup.h
index 99509404e..45979ab23 100644
--- a/sysdeps/generic/startup.h
+++ b/sysdeps/generic/startup.h
@@ -23,27 +23,3 @@
 
 /* Use macro instead of inline function to avoid including <stdio.h>.  */
 #define _startup_fatal(message) __libc_fatal ((message))
-
-static inline uid_t
-startup_getuid (void)
-{
-  return __getuid ();
-}
-
-static inline uid_t
-startup_geteuid (void)
-{
-  return __geteuid ();
-}
-
-static inline gid_t
-startup_getgid (void)
-{
-  return __getgid ();
-}
-
-static inline gid_t
-startup_getegid (void)
-{
-  return __getegid ();
-}
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
index 2584557c4..9ed21602d 100644
--- a/sysdeps/hppa/dl-fptr.c
+++ b/sysdeps/hppa/dl-fptr.c
@@ -26,6 +26,7 @@
 #include <ldsodefs.h>
 #include <elf/dynamic-link.h>
 #include <dl-fptr.h>
+#include <dl-runtime.h>
 #include <dl-unmap-segments.h>
 #include <atomic.h>
 #include <libc-pointer-arith.h>
@@ -351,21 +352,20 @@ _dl_lookup_address (const void *address)
 {
   ElfW(Addr) addr = (ElfW(Addr)) address;
   ElfW(Word) reloc_arg;
-  volatile unsigned int *desc;
-  unsigned int *gptr;
+  unsigned int *desc, *gptr;
 
   /* Return ADDR if the least-significant two bits of ADDR are not consistent
      with ADDR being a linker defined function pointer.  The normal value for
      a code address in a backtrace is 3.  */
-  if (((unsigned int) addr & 3) != 2)
+  if (((uintptr_t) addr & 3) != 2)
     return addr;
 
   /* Handle special case where ADDR points to page 0.  */
-  if ((unsigned int) addr < 4096)
+  if ((uintptr_t) addr < 4096)
     return addr;
 
   /* Clear least-significant two bits from descriptor address.  */
-  desc = (unsigned int *) ((unsigned int) addr & ~3);
+  desc = (unsigned int *) ((uintptr_t) addr & ~3);
   if (!_dl_read_access_allowed (desc))
     return addr;
 
@@ -376,7 +376,7 @@ _dl_lookup_address (const void *address)
   /* Then load first word of candidate descriptor.  It should be a pointer
      with word alignment and point to memory that can be read.  */
   gptr = (unsigned int *) desc[0];
-  if (((unsigned int) gptr & 3) != 0
+  if (((uintptr_t) gptr & 3) != 0
       || !_dl_read_access_allowed (gptr))
     return addr;
 
@@ -400,10 +400,11 @@ _dl_lookup_address (const void *address)
 
       /* If gp has been resolved, we need to hunt for relocation offset.  */
       if (!(reloc_arg & PA_GP_RELOC))
-	reloc_arg = _dl_fix_reloc_arg (addr, l);
+	reloc_arg = _dl_fix_reloc_arg ((struct fdesc *) addr, l);
 
       _dl_fixup (l, reloc_arg);
     }
 
   return (ElfW(Addr)) desc[0];
 }
+rtld_hidden_def (_dl_lookup_address)
diff --git a/sysdeps/hppa/dl-lookupcfg.h b/sysdeps/hppa/dl-lookupcfg.h
index 8da2412fe..de0a3b78e 100644
--- a/sysdeps/hppa/dl-lookupcfg.h
+++ b/sysdeps/hppa/dl-lookupcfg.h
@@ -30,6 +30,7 @@ rtld_hidden_proto (_dl_symbol_address)
 #define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref)
 
 Elf32_Addr _dl_lookup_address (const void *address);
+rtld_hidden_proto (_dl_lookup_address)
 
 #define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address ((const void *) addr)
 
@@ -79,7 +80,9 @@ void attribute_hidden _dl_unmap (struct link_map *map);
 /* Extract the code address from a fixup value */
 #define DL_FIXUP_VALUE_CODE_ADDR(value) ((value).ip)
 #define DL_FIXUP_VALUE_ADDR(value) ((uintptr_t) &(value))
-#define DL_FIXUP_ADDR_VALUE(addr) (*(struct fdesc *) (addr))
+/* Clear the plabel bit to get the actual address of the descriptor.  */
+#define DL_FIXUP_ADDR_VALUE(addr) \
+  (*(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (addr) & ~2))
 #define DL_FIXUP_BINDNOW_ADDR_VALUE(addr) (addr)
-#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value) \
-  (*value) = *(struct fdesc *) (st_value)
+#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value)	\
+  *(value) = *(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (new_value) & ~2)
diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h
index da4d57d2e..7b647abfd 100644
--- a/sysdeps/hppa/dl-machine.h
+++ b/sysdeps/hppa/dl-machine.h
@@ -176,6 +176,15 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
     Elf32_Addr i[2];
   } sig = {{0x00,0xc0,0xff,0xee, 0xde,0xad,0xbe,0xef}};
 
+  /* Initialize dp register for main executable.  */
+  if (l->l_main_map)
+    {
+      register Elf32_Addr dp asm ("%r27");
+
+      dp = D_PTR (l, l_info[DT_PLTGOT]);
+      asm volatile ("" : : "r" (dp));
+    }
+
   /* If we don't have a PLT we can just skip all this... */
   if (__builtin_expect (l->l_info[DT_JMPREL] == NULL,0))
     return lazy;
@@ -338,16 +347,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
    its return value is the user program's entry point.  */
 
 #define RTLD_START \
-/* Set up dp for any non-PIC lib constructors that may be called.  */	\
-static struct link_map * __attribute__((used))				\
-set_dp (struct link_map *map)						\
-{									\
-  register Elf32_Addr dp asm ("%r27");					\
-  dp = D_PTR (map, l_info[DT_PLTGOT]);					\
-  asm volatile ("" : : "r" (dp));					\
-  return map;								\
-}									\
-									\
 asm (									\
 "	.text\n"							\
 "	.globl _start\n"						\
@@ -355,10 +354,6 @@ asm (									\
 "_start:\n"								\
 	/* The kernel does not give us an initial stack frame. */	\
 "	ldo	64(%sp),%sp\n"						\
-	/* Save the relevant arguments (yes, those are the correct	\
-	   registers, the kernel is weird) in their stack slots. */	\
-"	stw	%r25,-40(%sp)\n" /* argc */				\
-"	stw	%r24,-44(%sp)\n" /* argv */				\
 									\
 	/* We need the LTP, and we need it now.				\
 	   $PIC_pcrel$0 points 8 bytes past the current instruction,	\
@@ -416,12 +411,7 @@ asm (									\
 	  So, obviously, we can't just pass %sp to _dl_start.  That's	\
 	  okay, argv-4 will do just fine.				\
 									\
-	  The pleasant part of this is that if we need to skip		\
-	  arguments we can just decrement argc and move argv, because	\
-	  the stack pointer is utterly unrelated to the location of	\
-	  the environment and argument vectors. */			\
-									\
-	/* This is always within range so we'll be okay. */		\
+	  This is always within range so we'll be okay. */		\
 "	bl	_dl_start,%rp\n"					\
 "	ldo	-4(%r24),%r26\n"					\
 									\
@@ -431,30 +421,28 @@ asm (									\
 	/* Save the entry point in %r3. */				\
 "	copy	%ret0,%r3\n"						\
 									\
-	/* See if we were called as a command with the executable file	\
-	   name as an extra leading argument. */			\
-"	addil	LT'_dl_skip_args,%r19\n"				\
-"	ldw	RT'_dl_skip_args(%r1),%r20\n"				\
-"	ldw	0(%r20),%r20\n"						\
-									\
-"	ldw	-40(%sp),%r25\n"	/* argc */			\
-"	comib,=	0,%r20,.Lnofix\n"	/* FIXME: Mispredicted branch */\
-"	ldw	-44(%sp),%r24\n"	/* argv (delay slot) */		\
+	/* The loader adjusts argc, argv, env, and the aux vectors	\
+	   directly on the stack to remove any arguments used for	\
+	   direct loader invocation.  Thus, argc and argv must be	\
+	   reloaded from from _dl_argc and _dl_argv.  */		\
 									\
-"	sub	%r25,%r20,%r25\n"					\
+	/* Load argc from _dl_argc.  */					\
+"	addil	LT'_dl_argc,%r19\n"					\
+"	ldw	RT'_dl_argc(%r1),%r20\n"				\
+"	ldw	0(%r20),%r25\n"						\
 "	stw	%r25,-40(%sp)\n"					\
-"	sh2add	%r20,%r24,%r24\n"					\
+									\
+	/* Same for argv with _dl_argv.  */				\
+"	addil	LT'_dl_argv,%r19\n"					\
+"	ldw	RT'_dl_argv(%r1),%r20\n"				\
+"	ldw	0(%r20),%r24\n"						\
 "	stw	%r24,-44(%sp)\n"					\
 									\
-".Lnofix:\n"								\
+	/* Call _dl_init(main_map, argc, argv, envp). */		\
 "	addil	LT'_rtld_local,%r19\n"					\
 "	ldw	RT'_rtld_local(%r1),%r26\n"				\
-"	bl	set_dp, %r2\n"						\
 "	ldw	0(%r26),%r26\n"						\
 									\
-	/* Call _dl_init(_dl_loaded, argc, argv, envp). */		\
-"	copy	%r28,%r26\n"						\
-									\
 	/* envp = argv + argc + 1 */					\
 "	sh2add	%r25,%r24,%r23\n"					\
 "	bl	_dl_init,%r2\n"						\
diff --git a/sysdeps/hppa/dl-runtime.c b/sysdeps/hppa/dl-runtime.c
index 8b2ee58e3..192a6bee0 100644
--- a/sysdeps/hppa/dl-runtime.c
+++ b/sysdeps/hppa/dl-runtime.c
@@ -25,8 +25,7 @@
    return that to the caller.  The caller will continue on to call
    _dl_fixup with the relocation offset.  */
 
-ElfW(Word)
-attribute_hidden __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE
+ElfW(Word) __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE
 _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
 {
   Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type;
@@ -52,3 +51,4 @@ _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
   ABORT_INSTRUCTION;
   return 0;
 }
+rtld_hidden_def (_dl_fix_reloc_arg)
diff --git a/sysdeps/hppa/dl-runtime.h b/sysdeps/hppa/dl-runtime.h
index d4da46079..5ced8e14e 100644
--- a/sysdeps/hppa/dl-runtime.h
+++ b/sysdeps/hppa/dl-runtime.h
@@ -17,6 +17,9 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+ElfW(Word) _dl_fix_reloc_arg (struct fdesc *, struct link_map *);
+rtld_hidden_proto (_dl_fix_reloc_arg)
+
 /* Clear PA_GP_RELOC bit in relocation offset.  */
 static inline uintptr_t
 reloc_offset (uintptr_t plt0, uintptr_t pltn)
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 760104911..84e6686eb 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
 
 Function: Imaginary part of "clog10":
 double: 2
-float: 1
+float: 2
 float128: 2
 ldouble: 2
 
diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index a39c89cec..cc21e6907 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -668,7 +668,7 @@ ldouble: 4
 
 Function: Imaginary part of "clog10":
 double: 2
-float: 1
+float: 2
 float128: 2
 ldouble: 2
 
diff --git a/sysdeps/m68k/dl-machine.h b/sysdeps/m68k/dl-machine.h
index c44ab055a..bb51b4198 100644
--- a/sysdeps/m68k/dl-machine.h
+++ b/sysdeps/m68k/dl-machine.h
@@ -234,6 +234,11 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
 
       switch (r_type)
 	{
+	case R_68K_GLOB_DAT:
+	case R_68K_JMP_SLOT:
+	  *reloc_addr = value;
+	  break;
+#ifndef RTLD_BOOTSTRAP
 	case R_68K_COPY:
 	  if (sym == NULL)
 	    /* This can happen in trace mode if an object could not be
@@ -252,10 +257,6 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
 	  memcpy (reloc_addr_arg, (void *) value,
 		  MIN (sym->st_size, refsym->st_size));
 	  break;
-	case R_68K_GLOB_DAT:
-	case R_68K_JMP_SLOT:
-	  *reloc_addr = value;
-	  break;
 	case R_68K_8:
 	  *(char *) reloc_addr = value + reloc->r_addend;
 	  break;
@@ -276,7 +277,6 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
 	case R_68K_PC32:
 	  *reloc_addr = value + reloc->r_addend - (Elf32_Addr) reloc_addr;
 	  break;
-#ifndef RTLD_BOOTSTRAP
 	case R_68K_TLS_DTPMOD32:
 	  /* Get the information from the link map returned by the
 	     resolv function.  */
@@ -294,9 +294,9 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[],
 	      *reloc_addr = TLS_TPREL_VALUE (sym_map, sym, reloc);
 	    }
 	  break;
-#endif /* !RTLD_BOOTSTRAP */
 	case R_68K_NONE:		/* Alright, Wilbur.  */
 	  break;
+#endif /* !RTLD_BOOTSTRAP */
 	default:
 	  _dl_reloc_bad_type (map, r_type, 0);
 	  break;
diff --git a/sysdeps/mach/hurd/bits/socket.h b/sysdeps/mach/hurd/bits/socket.h
index 5b35ea81e..70fce4fb2 100644
--- a/sysdeps/mach/hurd/bits/socket.h
+++ b/sysdeps/mach/hurd/bits/socket.h
@@ -249,6 +249,12 @@ struct cmsghdr
 			 + CMSG_ALIGN (sizeof (struct cmsghdr)))
 #define CMSG_LEN(len)   (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
 
+/* Given a length, return the additional padding necessary such that
+   len + __CMSG_PADDING(len) == CMSG_ALIGN (len).  */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+                              - ((len) & (sizeof (size_t) - 1))) \
+                             & (sizeof (size_t) - 1))
+
 extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 				      struct cmsghdr *__cmsg) __THROW;
 #ifdef __USE_EXTERN_INLINES
@@ -258,18 +264,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 _EXTERN_INLINE struct cmsghdr *
 __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
 {
+  /* We may safely assume that __cmsg lies between __mhdr->msg_control and
+     __mhdr->msg_controllen because the user is required to obtain the first
+     cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+     via CMSG_NXTHDR, setting lengths along the way.  However, we don't yet
+     trust the value of __cmsg->cmsg_len and therefore do not use it in any
+     pointer arithmetic until we check its value.  */
+
+  unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+  unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+  size_t __size_needed = sizeof (struct cmsghdr)
+                         + __CMSG_PADDING (__cmsg->cmsg_len);
+
+  /* The current header is malformed, too small to be a full header.  */
   if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
-    /* The kernel header does this so there may be a reason.  */
     return (struct cmsghdr *) 0;
 
+  /* There isn't enough space between __cmsg and the end of the buffer to
+  hold the current cmsg *and* the next one.  */
+  if (((size_t)
+         (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+       < __size_needed)
+      || ((size_t)
+            (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+             - __size_needed)
+          < __cmsg->cmsg_len))
+
+    return (struct cmsghdr *) 0;
+
+  /* Now, we trust cmsg_len and can use it to find the next header.  */
   __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
 			       + CMSG_ALIGN (__cmsg->cmsg_len));
-  if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
-					+ __mhdr->msg_controllen)
-      || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
-	  > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
-    /* No more entries.  */
-    return (struct cmsghdr *) 0;
   return __cmsg;
 }
 #endif	/* Use `extern inline'.  */
diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c
index 3cbe07561..8373962e6 100644
--- a/sysdeps/mach/hurd/dl-sysdep.c
+++ b/sysdeps/mach/hurd/dl-sysdep.c
@@ -76,6 +76,7 @@ _dl_sysdep_start (void **start_argptr,
 {
   void go (intptr_t *argdata)
     {
+      char *orig_argv0;
       char **p;
 
       /* Cache the information in various global variables.  */
@@ -84,6 +85,8 @@ _dl_sysdep_start (void **start_argptr,
       _environ = &_dl_argv[_dl_argc + 1];
       for (p = _environ; *p++;); /* Skip environ pointers and terminator.  */
 
+      orig_argv0 = _dl_argv[0];
+
       if ((void *) p == _dl_argv[0])
 	{
 	  static struct hurd_startup_data nodata;
@@ -173,30 +176,23 @@ _dl_sysdep_start (void **start_argptr,
 
       /* The call above might screw a few things up.
 
-	 First of all, if _dl_skip_args is nonzero, we are ignoring
-	 the first few arguments.  However, if we have no Hurd startup
-	 data, it is the magical convention that ARGV[0] == P.  The
+	 P is the location after the terminating NULL of the list of
+	 environment variables.  It has to point to the Hurd startup
+	 data or if that's missing then P == ARGV[0] must hold. The
 	 startup code in init-first.c will get confused if this is not
 	 the case, so we must rearrange things to make it so.  We'll
-	 overwrite the origional ARGV[0] at P with ARGV[_dl_skip_args].
+	 recompute P and move the Hurd data or the new ARGV[0] there.
 
-	 Secondly, if we need to be secure, it removes some dangerous
-	 environment variables.  If we have no Hurd startup date this
-	 changes P (since that's the location after the terminating
-	 NULL in the list of environment variables).  We do the same
-	 thing as in the first case but make sure we recalculate P.
-	 If we do have Hurd startup data, we have to move the data
-	 such that it starts just after the terminating NULL in the
-	 environment list.
+	 Note: directly invoked ld.so can move arguments and env vars.
 
 	 We use memmove, since the locations might overlap.  */
-      if (__libc_enable_secure || _dl_skip_args)
-	{
-	  char **newp;
 
-	  for (newp = _environ; *newp++;);
+      char **newp;
+      for (newp = _environ; *newp++;);
 
-	  if (_dl_argv[-_dl_skip_args] == (char *) p)
+      if (newp != p || _dl_argv[0] != orig_argv0)
+	{
+	  if (orig_argv0 == (char *) p)
 	    {
 	      if ((char *) newp != _dl_argv[0])
 		{
diff --git a/sysdeps/mach/hurd/i386/init-first.c b/sysdeps/mach/hurd/i386/init-first.c
index 1229b5911..534a796e0 100644
--- a/sysdeps/mach/hurd/i386/init-first.c
+++ b/sysdeps/mach/hurd/i386/init-first.c
@@ -38,10 +38,6 @@ extern void __init_misc (int, char **, char **);
 unsigned long int __hurd_threadvar_stack_offset;
 unsigned long int __hurd_threadvar_stack_mask;
 
-#ifndef SHARED
-int __libc_enable_secure;
-#endif
-
 extern int __libc_argc attribute_hidden;
 extern char **__libc_argv attribute_hidden;
 extern char **_dl_argv;
diff --git a/sysdeps/nios2/dl-machine.h b/sysdeps/nios2/dl-machine.h
index 80de6fd04..9a35cf416 100644
--- a/sysdeps/nios2/dl-machine.h
+++ b/sysdeps/nios2/dl-machine.h
@@ -128,53 +128,23 @@ _start:\n\
         ldw r8, %call(_dl_nios2_get_gp_value)(r22)\n\
         callr r8\n\
         mov gp, r2\n\
-\n\
-        /* Find the number of arguments to skip.  */\n\
-        ldw r8, %got(_dl_skip_args)(r22)\n\
-        ldw r8, 0(r8)\n\
 \n\
         /* Find the main_map from the GOT.  */\n\
         ldw r4, %got(_rtld_local)(r22)\n\
         ldw r4, 0(r4)\n\
 \n\
-        /* Find argc.  */\n\
-        ldw r5, 0(sp)\n\
-        sub r5, r5, r8\n\
-        stw r5, 0(sp)\n\
-\n\
-        /* Find the first unskipped argument.  */\n\
-        slli r8, r8, 2\n\
-        addi r6, sp, 4\n\
-        add r9, r6, r8\n\
-        mov r10, r6\n\
-\n\
-        /* Shuffle argv down.  */\n\
-3:      ldw r11, 0(r9)\n\
-        stw r11, 0(r10)\n\
-        addi r9, r9, 4\n\
-        addi r10, r10, 4\n\
-        bne r11, zero, 3b\n\
+        /* Load adjusted argc.  */\n\
+        ldw r2, %got(_dl_argc)(r22)\n\
+	ldw r5, 0(r2)\n\
 \n\
-        /* Shuffle envp down.  */\n\
-        mov r7, r10\n\
-4:      ldw r11, 0(r9)\n\
-        stw r11, 0(r10)\n\
-        addi r9, r9, 4\n\
-        addi r10, r10, 4\n\
-        bne r11, zero, 4b\n\
-\n\
-        /* Shuffle auxv down.  */\n\
-5:      ldw r11, 4(r9)\n\
-        stw r11, 4(r10)\n\
-        ldw r11, 0(r9)\n\
-        stw r11, 0(r10)\n\
-        addi r9, r9, 8\n\
-        addi r10, r10, 8\n\
-        bne r11, zero, 5b\n\
-\n\
-        /* Update _dl_argv.  */\n\
+        /* Load adjsuted argv.  */\n\
         ldw r2, %got(_dl_argv)(r22)\n\
-        stw r6, 0(r2)\n\
+	ldw r6, 0(r2)\n\
+\n\
+	/* envp = argv + argc + 1  */\n\
+        addi r7, r5, 1\n\
+        slli r7, r7, 2\n\
+        add  r7, r7, r6\n\
 \n\
         /* Call _dl_init through the PLT.  */\n\
         ldw r8, %call(_dl_init)(r22)\n\
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
index 1294c9181..53fba774a 100644
--- a/sysdeps/nptl/dl-tls_init_tp.c
+++ b/sysdeps/nptl/dl-tls_init_tp.c
@@ -128,7 +128,4 @@ __tls_init_tp (void)
      It will be bigger than it actually is, but for unwind.c/pt-longjmp.c
      purposes this is good enough.  */
   THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
-
-  THREAD_SETMEM (pd, cancelstate, PTHREAD_CANCEL_ENABLE);
-  THREAD_SETMEM (pd, canceltype, PTHREAD_CANCEL_DEFERRED);
 }
diff --git a/sysdeps/nptl/libc-lock.h b/sysdeps/nptl/libc-lock.h
index 5af476c48..63b3f3d75 100644
--- a/sysdeps/nptl/libc-lock.h
+++ b/sysdeps/nptl/libc-lock.h
@@ -22,6 +22,7 @@
 #include <pthread.h>
 #define __need_NULL
 #include <stddef.h>
+#include <libc-lock-arch.h>
 
 
 /* Mutex type.  */
@@ -29,7 +30,12 @@
 # if (!IS_IN (libc) && !IS_IN (libpthread)) || !defined _LIBC
 typedef struct { pthread_mutex_t mutex; } __libc_lock_recursive_t;
 # else
-typedef struct { int lock; int cnt; void *owner; } __libc_lock_recursive_t;
+typedef struct
+{
+  int lock __LIBC_LOCK_ALIGNMENT;
+  int cnt;
+  void *owner;
+} __libc_lock_recursive_t;
 # endif
 #else
 typedef struct __libc_lock_recursive_opaque__ __libc_lock_recursive_t;
diff --git a/sysdeps/nptl/libc-lockP.h b/sysdeps/nptl/libc-lockP.h
index d3a6837fd..425f514c5 100644
--- a/sysdeps/nptl/libc-lockP.h
+++ b/sysdeps/nptl/libc-lockP.h
@@ -32,9 +32,10 @@
    ld.so might be used on old kernels with a different libc.so.  */
 #include <lowlevellock.h>
 #include <tls.h>
+#include <libc-lock-arch.h>
 
 /* Mutex type.  */
-typedef int __libc_lock_t;
+typedef int __libc_lock_t __LIBC_LOCK_ALIGNMENT;
 typedef struct { pthread_mutex_t mutex; } __rtld_lock_recursive_t;
 typedef pthread_rwlock_t __libc_rwlock_t;
 
diff --git a/sysdeps/nptl/pthreadP.h b/sysdeps/nptl/pthreadP.h
index 708bd9246..601db4ff2 100644
--- a/sysdeps/nptl/pthreadP.h
+++ b/sysdeps/nptl/pthreadP.h
@@ -275,7 +275,7 @@ __do_cancel (void)
   struct pthread *self = THREAD_SELF;
 
   /* Make sure we get no more cancellations.  */
-  THREAD_ATOMIC_BIT_SET (self, cancelhandling, EXITING_BIT);
+  atomic_bit_set (&self->cancelhandling, EXITING_BIT);
 
   __pthread_unwind ((__pthread_unwind_buf_t *)
 		    THREAD_GETMEM (self, cleanup_jmp_buf));
diff --git a/sysdeps/posix/fpathconf.c b/sysdeps/posix/fpathconf.c
index 216f2a9c8..4b215e060 100644
--- a/sysdeps/posix/fpathconf.c
+++ b/sysdeps/posix/fpathconf.c
@@ -131,9 +131,9 @@ __fpathconf (int fd, int name)
 #ifdef	_POSIX_ASYNC_IO
       {
 	/* AIO is only allowed on regular files and block devices.  */
-	struct stat64 st;
+	struct __stat64_t64 st;
 
-	if (__fstat64 (fd, &st) < 0
+	if (__fstat64_time64 (fd, &st) < 0
 	    || (! S_ISREG (st.st_mode) && ! S_ISBLK (st.st_mode)))
 	  return -1;
 	else
diff --git a/sysdeps/posix/isfdtype.c b/sysdeps/posix/isfdtype.c
index 192c7f9be..d26c14259 100644
--- a/sysdeps/posix/isfdtype.c
+++ b/sysdeps/posix/isfdtype.c
@@ -24,12 +24,12 @@
 int
 isfdtype (int fildes, int fdtype)
 {
-  struct stat64 st;
+  struct __stat64_t64 st;
   int result;
 
   {
     int save_error = errno;
-    result = __fstat64 (fildes, &st);
+    result = __fstat64_time64 (fildes, &st);
     __set_errno (save_error);
   }
 
diff --git a/sysdeps/posix/posix_fallocate.c b/sysdeps/posix/posix_fallocate.c
index 037d32864..9720e71cc 100644
--- a/sysdeps/posix/posix_fallocate.c
+++ b/sysdeps/posix/posix_fallocate.c
@@ -30,7 +30,7 @@
 int
 posix_fallocate (int fd, __off_t offset, __off_t len)
 {
-  struct stat64 st;
+  struct __stat64_t64 st;
 
   if (offset < 0 || len < 0)
     return EINVAL;
@@ -48,7 +48,7 @@ posix_fallocate (int fd, __off_t offset, __off_t len)
   }
 
   /* We have to make sure that this is really a regular file.  */
-  if (__fstat64 (fd, &st) != 0)
+  if (__fstat64_time64 (fd, &st) != 0)
     return EBADF;
   if (S_ISFIFO (st.st_mode))
     return ESPIPE;
diff --git a/sysdeps/posix/posix_fallocate64.c b/sysdeps/posix/posix_fallocate64.c
index a670ee0a3..bf984f7f9 100644
--- a/sysdeps/posix/posix_fallocate64.c
+++ b/sysdeps/posix/posix_fallocate64.c
@@ -30,7 +30,7 @@
 int
 __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len)
 {
-  struct stat64 st;
+  struct __stat64_t64 st;
 
   if (offset < 0 || len < 0)
     return EINVAL;
@@ -48,7 +48,7 @@ __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len)
   }
 
   /* We have to make sure that this is really a regular file.  */
-  if (__fstat64 (fd, &st) != 0)
+  if (__fstat64_time64 (fd, &st) != 0)
     return EBADF;
   if (S_ISFIFO (st.st_mode))
     return ESPIPE;
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index ae2316131..deb94671c 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -352,7 +352,7 @@ L(zero_padding_loop):
 	cmpldi	cr6,r5,16	/* Check if length was reached.  */
 	ble	cr6,L(zero_padding_end)
 
-	stxv	v18,0(r11)
+	stxv	32+v18,0(r11)
 	addi	r11,r11,16
 	addi	r5,r5,-16
 
@@ -360,7 +360,7 @@ L(zero_padding_loop):
 
 L(zero_padding_end):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
-	stxvl	v18,r11,r10	/* Partial store  */
+	stxvl	32+v18,r11,r10	/* Partial store  */
 	blr
 
 	.align	4
diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile
index c972bd819..3a505c5f9 100644
--- a/sysdeps/pthread/Makefile
+++ b/sysdeps/pthread/Makefile
@@ -69,6 +69,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \
 	 tst-cancel12 tst-cancel13 tst-cancel14 tst-cancel15 tst-cancel16 \
 	 tst-cancel18 tst-cancel19 tst-cancel20 tst-cancel21 \
 	 tst-cancel22 tst-cancel23 tst-cancel26 tst-cancel27 tst-cancel28 \
+	 tst-cancel29 \
 	 tst-cleanup0 tst-cleanup1 tst-cleanup2 tst-cleanup3 \
 	 tst-clock1 \
 	 tst-cond-except \
@@ -125,6 +126,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \
 	 tst-pthread-raise-blocked-self \
 	 tst-pthread_kill-exited \
 	 tst-pthread_kill-exiting \
+	 tst-cancel30 \
 	 # tests
 
 tests-time64 := \
@@ -153,16 +155,36 @@ tests += tst-cancelx2 tst-cancelx3 tst-cancelx6 tst-cancelx8 tst-cancelx9 \
 	 tst-cleanupx0 tst-cleanupx1 tst-cleanupx2 tst-cleanupx3
 
 ifeq ($(build-shared),yes)
-tests += tst-atfork2 tst-pt-tls4 tst-_res1 tst-fini1 tst-create1
+tests += \
+  tst-atfork2 \
+  tst-pt-tls4 \
+  tst-_res1 \
+  tst-fini1 \
+  tst-create1 \
+  tst-atfork3 \
+  tst-atfork4 \
+# tests
+
 tests-nolibpthread += tst-fini1
 endif
 
-modules-names += tst-atfork2mod tst-tls4moda tst-tls4modb \
-		 tst-_res1mod1 tst-_res1mod2 tst-fini1mod \
-		 tst-create1mod
+modules-names += \
+  tst-atfork2mod \
+  tst-tls4moda \
+  tst-tls4modb \
+  tst-_res1mod1 \
+  tst-_res1mod2 \
+  tst-fini1mod \
+  tst-create1mod \
+  tst-atfork3mod \
+  tst-atfork4mod \
+# module-names
+
 test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
 
 tst-atfork2mod.so-no-z-defs = yes
+tst-atfork3mod.so-no-z-defs = yes
+tst-atfork4mod.so-no-z-defs = yes
 tst-create1mod.so-no-z-defs = yes
 
 ifeq ($(build-shared),yes)
@@ -225,8 +247,18 @@ tst-atfork2-ENV = MALLOC_TRACE=$(objpfx)tst-atfork2.mtrace \
 		  LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
 $(objpfx)tst-atfork2mod.so: $(shared-thread-library)
 
+$(objpfx)tst-atfork3: $(shared-thread-library)
+LDFLAGS-tst-atfork3 = -rdynamic
+$(objpfx)tst-atfork3mod.so: $(shared-thread-library)
+
+$(objpfx)tst-atfork4: $(shared-thread-library)
+LDFLAGS-tst-atfork4 = -rdynamic
+$(objpfx)tst-atfork4mod.so: $(shared-thread-library)
+
 ifeq ($(build-shared),yes)
 $(objpfx)tst-atfork2.out: $(objpfx)tst-atfork2mod.so
+$(objpfx)tst-atfork3.out: $(objpfx)tst-atfork3mod.so
+$(objpfx)tst-atfork4.out: $(objpfx)tst-atfork4mod.so
 endif
 
 ifeq ($(build-shared),yes)
diff --git a/sysdeps/pthread/tst-atfork3.c b/sysdeps/pthread/tst-atfork3.c
new file mode 100644
index 000000000..bb2250e43
--- /dev/null
+++ b/sysdeps/pthread/tst-atfork3.c
@@ -0,0 +1,118 @@
+/* Check if pthread_atfork handler can call dlclose (BZ#24595).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include <support/check.h>
+#include <support/xthread.h>
+#include <support/capture_subprocess.h>
+#include <support/xdlfcn.h>
+
+/* Check if pthread_atfork handlers do not deadlock when calling a function
+   that might alter the internal fork handle list, such as dlclose.
+
+   The test registers a callback set with pthread_atfork(), dlopen() a shared
+   library (nptl/tst-atfork3mod.c), calls an exported symbol from the library
+   (which in turn also registers atfork handlers), and calls fork to trigger
+   the callbacks.  */
+
+static void *handler;
+static bool run_dlclose_prepare;
+static bool run_dlclose_parent;
+static bool run_dlclose_child;
+
+static void
+prepare (void)
+{
+  if (run_dlclose_prepare)
+    xdlclose (handler);
+}
+
+static void
+parent (void)
+{
+  if (run_dlclose_parent)
+    xdlclose (handler);
+}
+
+static void
+child (void)
+{
+  if (run_dlclose_child)
+    xdlclose (handler);
+}
+
+static void
+proc_func (void *closure)
+{
+}
+
+static void
+do_test_generic (bool dlclose_prepare, bool dlclose_parent, bool dlclose_child)
+{
+  run_dlclose_prepare = dlclose_prepare;
+  run_dlclose_parent = dlclose_parent;
+  run_dlclose_child = dlclose_child;
+
+  handler = xdlopen ("tst-atfork3mod.so", RTLD_NOW);
+
+  int (*atfork3mod_func)(void);
+  atfork3mod_func = xdlsym (handler, "atfork3mod_func");
+
+  atfork3mod_func ();
+
+  struct support_capture_subprocess proc
+    = support_capture_subprocess (proc_func, NULL);
+  support_capture_subprocess_check (&proc, "tst-atfork3", 0, sc_allow_none);
+
+  handler = atfork3mod_func = NULL;
+
+  support_capture_subprocess_free (&proc);
+}
+
+static void *
+thread_func (void *closure)
+{
+  return NULL;
+}
+
+static int
+do_test (void)
+{
+  {
+    /* Make the process acts as multithread.  */
+    pthread_attr_t attr;
+    xpthread_attr_init (&attr);
+    xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+    xpthread_create (&attr, thread_func, NULL);
+  }
+
+  TEST_COMPARE (pthread_atfork (prepare, parent, child), 0);
+
+  do_test_generic (true  /* prepare */, false /* parent */, false /* child */);
+  do_test_generic (false /* prepare */, true  /* parent */, false /* child */);
+  do_test_generic (false /* prepare */, false /* parent */, true  /* child */);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/pthread/tst-atfork3mod.c b/sysdeps/pthread/tst-atfork3mod.c
new file mode 100644
index 000000000..6d0658cb9
--- /dev/null
+++ b/sysdeps/pthread/tst-atfork3mod.c
@@ -0,0 +1,44 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include <support/check.h>
+
+static void
+mod_prepare (void)
+{
+}
+
+static void
+mod_parent (void)
+{
+}
+
+static void
+mod_child (void)
+{
+}
+
+int atfork3mod_func (void)
+{
+  TEST_COMPARE (pthread_atfork (mod_prepare, mod_parent, mod_child), 0);
+
+  return 0;
+}
diff --git a/sysdeps/pthread/tst-atfork4.c b/sysdeps/pthread/tst-atfork4.c
new file mode 100644
index 000000000..52dc87e73
--- /dev/null
+++ b/sysdeps/pthread/tst-atfork4.c
@@ -0,0 +1,128 @@
+/* pthread_atfork supports handlers that call pthread_atfork or dlclose.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+#include <support/xthread.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <support/xunistd.h>
+#include <support/check.h>
+#include <stdlib.h>
+
+static void *
+thread_func (void *x)
+{
+  return NULL;
+}
+
+static unsigned int second_atfork_handler_runcount = 0;
+
+static void
+second_atfork_handler (void)
+{
+  second_atfork_handler_runcount++;
+}
+
+static void *h = NULL;
+
+static unsigned int atfork_handler_runcount = 0;
+
+static void
+prepare (void)
+{
+  /* These atfork handlers are registered while atfork handlers are being
+     executed and thus will not be executed during the corresponding
+     fork.  */
+  TEST_VERIFY_EXIT (pthread_atfork (second_atfork_handler,
+                                    second_atfork_handler,
+                                    second_atfork_handler) == 0);
+
+  /* This will de-register the atfork handlers registered by the dlopen'd
+     library and so they will not be executed.  */
+  if (h != NULL)
+    {
+      xdlclose (h);
+      h = NULL;
+    }
+
+  atfork_handler_runcount++;
+}
+
+static void
+after (void)
+{
+  atfork_handler_runcount++;
+}
+
+static int
+do_test (void)
+{
+  /* Make sure __libc_single_threaded is 0.  */
+  pthread_attr_t attr;
+  xpthread_attr_init (&attr);
+  xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+  xpthread_create (&attr, thread_func, NULL);
+
+  void (*reg_atfork_handlers) (void);
+
+  h = xdlopen ("tst-atfork4mod.so", RTLD_LAZY);
+
+  reg_atfork_handlers = xdlsym (h, "reg_atfork_handlers");
+
+  reg_atfork_handlers ();
+
+  /* We register our atfork handlers *after* loading the module so that our
+     prepare handler is called first at fork, where we then dlclose the
+     module before its prepare handler has a chance to be called.  */
+  TEST_VERIFY_EXIT (pthread_atfork (prepare, after, after) == 0);
+
+  pid_t pid = xfork ();
+
+  /* Both the parent and the child processes should observe this.  */
+  TEST_VERIFY_EXIT (atfork_handler_runcount == 2);
+  TEST_VERIFY_EXIT (second_atfork_handler_runcount == 0);
+
+  if (pid > 0)
+    {
+      int childstat;
+
+      xwaitpid (-1, &childstat, 0);
+      TEST_VERIFY_EXIT (WIFEXITED (childstat)
+                        && WEXITSTATUS (childstat) == 0);
+
+      /* This time, the second set of atfork handlers should also be called
+         since the handlers are already in place before fork is called.  */
+
+      pid = xfork ();
+
+      TEST_VERIFY_EXIT (atfork_handler_runcount == 4);
+      TEST_VERIFY_EXIT (second_atfork_handler_runcount == 2);
+
+      if (pid > 0)
+        {
+          xwaitpid (-1, &childstat, 0);
+          TEST_VERIFY_EXIT (WIFEXITED (childstat)
+                            && WEXITSTATUS (childstat) == 0);
+        }
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/pthread/tst-atfork4mod.c b/sysdeps/pthread/tst-atfork4mod.c
new file mode 100644
index 000000000..e111efeb1
--- /dev/null
+++ b/sysdeps/pthread/tst-atfork4mod.c
@@ -0,0 +1,48 @@
+/* pthread_atfork supports handlers that call pthread_atfork or dlclose.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdlib.h>
+
+/* This dynamically loaded library simply registers its atfork handlers when
+   asked to.  The atfork handlers should never be executed because the
+   library is unloaded before fork is called by the test program.  */
+
+static void
+prepare (void)
+{
+  abort ();
+}
+
+static void
+parent (void)
+{
+  abort ();
+}
+
+static void
+child (void)
+{
+  abort ();
+}
+
+void
+reg_atfork_handlers (void)
+{
+  pthread_atfork (prepare, parent, child);
+}
diff --git a/sysdeps/pthread/tst-cancel29.c b/sysdeps/pthread/tst-cancel29.c
new file mode 100644
index 000000000..4f0d99e00
--- /dev/null
+++ b/sysdeps/pthread/tst-cancel29.c
@@ -0,0 +1,207 @@
+/* Check if a thread that disables cancellation and which call functions
+   that might be interrupted by a signal do not see the internal SIGCANCEL.
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <poll.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <support/xthread.h>
+#include <sys/socket.h>
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+/* On Linux some interfaces are never restarted after being interrupted by
+   a signal handler, regardless of the use of SA_RESTART.  It means that
+   if asynchronous cancellation is not enabled, the pthread_cancel can not
+   set the internal SIGCANCEL otherwise the interface might see a spurious
+   EINTR failure.  */
+
+static pthread_barrier_t b;
+
+/* Cleanup handling test.  */
+static int cl_called;
+static void
+cl (void *arg)
+{
+  ++cl_called;
+}
+
+static void *
+tf_sigtimedwait (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  sigset_t mask;
+  sigemptyset (&mask);
+  r = sigtimedwait (&mask, NULL, &(struct timespec) { 0, 250000000 });
+  if (r != -1)
+    return (void*) -1;
+  if (errno != EAGAIN)
+    return (void*) -2;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+static void *
+tf_poll (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  r = poll (NULL, 0, 250);
+  if (r != 0)
+    return (void*) -1;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+static void *
+tf_ppoll (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  r = ppoll (NULL, 0, &(struct timespec) { 0, 250000000 }, NULL);
+  if (r != 0)
+    return (void*) -1;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+static void *
+tf_select (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  r = select (0, NULL, NULL, NULL, &(struct timeval) { 0, 250000 });
+  if (r != 0)
+    return (void*) -1;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+static void *
+tf_pselect (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  r = pselect (0, NULL, NULL, NULL, &(struct timespec) { 0, 250000000 }, NULL);
+  if (r != 0)
+    return (void*) -1;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+static void *
+tf_clock_nanosleep (void *arg)
+{
+  pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+  xpthread_barrier_wait (&b);
+
+  int r;
+  pthread_cleanup_push (cl, NULL);
+
+  r = clock_nanosleep (CLOCK_REALTIME, 0, &(struct timespec) { 0, 250000000 },
+		       NULL);
+  if (r != 0)
+    return (void*) -1;
+
+  pthread_cleanup_pop (0);
+  return NULL;
+}
+
+struct cancel_test_t
+{
+  const char *name;
+  void * (*cf) (void *);
+} tests[] =
+{
+  { "sigtimedwait",    tf_sigtimedwait,    },
+  { "poll",            tf_poll,            },
+  { "ppoll",           tf_ppoll,           },
+  { "select",          tf_select,          },
+  { "pselect",         tf_pselect  ,       },
+  { "clock_nanosleep", tf_clock_nanosleep, },
+};
+
+static int
+do_test (void)
+{
+  for (int i = 0; i < array_length (tests); i++)
+    {
+      xpthread_barrier_init (&b, NULL, 2);
+
+      cl_called = 0;
+
+      pthread_t th = xpthread_create (NULL, tests[i].cf, NULL);
+
+      xpthread_barrier_wait (&b);
+
+      struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 };
+      while (nanosleep (&ts, &ts) != 0)
+	continue;
+
+      xpthread_cancel (th);
+
+      void *status = xpthread_join (th);
+      if (status != NULL)
+	printf ("test '%s' failed: %" PRIdPTR "\n", tests[i].name,
+		(intptr_t) status);
+      TEST_VERIFY (status == NULL);
+
+      xpthread_barrier_destroy (&b);
+
+      TEST_COMPARE (cl_called, 0);
+
+      printf ("in-time cancel test of '%s' successful\n", tests[i].name);
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/pthread/tst-cancel30.c b/sysdeps/pthread/tst-cancel30.c
new file mode 100644
index 000000000..e08392f96
--- /dev/null
+++ b/sysdeps/pthread/tst-cancel30.c
@@ -0,0 +1,82 @@
+/* Check if printf like functions does not disable asynchronous cancellation
+   mode (BZ#29214).
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/check.h>
+#include <support/xstdio.h>
+#include <support/xthread.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+static pthread_barrier_t b;
+
+static void *
+tf (void *arg)
+{
+  int old;
+
+  TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL), 0);
+
+  TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0);
+  TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS);
+
+  /* Check if internal lock cleanup routines restore the cancellation type
+     correctly.  */
+  printf ("...\n");
+  TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0);
+  TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS);
+
+  xpthread_barrier_wait (&b);
+
+  /* Wait indefinitely for cancellation, which only works if asynchronous
+     cancellation is enabled.  */
+#ifdef SYS_pause
+  syscall (SYS_pause);
+#elif defined SYS_ppoll || defined SYS_ppoll_time64
+# ifndef SYS_ppoll_time64
+#  define SYS_ppoll_time64 SYS_ppoll
+# endif
+  syscall (SYS_ppoll_time64, NULL, 0, NULL, NULL);
+#else
+  for (;;);
+#endif
+
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  xpthread_barrier_init (&b, NULL, 2);
+
+  pthread_t th = xpthread_create (NULL, tf, NULL);
+
+  xpthread_barrier_wait (&b);
+
+  xpthread_cancel (th);
+
+  void *status = xpthread_join (th);
+  TEST_VERIFY (status == PTHREAD_CANCELED);
+
+  return 0;
+}
+
+/* There is no need to wait full TIMEOUT if asynchronous is not working.  */
+#define TIMEOUT 3
+#include <support/test-driver.c>
diff --git a/sysdeps/riscv/rv64/rvd/libm-test-ulps b/sysdeps/riscv/rv64/rvd/libm-test-ulps
index e28b21169..308568082 100644
--- a/sysdeps/riscv/rv64/rvd/libm-test-ulps
+++ b/sysdeps/riscv/rv64/rvd/libm-test-ulps
@@ -1077,7 +1077,7 @@ ldouble: 9
 
 Function: "j0_upward":
 double: 9
-float: 8
+float: 9
 ldouble: 7
 
 Function: "j1":
diff --git a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c
index 2cdb3c8b5..f142221a1 100644
--- a/sysdeps/s390/dl-procinfo.c
+++ b/sysdeps/s390/dl-procinfo.c
@@ -63,11 +63,12 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[23][9]
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_s390_platforms
 #else
-PROCINFO_CLASS const char _dl_s390_platforms[10][7]
+PROCINFO_CLASS const char _dl_s390_platforms[11][7]
 #endif
 #ifndef PROCINFO_DECL
 = {
-    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15"
+    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15",
+    "z16"
   }
 #endif
 #if !defined SHARED || defined PROCINFO_DECL
diff --git a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h
index 03d7e9437..1f4e3875b 100644
--- a/sysdeps/s390/dl-procinfo.h
+++ b/sysdeps/s390/dl-procinfo.h
@@ -22,7 +22,7 @@
 
 #define _DL_HWCAP_COUNT 23
 
-#define _DL_PLATFORMS_COUNT	10
+#define _DL_PLATFORMS_COUNT	11
 
 /* The kernel provides up to 32 capability bits with elf_hwcap.  */
 #define _DL_FIRST_PLATFORM	32
diff --git a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile
index e5da26871..66ed844e6 100644
--- a/sysdeps/s390/s390-64/Makefile
+++ b/sysdeps/s390/s390-64/Makefile
@@ -7,8 +7,11 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused
 CFLAGS-dl-load.c += -Wno-unused
 CFLAGS-dl-reloc.c += -Wno-unused
 
-$(objpfx)tst-glibc-hwcaps: $(objpfx)libmarkermod2-1.so \
-  $(objpfx)libmarkermod3-1.so $(objpfx)libmarkermod4-1.so
+$(objpfx)tst-glibc-hwcaps: \
+  $(objpfx)libmarkermod2-1.so \
+  $(objpfx)libmarkermod3-1.so \
+  $(objpfx)libmarkermod4-1.so \
+  $(objpfx)libmarkermod5-1.so
 $(objpfx)tst-glibc-hwcaps.out: \
   $(objpfx)libmarkermod2.so \
     $(objpfx)glibc-hwcaps/z13/libmarkermod2.so \
@@ -19,6 +22,11 @@ $(objpfx)tst-glibc-hwcaps.out: \
     $(objpfx)glibc-hwcaps/z13/libmarkermod4.so \
     $(objpfx)glibc-hwcaps/z14/libmarkermod4.so \
     $(objpfx)glibc-hwcaps/z15/libmarkermod4.so \
+  $(objpfx)libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z13/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z14/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z15/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z16/libmarkermod5.so
 
 $(objpfx)glibc-hwcaps/z13/libmarkermod2.so: $(objpfx)libmarkermod2-2.so
 	$(make-target-directory)
@@ -38,6 +46,19 @@ $(objpfx)glibc-hwcaps/z14/libmarkermod4.so: $(objpfx)libmarkermod4-3.so
 $(objpfx)glibc-hwcaps/z15/libmarkermod4.so: $(objpfx)libmarkermod4-4.so
 	$(make-target-directory)
 	cp $< $@
+$(objpfx)glibc-hwcaps/z13/libmarkermod5.so: $(objpfx)libmarkermod5-2.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z14/libmarkermod5.so: $(objpfx)libmarkermod5-3.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z15/libmarkermod5.so: $(objpfx)libmarkermod5-4.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z16/libmarkermod5.so: $(objpfx)libmarkermod5-5.so
+	$(make-target-directory)
+	cp $< $@
+
 
 ifeq (no,$(build-hardcoded-path-in-tests))
 # This is an ld.so.cache test, and RPATH/RUNPATH in the executable
diff --git a/sysdeps/s390/s390-64/configure b/sysdeps/s390/s390-64/configure
new file mode 100644
index 000000000..101c570d2
--- /dev/null
+++ b/sysdeps/s390/s390-64/configure
@@ -0,0 +1,122 @@
+# This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
+ # Local configure fragment for sysdeps/s390/s390-64.
+
+# Minimal checking for static PIE support in ld.
+# Compare to ld testcase/bugzilla:
+# <binutils-source>/ld/testsuite/ld-elf/pr22263-1.rd
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for s390-specific static PIE requirements" >&5
+$as_echo_n "checking for s390-specific static PIE requirements... " >&6; }
+if { as_var=\
+libc_cv_s390x_staticpie_req; eval \${$as_var+:} false; }; then :
+  $as_echo_n "(cached) " >&6
+else
+    cat > conftest1.c <<EOF
+__thread int * foo;
+
+void
+bar (void)
+{
+  *foo = 1;
+}
+EOF
+  cat > conftest2.c <<EOF
+extern __thread int *foo;
+extern void bar (void);
+static int x;
+
+int
+main ()
+{
+  foo = &x;
+  return 0;
+}
+EOF
+  libc_cv_s390x_staticpie_req=no
+  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest1.c -o conftest1.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } \
+     && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } \
+     && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } \
+     && { ac_try='! readelf -Wr conftest | grep R_390_TLS_TPOFF'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+  then
+    libc_cv_s390x_staticpie_req=yes
+  fi
+  rm -rf conftest.*
+fi
+eval ac_res=\$\
+libc_cv_s390x_staticpie_req
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+if test $libc_cv_s390x_staticpie_req = yes; then
+   # Static PIE is supported only on 64bit.
+   # Ensure you also have those patches for:
+   # - binutils (ld)
+   #   - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE"
+   #     https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9
+   #     (Tested by configure check above)
+   #     Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to
+   #     be processed in _dl_relocate_static_pie() as static TLS map is not setup.
+   #   - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie"
+   #     https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e
+   #     (We can't test it in configure as we are not able to link a static PIE
+   #     executable if the system glibc lacks static PIE support)
+   #     Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries
+   #     and the IFUNC symbols are not processed, which leads to crashes.
+   #
+   # - kernel (the mentioned links to the commits belong to 5.19 merge window):
+   #   - "s390/mmap: increase stack/mmap gap to 128MB"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2
+   #   - "s390/vdso: move vdso mapping to its own function"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8
+   #   - "s390/vdso: map vdso above stack"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033
+   #   - "s390/vdso: add vdso randomization"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25
+   #   (We can't test the kernel of the target system)
+   #   Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0),
+   #   static PIE executables like ldconfig will crash.  While startup sbrk is
+   #   used to enlarge the HEAP.  Unfortunately the underlying brk syscall fails
+   #   as there is not enough space after the HEAP.  Then the address of the TLS
+   #   image is invalid and the following memcpy in __libc_setup_tls() leads
+   #   to a segfault.
+   #   If /proc/sys/kernel/randomize_va_space is activated (default: 2), there
+   #   is enough space after HEAP.
+   #
+   # - glibc
+   #   - "Linux: Define MMAP_CALL_INTERNAL"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb
+   #   - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402
+   #   - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff
+   #   - "ia64: Always define IA64_USE_NEW_STUB as a flag macro"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64
+   #   - "Linux: Implement a useful version of _startup_fatal"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31
+   #   - "Linux: Introduce __brk_call for invoking the brk system call"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa
+   #   - "csu: Implement and use _dl_early_allocate during static startup"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7
+   #   The mentioned patch series by Florian Weimer avoids the mentioned failing
+   #   sbrk syscall by falling back to mmap.
+   $as_echo "#define SUPPORT_STATIC_PIE 1" >>confdefs.h
+
+fi
diff --git a/sysdeps/s390/s390-64/configure.ac b/sysdeps/s390/s390-64/configure.ac
new file mode 100644
index 000000000..2583a4a33
--- /dev/null
+++ b/sysdeps/s390/s390-64/configure.ac
@@ -0,0 +1,92 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/s390/s390-64.
+
+# Minimal checking for static PIE support in ld.
+# Compare to ld testcase/bugzilla:
+# <binutils-source>/ld/testsuite/ld-elf/pr22263-1.rd
+AC_CACHE_CHECK([for s390-specific static PIE requirements], \
+[libc_cv_s390x_staticpie_req], [dnl
+  cat > conftest1.c <<EOF
+__thread int * foo;
+
+void
+bar (void)
+{
+  *foo = 1;
+}
+EOF
+  cat > conftest2.c <<EOF
+extern __thread int *foo;
+extern void bar (void);
+static int x;
+
+int
+main ()
+{
+  foo = &x;
+  return 0;
+}
+EOF
+  libc_cv_s390x_staticpie_req=no
+  if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest1.c -o conftest1.o]) \
+     && AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o]) \
+     && AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o]) \
+     && AC_TRY_COMMAND([! readelf -Wr conftest | grep R_390_TLS_TPOFF])
+  then
+    libc_cv_s390x_staticpie_req=yes
+  fi
+  rm -rf conftest.*])
+if test $libc_cv_s390x_staticpie_req = yes; then
+   # Static PIE is supported only on 64bit.
+   # Ensure you also have those patches for:
+   # - binutils (ld)
+   #   - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE"
+   #     https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9
+   #     (Tested by configure check above)
+   #     Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to
+   #     be processed in _dl_relocate_static_pie() as static TLS map is not setup.
+   #   - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie"
+   #     https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e
+   #     (We can't test it in configure as we are not able to link a static PIE
+   #     executable if the system glibc lacks static PIE support)
+   #     Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries
+   #     and the IFUNC symbols are not processed, which leads to crashes.
+   #
+   # - kernel (the mentioned links to the commits belong to 5.19 merge window):
+   #   - "s390/mmap: increase stack/mmap gap to 128MB"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2
+   #   - "s390/vdso: move vdso mapping to its own function"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8
+   #   - "s390/vdso: map vdso above stack"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033
+   #   - "s390/vdso: add vdso randomization"
+   #     https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25
+   #   (We can't test the kernel of the target system)
+   #   Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0),
+   #   static PIE executables like ldconfig will crash.  While startup sbrk is
+   #   used to enlarge the HEAP.  Unfortunately the underlying brk syscall fails
+   #   as there is not enough space after the HEAP.  Then the address of the TLS
+   #   image is invalid and the following memcpy in __libc_setup_tls() leads
+   #   to a segfault.
+   #   If /proc/sys/kernel/randomize_va_space is activated (default: 2), there
+   #   is enough space after HEAP.
+   #
+   # - glibc
+   #   - "Linux: Define MMAP_CALL_INTERNAL"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb
+   #   - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402
+   #   - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff
+   #   - "ia64: Always define IA64_USE_NEW_STUB as a flag macro"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64
+   #   - "Linux: Implement a useful version of _startup_fatal"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31
+   #   - "Linux: Introduce __brk_call for invoking the brk system call"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa
+   #   - "csu: Implement and use _dl_early_allocate during static startup"
+   #     https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7
+   #   The mentioned patch series by Florian Weimer avoids the mentioned failing
+   #   sbrk syscall by falling back to mmap.
+   AC_DEFINE(SUPPORT_STATIC_PIE)
+fi
diff --git a/sysdeps/s390/s390-64/dl-hwcap-check.h b/sysdeps/s390/s390-64/dl-hwcap-check.h
index f76993232..efe204a3f 100644
--- a/sysdeps/s390/s390-64/dl-hwcap-check.h
+++ b/sysdeps/s390/s390-64/dl-hwcap-check.h
@@ -26,7 +26,11 @@ static inline void
 dl_hwcap_check (void)
 {
 #if defined __ARCH__
-# if GCCMACRO__ARCH__ >= 13
+# if GCCMACRO__ARCH__ >= 14
+  if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_PDE2))
+    _dl_fatal_printf ("\
+Fatal glibc error: CPU lacks VXRS_PDE2 support (z16 or later required)\n");
+# elif GCCMACRO__ARCH__ >= 13
   if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_EXT2))
     _dl_fatal_printf ("\
 Fatal glibc error: CPU lacks VXRS_EXT2 support (z15 or later required)\n");
diff --git a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
index 9447a6cf4..39f494815 100644
--- a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
+++ b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
@@ -19,8 +19,8 @@
 #include <dl-hwcaps.h>
 #include <ldsodefs.h>
 
-const char _dl_hwcaps_subdirs[] = "z15:z14:z13";
-enum { subdirs_count = 3 }; /* Number of components in _dl_hwcaps_subdirs.  */
+const char _dl_hwcaps_subdirs[] = "z16:z15:z14:z13";
+enum { subdirs_count = 4 }; /* Number of components in _dl_hwcaps_subdirs.  */
 
 uint32_t
 _dl_hwcaps_subdirs_active (void)
@@ -50,5 +50,12 @@ _dl_hwcaps_subdirs_active (void)
     return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
   ++active;
 
+  /* z16.
+   Note: We do not list HWCAP_S390_NNPA here as, according to the Principles of
+   Operation, those instructions may be replaced or removed in future.  */
+  if (!(GLRO (dl_hwcap) & HWCAP_S390_VXRS_PDE2))
+    return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
+  ++active;
+
   return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
 }
diff --git a/sysdeps/s390/s390-64/start.S b/sysdeps/s390/s390-64/start.S
index 33c130204..f4c3f5131 100644
--- a/sysdeps/s390/s390-64/start.S
+++ b/sysdeps/s390/s390-64/start.S
@@ -84,10 +84,25 @@ _start:
 
 	/* Ok, now branch to the libc main routine.  */
 #ifdef PIC
+# ifdef SHARED
+	/* Used for dynamic linked position independent executable.
+	   => Scrt1.o  */
 	larl	%r2,main@GOTENT		# load pointer to main
 	lg	%r2,0(%r2)
+# else
+	/* Used for dynamic linked position dependent executable.
+	   => crt1.o (glibc configured without --disable-default-pie:
+	   PIC is defined)
+	   Or for static linked position independent executable.
+	   => rcrt1.o (only available if glibc configured without
+	   --disable-default-pie: PIC is defined) */
+	larl	%r2,__wrap_main
+# endif
 	brasl	%r14,__libc_start_main@plt
 #else
+	/* Used for dynamic/static linked position dependent executable.
+	   => crt1.o (glibc configured with --disable-default-pie:
+	   PIC and SHARED are not defined)  */
 	larl	%r2,main		# load pointer to main
 	brasl	%r14,__libc_start_main
 #endif
@@ -97,6 +112,19 @@ _start:
 
 	cfi_endproc
 
+#if defined PIC && !defined SHARED
+	/* When main is not defined in the executable but in a shared library
+	   then a wrapper is needed in crt1.o of the static-pie enabled libc,
+	   because crt1.o and rcrt1.o share code and the later must avoid the
+	   use of GOT relocations before __libc_start_main is called.  */
+__wrap_main:
+	cfi_startproc
+	larl	%r1,main@GOTENT		# load pointer to main
+	lg	%r1,0(%r1)
+	br	%r1
+	cfi_endproc
+#endif
+
 	/* Define a symbol for the first piece of initialized data.  */
 	.data
 	.globl __data_start
diff --git a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
index cf3b765b5..a29891bdc 100644
--- a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
+++ b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
@@ -25,6 +25,7 @@
 extern int marker2 (void);
 extern int marker3 (void);
 extern int marker4 (void);
+extern int marker5 (void);
 
 /* Return the arch level, 10 for the baseline libmarkermod*.so's.  */
 static int
@@ -63,9 +64,11 @@ compute_level (void)
     return 12;
   if (strcmp (platform, "z15") == 0)
     return 13;
+  if (strcmp (platform, "z16") == 0)
+    return 14;
   printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform);
-  /* Assume that the new platform supports z15.  */
-  return 13;
+  /* Assume that the new platform supports z16.  */
+  return 14;
 }
 
 static int
@@ -76,6 +79,7 @@ do_test (void)
   TEST_COMPARE (marker2 (), MIN (level - 9, 2));
   TEST_COMPARE (marker3 (), MIN (level - 9, 3));
   TEST_COMPARE (marker4 (), MIN (level - 9, 4));
+  TEST_COMPARE (marker5 (), MIN (level - 9, 5));
   return 0;
 }
 
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 7122f5597..e897f55f3 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -126,6 +126,7 @@ tests += tst-clone tst-clone2 tst-clone3 tst-fanotify tst-personality \
   tst-prctl \
   tst-scm_rights \
   tst-epoll \
+  tst-getauxval \
   # tests
 
 # Test for the symbol version of fcntl that was replaced in glibc 2.28.
diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
index 9905ebedf..4fcb6da80 100644
--- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
@@ -236,6 +236,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
index 03d57b9af..a1cf59270 100644
--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
+++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h
@@ -75,3 +75,5 @@
 #define HWCAP2_BTI		(1 << 17)
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
+#define HWCAP2_AFP		(1 << 20)
+#define HWCAP2_RPRES		(1 << 21)
diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
index ee8085be6..0cf74c1a9 100644
--- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
@@ -391,6 +391,7 @@
 #define __NR_sendmsg 114
 #define __NR_sendto 133
 #define __NR_set_mempolicy 431
+#define __NR_set_mempolicy_home_node 560
 #define __NR_set_robust_list 466
 #define __NR_set_tid_address 411
 #define __NR_setdomainname 166
diff --git a/sysdeps/unix/sysv/linux/alpha/brk_call.h b/sysdeps/unix/sysv/linux/alpha/brk_call.h
new file mode 100644
index 000000000..0b851b6c8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/alpha/brk_call.h
@@ -0,0 +1,27 @@
+/* Invoke the brk system call.  Alpha version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+static inline void *
+__brk_call (void *addr)
+{
+  unsigned long int result = INTERNAL_SYSCALL_CALL (brk, addr);
+  if (result == -ENOMEM)
+    /* Mimic the generic error reporting behavior.  */
+    result = INTERNAL_SYSCALL_CALL (brk, 0);
+  return (void *) result;
+}
diff --git a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h
index 81d90da09..fcec74323 100644
--- a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h
+++ b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h
@@ -20,16 +20,8 @@
 
 extern long __libc_alpha_cache_shape[4];
 
-#define DL_PLATFORM_AUXV				\
-      case AT_L1I_CACHESHAPE:				\
-	__libc_alpha_cache_shape[0] = av->a_un.a_val;	\
-	break;						\
-      case AT_L1D_CACHESHAPE:				\
-	__libc_alpha_cache_shape[1] = av->a_un.a_val;	\
-	break;						\
-      case AT_L2_CACHESHAPE:				\
-	__libc_alpha_cache_shape[2] = av->a_un.a_val;	\
-	break;						\
-      case AT_L3_CACHESHAPE:				\
-	__libc_alpha_cache_shape[3] = av->a_un.a_val;	\
-	break;
+#define DL_PLATFORM_AUXV					\
+  __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \
+  __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \
+  __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE];	\
+  __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE];
diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
index 1b626d977..c1207aaa1 100644
--- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
@@ -238,6 +238,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
index 96ef8db93..e7ba04c10 100644
--- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
@@ -302,6 +302,7 @@
 #define __NR_sendmsg 296
 #define __NR_sendto 290
 #define __NR_set_mempolicy 321
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 338
 #define __NR_set_tid_address 256
 #define __NR_set_tls 983045
diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h
index 79da9e759..1dd7356a1 100644
--- a/sysdeps/unix/sysv/linux/bits/socket.h
+++ b/sysdeps/unix/sysv/linux/bits/socket.h
@@ -169,6 +169,8 @@ typedef __socklen_t socklen_t;
 #define SOL_KCM		281
 #define SOL_TLS		282
 #define SOL_XDP		283
+#define SOL_MPTCP	284
+#define SOL_MCTP	285
 
 /* Maximum queue length specifiable by listen.  */
 #define SOMAXCONN	4096
@@ -304,6 +306,12 @@ struct cmsghdr
 			 + CMSG_ALIGN (sizeof (struct cmsghdr)))
 #define CMSG_LEN(len)   (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
 
+/* Given a length, return the additional padding necessary such that
+   len + __CMSG_PADDING(len) == CMSG_ALIGN (len).  */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+                              - ((len) & (sizeof (size_t) - 1))) \
+                             & (sizeof (size_t) - 1))
+
 extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 				      struct cmsghdr *__cmsg) __THROW;
 #ifdef __USE_EXTERN_INLINES
@@ -313,18 +321,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
 _EXTERN_INLINE struct cmsghdr *
 __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
 {
+  /* We may safely assume that __cmsg lies between __mhdr->msg_control and
+     __mhdr->msg_controllen because the user is required to obtain the first
+     cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+     via CMSG_NXTHDR, setting lengths along the way.  However, we don't yet
+     trust the value of __cmsg->cmsg_len and therefore do not use it in any
+     pointer arithmetic until we check its value.  */
+
+  unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+  unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+  size_t __size_needed = sizeof (struct cmsghdr)
+                         + __CMSG_PADDING (__cmsg->cmsg_len);
+
+  /* The current header is malformed, too small to be a full header.  */
   if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
-    /* The kernel header does this so there may be a reason.  */
     return (struct cmsghdr *) 0;
 
+  /* There isn't enough space between __cmsg and the end of the buffer to
+  hold the current cmsg *and* the next one.  */
+  if (((size_t)
+         (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+       < __size_needed)
+      || ((size_t)
+            (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+             - __size_needed)
+          < __cmsg->cmsg_len))
+
+    return (struct cmsghdr *) 0;
+
+  /* Now, we trust cmsg_len and can use it to find the next header.  */
   __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
 			       + CMSG_ALIGN (__cmsg->cmsg_len));
-  if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
-					+ __mhdr->msg_controllen)
-      || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
-	  > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
-    /* No more entries.  */
-    return (struct cmsghdr *) 0;
   return __cmsg;
 }
 #endif	/* Use `extern inline'.  */
diff --git a/sysdeps/unix/sysv/linux/brk.c b/sysdeps/unix/sysv/linux/brk.c
index abbabc9e8..9264a5a4a 100644
--- a/sysdeps/unix/sysv/linux/brk.c
+++ b/sysdeps/unix/sysv/linux/brk.c
@@ -19,6 +19,7 @@
 #include <errno.h>
 #include <unistd.h>
 #include <sysdep.h>
+#include <brk_call.h>
 
 /* This must be initialized data because commons can't have aliases.  */
 void *__curbrk = 0;
@@ -33,7 +34,7 @@ weak_alias (__curbrk, ___brk_addr)
 int
 __brk (void *addr)
 {
-  __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr);
+  __curbrk = __brk_call (addr);
   if (__curbrk < addr)
     {
       __set_errno (ENOMEM);
diff --git a/sysdeps/unix/sysv/linux/brk_call.h b/sysdeps/unix/sysv/linux/brk_call.h
new file mode 100644
index 000000000..72370c25d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/brk_call.h
@@ -0,0 +1,25 @@
+/* Invoke the brk system call.  Generic Linux version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+static inline void *
+__brk_call (void *addr)
+{
+  /* The default implementation reports errors through an unchanged
+     break.  */
+  return (void *) INTERNAL_SYSCALL_CALL (brk, addr);
+}
diff --git a/sysdeps/unix/sysv/linux/cmsg_nxthdr.c b/sysdeps/unix/sysv/linux/cmsg_nxthdr.c
index 15b7a3a92..24f72b797 100644
--- a/sysdeps/unix/sysv/linux/cmsg_nxthdr.c
+++ b/sysdeps/unix/sysv/linux/cmsg_nxthdr.c
@@ -23,18 +23,38 @@
 struct cmsghdr *
 __cmsg_nxthdr (struct msghdr *mhdr, struct cmsghdr *cmsg)
 {
+  /* We may safely assume that cmsg lies between mhdr->msg_control and
+     mhdr->msg_controllen because the user is required to obtain the first
+     cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+     via CMSG_NXTHDR, setting lengths along the way.  However, we don't yet
+     trust the value of cmsg->cmsg_len and therefore do not use it in any
+     pointer arithmetic until we check its value.  */
+
+  unsigned char * msg_control_ptr = (unsigned char *) mhdr->msg_control;
+  unsigned char * cmsg_ptr = (unsigned char *) cmsg;
+
+  size_t size_needed = sizeof (struct cmsghdr)
+                       + __CMSG_PADDING (cmsg->cmsg_len);
+
+  /* The current header is malformed, too small to be a full header.  */
   if ((size_t) cmsg->cmsg_len < sizeof (struct cmsghdr))
-    /* The kernel header does this so there may be a reason.  */
-    return NULL;
+    return (struct cmsghdr *) 0;
+
+  /* There isn't enough space between cmsg and the end of the buffer to
+  hold the current cmsg *and* the next one.  */
+  if (((size_t)
+         (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr)
+       < size_needed)
+      || ((size_t)
+            (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr
+             - size_needed)
+          < cmsg->cmsg_len))
+
+    return (struct cmsghdr *) 0;
 
+  /* Now, we trust cmsg_len and can use it to find the next header.  */
   cmsg = (struct cmsghdr *) ((unsigned char *) cmsg
 			     + CMSG_ALIGN (cmsg->cmsg_len));
-  if ((unsigned char *) (cmsg + 1) > ((unsigned char *) mhdr->msg_control
-				      + mhdr->msg_controllen)
-      || ((unsigned char *) cmsg + CMSG_ALIGN (cmsg->cmsg_len)
-	  > ((unsigned char *) mhdr->msg_control + mhdr->msg_controllen)))
-    /* No more entries.  */
-    return NULL;
   return cmsg;
 }
 libc_hidden_def (__cmsg_nxthdr)
diff --git a/sysdeps/unix/sysv/linux/convert_scm_timestamps.c b/sysdeps/unix/sysv/linux/convert_scm_timestamps.c
index 82171bf32..dfc8c2bef 100644
--- a/sysdeps/unix/sysv/linux/convert_scm_timestamps.c
+++ b/sysdeps/unix/sysv/linux/convert_scm_timestamps.c
@@ -16,9 +16,9 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <kernel-features.h>
+#include <bits/timesize.h>
 
-#ifndef __ASSUME_TIME64_SYSCALLS
+#if __TIMESIZE != 64
 # include <stdint.h>
 # include <string.h>
 # include <sys/socket.h>
diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
index 96910154e..dc9383758 100644
--- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
@@ -250,6 +250,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_thread_area 244
 #define __NR_set_tid_address 96
diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c
new file mode 100644
index 000000000..52c538e85
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c
@@ -0,0 +1,82 @@
+/* Early memory allocation for the dynamic loader.  Generic version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Mark symbols hidden in static PIE for early self relocation to work.  */
+#if BUILD_PIE_DEFAULT
+# pragma GCC visibility push(hidden)
+#endif
+#include <startup.h>
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <string.h>
+#include <sysdep.h>
+#include <unistd.h>
+
+#include <brk_call.h>
+#include <mmap_call.h>
+
+/* Defined in brk.c.  */
+extern void *__curbrk;
+
+void *
+_dl_early_allocate (size_t size)
+{
+  void *result;
+
+  if (__curbrk != NULL)
+    /* If the break has been initialized, brk must have run before,
+       so just call it once more.  */
+    {
+      result = __sbrk (size);
+      if (result == (void *) -1)
+        result = NULL;
+    }
+  else
+    {
+      /* If brk has not been invoked, there is no need to update
+         __curbrk.  The first call to brk will take care of that.  */
+      void *previous = __brk_call (0);
+      result = __brk_call (previous + size);
+      if (result == previous)
+        result = NULL;
+      else
+        result = previous;
+    }
+
+  /* If brk fails, fall back to mmap.  This can happen due to
+     unfortunate ASLR layout decisions and kernel bugs, particularly
+     for static PIE.  */
+  if (result == NULL)
+    {
+      long int ret;
+      int prot = PROT_READ | PROT_WRITE;
+      int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#ifdef __NR_mmap2
+      ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
+#else
+      ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
+#endif
+      if (INTERNAL_SYSCALL_ERROR_P (ret))
+        result = NULL;
+      else
+        result = (void *) ret;
+    }
+
+  return result;
+}
diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
new file mode 100644
index 000000000..bf9374371
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h
@@ -0,0 +1,61 @@
+/* Parse the Linux auxiliary vector.
+   Copyright (C) 1995-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <elf.h>
+#include <entry.h>
+#include <fpu_control.h>
+#include <ldsodefs.h>
+#include <link.h>
+
+typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1];
+
+/* Copy the auxiliary vector into AUXV_VALUES and set up GLRO
+   variables.  */
+static inline
+void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values)
+{
+  auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT;
+  auxv_values[AT_PAGESZ] = EXEC_PAGESIZE;
+  auxv_values[AT_FPUCW] = _FPU_DEFAULT;
+
+  /* NB: Default to a constant CONSTANT_MINSIGSTKSZ.  */
+  _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ),
+                  "CONSTANT_MINSIGSTKSZ is constant");
+  auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ;
+
+  for (; av->a_type != AT_NULL; av++)
+    if (av->a_type <= AT_MINSIGSTKSZ)
+      auxv_values[av->a_type] = av->a_un.a_val;
+
+  GLRO(dl_pagesize) = auxv_values[AT_PAGESZ];
+  __libc_enable_secure = auxv_values[AT_SECURE];
+  GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM];
+  GLRO(dl_hwcap) = auxv_values[AT_HWCAP];
+  GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2];
+  GLRO(dl_clktck) = auxv_values[AT_CLKTCK];
+  GLRO(dl_fpu_control) = auxv_values[AT_FPUCW];
+  _dl_random = (void *) auxv_values[AT_RANDOM];
+  GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ];
+  GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR];
+#ifdef NEED_DL_SYSINFO
+  if (GLRO(dl_sysinfo_dso) != NULL)
+    GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO];
+#endif
+
+  DL_PLATFORM_AUXV
+}
diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c
index 66ba489cd..c90f109b1 100644
--- a/sysdeps/unix/sysv/linux/dl-sysdep.c
+++ b/sysdeps/unix/sysv/linux/dl-sysdep.c
@@ -16,35 +16,247 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* Linux needs some special initialization, but otherwise uses
-   the generic dynamic linker system interface code.  */
-
-#include <string.h>
+#include <_itoa.h>
+#include <assert.h>
+#include <dl-auxv.h>
+#include <dl-osinfo.h>
+#include <dl-parse_auxv.h>
+#include <dl-procinfo.h>
+#include <dl-tunables.h>
+#include <elf.h>
+#include <errno.h>
 #include <fcntl.h>
-#include <unistd.h>
-#include <sys/param.h>
-#include <sys/utsname.h>
 #include <ldsodefs.h>
+#include <libc-internal.h>
+#include <libintl.h>
 #include <not-cancel.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <tls.h>
+#include <unistd.h>
+
+#include <dl-machine.h>
+#include <dl-hwcap-check.h>
 
 #ifdef SHARED
-# define DL_SYSDEP_INIT frob_brk ()
+extern char **_environ attribute_hidden;
+extern char _end[] attribute_hidden;
+
+/* Protect SUID program against misuse of file descriptors.  */
+extern void __libc_check_standard_fds (void);
 
-static inline void
-frob_brk (void)
+int __libc_enable_secure attribute_relro = 0;
+rtld_hidden_data_def (__libc_enable_secure)
+/* This variable contains the lowest stack address ever used.  */
+void *__libc_stack_end attribute_relro = NULL;
+rtld_hidden_data_def(__libc_stack_end)
+void *_dl_random attribute_relro = NULL;
+
+#ifndef DL_STACK_END
+# define DL_STACK_END(cookie) ((void *) (cookie))
+#endif
+
+/* Arguments passed to dl_main.  */
+struct dl_main_arguments
 {
-  __brk (0);			/* Initialize the break.  */
+  const ElfW(Phdr) *phdr;
+  ElfW(Word) phnum;
+  ElfW(Addr) user_entry;
+};
+
+/* Separate function, so that dl_main can be called without the large
+   array on the stack.  */
+static void
+_dl_sysdep_parse_arguments (void **start_argptr,
+			    struct dl_main_arguments *args)
+{
+  _dl_argc = (intptr_t) *start_argptr;
+  _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation.  */
+  _environ = _dl_argv + _dl_argc + 1;
+  for (char **tmp = _environ; ; ++tmp)
+    if (*tmp == NULL)
+      {
+	/* Another necessary aliasing violation.  */
+	GLRO(dl_auxv) = (ElfW(auxv_t) *) (tmp + 1);
+	break;
+      }
+
+  dl_parse_auxv_t auxv_values = { 0, };
+  _dl_parse_auxv (GLRO(dl_auxv), auxv_values);
+
+  args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR];
+  args->phnum = auxv_values[AT_PHNUM];
+  args->user_entry = auxv_values[AT_ENTRY];
 }
 
-# include <elf/dl-sysdep.c>
+ElfW(Addr)
+_dl_sysdep_start (void **start_argptr,
+		  void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum,
+				   ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv))
+{
+  __libc_stack_end = DL_STACK_END (start_argptr);
+
+  struct dl_main_arguments dl_main_args;
+  _dl_sysdep_parse_arguments (start_argptr, &dl_main_args);
+
+  dl_hwcap_check ();
+
+  __tunables_init (_environ);
+
+  /* Initialize DSO sorting algorithm after tunables.  */
+  _dl_sort_maps_init ();
+
+  __brk (0);			/* Initialize the break.  */
+
+#ifdef DL_PLATFORM_INIT
+  DL_PLATFORM_INIT;
 #endif
 
+  /* Determine the length of the platform name.  */
+  if (GLRO(dl_platform) != NULL)
+    GLRO(dl_platformlen) = strlen (GLRO(dl_platform));
+
+  if (__sbrk (0) == _end)
+    /* The dynamic linker was run as a program, and so the initial break
+       starts just after our bss, at &_end.  The malloc in dl-minimal.c
+       will consume the rest of this page, so tell the kernel to move the
+       break up that far.  When the user program examines its break, it
+       will see this new value and not clobber our data.  */
+    __sbrk (GLRO(dl_pagesize)
+	    - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1)));
+
+  /* If this is a SUID program we make sure that FDs 0, 1, and 2 are
+     allocated.  If necessary we are doing it ourself.  If it is not
+     possible we stop the program.  */
+  if (__builtin_expect (__libc_enable_secure, 0))
+    __libc_check_standard_fds ();
+
+  (*dl_main) (dl_main_args.phdr, dl_main_args.phnum,
+              &dl_main_args.user_entry, GLRO(dl_auxv));
+  return dl_main_args.user_entry;
+}
+
+void
+_dl_sysdep_start_cleanup (void)
+{
+}
+
+void
+_dl_show_auxv (void)
+{
+  char buf[64];
+  ElfW(auxv_t) *av;
+
+  /* Terminate string.  */
+  buf[63] = '\0';
+
+  /* The following code assumes that the AT_* values are encoded
+     starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values
+     close by (otherwise the array will be too large).  In case we have
+     to support a platform where these requirements are not fulfilled
+     some alternative implementation has to be used.  */
+  for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av)
+    {
+      static const struct
+      {
+	const char label[22];
+	enum { unknown = 0, dec, hex, str, ignore } form : 8;
+      } auxvars[] =
+	{
+	  [AT_EXECFD - 2] =		{ "EXECFD:            ", dec },
+	  [AT_EXECFN - 2] =		{ "EXECFN:            ", str },
+	  [AT_PHDR - 2] =		{ "PHDR:              0x", hex },
+	  [AT_PHENT - 2] =		{ "PHENT:             ", dec },
+	  [AT_PHNUM - 2] =		{ "PHNUM:             ", dec },
+	  [AT_PAGESZ - 2] =		{ "PAGESZ:            ", dec },
+	  [AT_BASE - 2] =		{ "BASE:              0x", hex },
+	  [AT_FLAGS - 2] =		{ "FLAGS:             0x", hex },
+	  [AT_ENTRY - 2] =		{ "ENTRY:             0x", hex },
+	  [AT_NOTELF - 2] =		{ "NOTELF:            ", hex },
+	  [AT_UID - 2] =		{ "UID:               ", dec },
+	  [AT_EUID - 2] =		{ "EUID:              ", dec },
+	  [AT_GID - 2] =		{ "GID:               ", dec },
+	  [AT_EGID - 2] =		{ "EGID:              ", dec },
+	  [AT_PLATFORM - 2] =		{ "PLATFORM:          ", str },
+	  [AT_HWCAP - 2] =		{ "HWCAP:             ", hex },
+	  [AT_CLKTCK - 2] =		{ "CLKTCK:            ", dec },
+	  [AT_FPUCW - 2] =		{ "FPUCW:             ", hex },
+	  [AT_DCACHEBSIZE - 2] =	{ "DCACHEBSIZE:       0x", hex },
+	  [AT_ICACHEBSIZE - 2] =	{ "ICACHEBSIZE:       0x", hex },
+	  [AT_UCACHEBSIZE - 2] =	{ "UCACHEBSIZE:       0x", hex },
+	  [AT_IGNOREPPC - 2] =		{ "IGNOREPPC", ignore },
+	  [AT_SECURE - 2] =		{ "SECURE:            ", dec },
+	  [AT_BASE_PLATFORM - 2] =	{ "BASE_PLATFORM:     ", str },
+	  [AT_SYSINFO - 2] =		{ "SYSINFO:           0x", hex },
+	  [AT_SYSINFO_EHDR - 2] =	{ "SYSINFO_EHDR:      0x", hex },
+	  [AT_RANDOM - 2] =		{ "RANDOM:            0x", hex },
+	  [AT_HWCAP2 - 2] =		{ "HWCAP2:            0x", hex },
+	  [AT_MINSIGSTKSZ - 2] =	{ "MINSIGSTKSZ:       ", dec },
+	  [AT_L1I_CACHESIZE - 2] =	{ "L1I_CACHESIZE:     ", dec },
+	  [AT_L1I_CACHEGEOMETRY - 2] =	{ "L1I_CACHEGEOMETRY: 0x", hex },
+	  [AT_L1D_CACHESIZE - 2] =	{ "L1D_CACHESIZE:     ", dec },
+	  [AT_L1D_CACHEGEOMETRY - 2] =	{ "L1D_CACHEGEOMETRY: 0x", hex },
+	  [AT_L2_CACHESIZE - 2] =	{ "L2_CACHESIZE:      ", dec },
+	  [AT_L2_CACHEGEOMETRY - 2] =	{ "L2_CACHEGEOMETRY:  0x", hex },
+	  [AT_L3_CACHESIZE - 2] =	{ "L3_CACHESIZE:      ", dec },
+	  [AT_L3_CACHEGEOMETRY - 2] =	{ "L3_CACHEGEOMETRY:  0x", hex },
+	};
+      unsigned int idx = (unsigned int) (av->a_type - 2);
+
+      if ((unsigned int) av->a_type < 2u
+	  || (idx < sizeof (auxvars) / sizeof (auxvars[0])
+	      && auxvars[idx].form == ignore))
+	continue;
+
+      assert (AT_NULL == 0);
+      assert (AT_IGNORE == 1);
+
+      /* Some entries are handled in a special way per platform.  */
+      if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0)
+	continue;
+
+      if (idx < sizeof (auxvars) / sizeof (auxvars[0])
+	  && auxvars[idx].form != unknown)
+	{
+	  const char *val = (char *) av->a_un.a_val;
+
+	  if (__builtin_expect (auxvars[idx].form, dec) == dec)
+	    val = _itoa ((unsigned long int) av->a_un.a_val,
+			 buf + sizeof buf - 1, 10, 0);
+	  else if (__builtin_expect (auxvars[idx].form, hex) == hex)
+	    val = _itoa ((unsigned long int) av->a_un.a_val,
+			 buf + sizeof buf - 1, 16, 0);
+
+	  _dl_printf ("AT_%s%s\n", auxvars[idx].label, val);
+
+	  continue;
+	}
+
+      /* Unknown value: print a generic line.  */
+      char buf2[17];
+      buf2[sizeof (buf2) - 1] = '\0';
+      const char *val2 = _itoa ((unsigned long int) av->a_un.a_val,
+				buf2 + sizeof buf2 - 1, 16, 0);
+      const char *val =  _itoa ((unsigned long int) av->a_type,
+				buf + sizeof buf - 1, 16, 0);
+      _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2);
+    }
+}
+
+#endif /* SHARED */
+
 
 int
 attribute_hidden
 _dl_discover_osversion (void)
 {
-#if defined NEED_DL_SYSINFO_DSO && defined SHARED
+#ifdef SHARED
   if (GLRO(dl_sysinfo_map) != NULL)
     {
       /* If the kernel-supplied DSO contains a note indicating the kernel's
@@ -75,7 +287,7 @@ _dl_discover_osversion (void)
 	      }
 	  }
     }
-#endif
+#endif /* SHARED */
 
   char bufmem[64];
   char *buf = bufmem;
diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c
index 59ee4b6f8..1378bb2db 100644
--- a/sysdeps/unix/sysv/linux/faccessat.c
+++ b/sysdeps/unix/sysv/linux/faccessat.c
@@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag)
   if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
     return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
 
-  struct stat64 stats;
-  if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+  struct __stat64_t64 stats;
+  if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
     return -1;
 
   mode &= (X_OK | W_OK | R_OK);	/* Clear any bogus bits. */
diff --git a/sysdeps/unix/sysv/linux/fchmodat.c b/sysdeps/unix/sysv/linux/fchmodat.c
index 7aa073bf3..7aae02148 100644
--- a/sysdeps/unix/sysv/linux/fchmodat.c
+++ b/sysdeps/unix/sysv/linux/fchmodat.c
@@ -48,8 +48,8 @@ fchmodat (int fd, const char *file, mode_t mode, int flag)
 
       /* Use fstatat because fstat does not work on O_PATH descriptors
 	 before Linux 3.6.  */
-      struct stat64 st;
-      if (__fstatat64 (pathfd, "", &st, AT_EMPTY_PATH) != 0)
+      struct __stat64_t64 st;
+      if (__fstatat64_time64 (pathfd, "", &st, AT_EMPTY_PATH) != 0)
 	{
 	  __close_nocancel (pathfd);
 	  return -1;
diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
index 4798cc337..d1ea074f0 100644
--- a/sysdeps/unix/sysv/linux/getsysstats.c
+++ b/sysdeps/unix/sysv/linux/getsysstats.c
@@ -44,15 +44,14 @@ __get_nprocs_sched (void)
   int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, cpu_bits_size,
 				 cpu_bits);
   if (r > 0)
-    return CPU_COUNT_S (cpu_bits_size, (cpu_set_t*) cpu_bits);
+    return CPU_COUNT_S (r, (cpu_set_t*) cpu_bits);
   else if (r == -EINVAL)
     /* The input buffer is still not enough to store the number of cpus.  This
        is an arbitrary values assuming such systems should be rare and there
        is no offline cpus.  */
     return max_num_cpus;
-  /* Some other error.  2 is conservative (not a uniprocessor system, so
-     atomics are needed). */
-  return 2;
+  /* Some other error.  */
+  return 0;
 }
 
 static char *
@@ -108,22 +107,19 @@ next_line (int fd, char *const buffer, char **cp, char **re,
 }
 
 static int
-get_nproc_stat (char *buffer, size_t buffer_size)
+get_nproc_stat (void)
 {
+  enum { buffer_size = 1024 };
+  char buffer[buffer_size];
   char *buffer_end = buffer + buffer_size;
   char *cp = buffer_end;
   char *re = buffer_end;
-
-  /* Default to an SMP system in case we cannot obtain an accurate
-     number.  */
-  int result = 2;
+  int result = 0;
 
   const int flags = O_RDONLY | O_CLOEXEC;
   int fd = __open_nocancel ("/proc/stat", flags);
   if (fd != -1)
     {
-      result = 0;
-
       char *l;
       while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
 	/* The current format of /proc/stat has all the cpu* entries
@@ -139,8 +135,8 @@ get_nproc_stat (char *buffer, size_t buffer_size)
   return result;
 }
 
-int
-__get_nprocs (void)
+static int
+get_nprocs_cpu_online (void)
 {
   enum { buffer_size = 1024 };
   char buffer[buffer_size];
@@ -179,7 +175,8 @@ __get_nprocs (void)
 		  }
 	      }
 
-	    result += m - n + 1;
+	    if (m >= n)
+	      result += m - n + 1;
 
 	    l = endp;
 	    if (l < re && *l == ',')
@@ -188,28 +185,18 @@ __get_nprocs (void)
 	while (l < re && *l != '\n');
 
       __close_nocancel_nostatus (fd);
-
-      if (result > 0)
-	return result;
     }
 
-  return get_nproc_stat (buffer, buffer_size);
+  return result;
 }
-libc_hidden_def (__get_nprocs)
-weak_alias (__get_nprocs, get_nprocs)
-
 
-/* On some architectures it is possible to distinguish between configured
-   and active cpus.  */
-int
-__get_nprocs_conf (void)
+static int
+get_nprocs_cpu (void)
 {
-  /* Try to use the sysfs filesystem.  It has actual information about
-     online processors.  */
+  int count = 0;
   DIR *dir = __opendir ("/sys/devices/system/cpu");
   if (dir != NULL)
     {
-      int count = 0;
       struct dirent64 *d;
 
       while ((d = __readdir64 (dir)) != NULL)
@@ -224,12 +211,57 @@ __get_nprocs_conf (void)
 
       __closedir (dir);
 
-      return count;
     }
+  return count;
+}
 
-  enum { buffer_size = 1024 };
-  char buffer[buffer_size];
-  return get_nproc_stat (buffer, buffer_size);
+static int
+get_nprocs_fallback (void)
+{
+  int result;
+
+  /* Try /proc/stat first.  */
+  result = get_nproc_stat ();
+  if (result != 0)
+    return result;
+
+  /* Try sched_getaffinity.  */
+  result = __get_nprocs_sched ();
+  if (result != 0)
+    return result;
+
+  /* We failed to obtain an accurate number.  Be conservative: return
+     the smallest number meaning that this is not a uniprocessor system,
+     so atomics are needed.  */
+  return 2;
+}
+
+int
+__get_nprocs (void)
+{
+  /* Try /sys/devices/system/cpu/online first.  */
+  int result = get_nprocs_cpu_online ();
+  if (result != 0)
+    return result;
+
+  /* Fall back to /proc/stat and sched_getaffinity.  */
+  return get_nprocs_fallback ();
+}
+libc_hidden_def (__get_nprocs)
+weak_alias (__get_nprocs, get_nprocs)
+
+/* On some architectures it is possible to distinguish between configured
+   and active cpus.  */
+int
+__get_nprocs_conf (void)
+{
+  /* Try /sys/devices/system/cpu/ first.  */
+  int result = get_nprocs_cpu ();
+  if (result != 0)
+    return result;
+
+  /* Fall back to /proc/stat and sched_getaffinity.  */
+  return get_nprocs_fallback ();
 }
 libc_hidden_def (__get_nprocs_conf)
 weak_alias (__get_nprocs_conf, get_nprocs_conf)
diff --git a/sysdeps/unix/sysv/linux/glob64-time64.c b/sysdeps/unix/sysv/linux/glob64-time64.c
index 32b4929a1..1d485dc80 100644
--- a/sysdeps/unix/sysv/linux/glob64-time64.c
+++ b/sysdeps/unix/sysv/linux/glob64-time64.c
@@ -37,6 +37,7 @@
 # define GLOB_LSTAT      gl_lstat
 # define GLOB_STAT64     __stat64_time64
 # define GLOB_LSTAT64    __lstat64_time64
+# define GLOB_FSTATAT64    __fstatat64_time64
 
 # define COMPILE_GLOB64	1
 
diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
index 36675fd48..767f1287a 100644
--- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
@@ -289,6 +289,7 @@
 #define __NR_sendmsg 183
 #define __NR_sendto 82
 #define __NR_set_mempolicy 262
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 289
 #define __NR_set_tid_address 237
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/hppa/getcontext.S b/sysdeps/unix/sysv/linux/hppa/getcontext.S
index 1e73587f1..dcdf986f2 100644
--- a/sysdeps/unix/sysv/linux/hppa/getcontext.S
+++ b/sysdeps/unix/sysv/linux/hppa/getcontext.S
@@ -21,22 +21,28 @@
 #include "ucontext_i.h"
 
 
-	/* Trampoline function. Non-standard calling ABI.  */
+	/* Trampoline function.  Non-standard calling ABI.  */
 	/* Can not use ENTRY(__getcontext_ret) here.  */
 	.type	__getcontext_ret, @function
 	.hidden	__getcontext_ret
 __getcontext_ret:
 	.proc
 	.callinfo FRAME=0,NO_CALLS
-	/* r26-r23 contain original r3-r6, but because setcontext
-	   does not reload r3-r6 (it's using them as temporaries)
-	   we must save them elsewhere and swap them back in.  */
-	copy	%r23, %r3
-	copy	%r24, %r4
-	copy	%r25, %r5
-	copy	%r26, %r6
-	/* r20 contains original return pointer.  */
-	bv	0(%r20)
+	/* Because setcontext does not reload r3-r6 (it's using them
+	   as temporaries), we must load them ourself.  */
+	ldw	oR3(%r26), %r3
+	ldw	oR4(%r26), %r4
+	ldw	oR5(%r26), %r5
+	ldw	oR6(%r26), %r6
+
+	/* Also reload registers clobbered by $$dyncall.  */
+	ldw	oR21(%r26), %r21
+	ldw	oR22(%r26), %r22
+	ldw	oR31(%r26), %r31
+
+	/* oR0 contains original return pointer.  */
+	ldw	oR0(%r26), %rp
+	bv	0(%rp)
 	copy	%r0, %ret0
 	.procend
 	.size	__getcontext_ret, .-__getcontext_ret
@@ -64,13 +70,13 @@ ENTRY(__getcontext)
 	stw	%r17, oR17(%r26)
 	stw	%r18, oR18(%r26)
 	stw	%r19, oR19(%r26)
-	/* stw	%r20, oR20(%r26) - used for trampoline.  */
+	stw	%r20, oR20(%r26)
 	stw	%r21, oR21(%r26)
 	stw	%r22, oR22(%r26)
-	/* stw	%r23, oR23(%r26) - used for trampoline.  */
-	/* stw	%r24, oR24(%r26) - used for trampoline.  */
-	/* stw	%r25, oR25(%r26) - used for trampoline.  */
-	/* stw	%r26, oR26(%r26) - used for trampoline.  */
+	stw	%r23, oR23(%r26)
+	stw	%r24, oR24(%r26)
+	stw	%r25, oR25(%r26)
+	stw	%r26, oR26(%r26)
 	stw	%r27, oR27(%r26)
 	stw	%r28, oR28(%r26)
 	stw	%r29, oR29(%r26)
@@ -89,7 +95,10 @@ ENTRY(__getcontext)
 	stw	%r0, oIASQ1(%r26)
 	stw	%r0, oIAOQ0(%r26)
 	stw	%r0, oIAOQ1(%r26)
-	stw	%r0, oSAR(%r26) /* used as flag in swapcontext().  */
+
+	/* Save SAR register.  */
+	mfctl	%sar, %r1
+	stw	%r1, oSAR(%r26) /* MSB used as flag in swapcontext().  */
 
 
 	/* Store floating-point regs.  */
@@ -137,15 +146,12 @@ ENTRY(__getcontext)
 	stw	%r19, -32(%sp)
 	.cfi_offset 19, 32
 #endif
+	stw	%ret1, -60(%sp)
+	.cfi_offset 29, 4
 
 	/* Set up the trampoline registers.
-	   r20, r23, r24, r25, r26 and r2 are clobbered
-	   by call to getcontext() anyway. Reuse them.  */
-	stw	%r2, oR20(%r26)
-	stw	%r3, oR23(%r26)
-	stw	%r4, oR24(%r26)
-	stw	%r5, oR25(%r26)
-	stw	%r6, oR26(%r26)
+	   Use oR0 context slot to save return value.  */
+	stw	%r2, oR0(%r26)
 #ifdef PIC
 	addil	LT%__getcontext_ret, %r19
 	ldw     RT%__getcontext_ret(%r1), %r1
@@ -167,6 +173,7 @@ ENTRY(__getcontext)
 #ifdef PIC
 	ldw	-32(%sp), %r19
 #endif
+	ldw	-60(%sp), %ret1
 	bv	%r0(%r2)
 	ldwm	-64(%sp), %r4
 END(__getcontext)
diff --git a/sysdeps/unix/sysv/linux/hppa/setcontext.S b/sysdeps/unix/sysv/linux/hppa/setcontext.S
index bc4872c8e..dfa794ad5 100644
--- a/sysdeps/unix/sysv/linux/hppa/setcontext.S
+++ b/sysdeps/unix/sysv/linux/hppa/setcontext.S
@@ -33,6 +33,8 @@ ENTRY(__setcontext)
 	stw	%r19, -32(%sp)
 	.cfi_offset 19, 32
 #endif
+	stw	%ret1, -60(%sp)
+	.cfi_offset 29, 4
 
 	/* Save ucp.  */
 	copy	%r26, %r3
@@ -73,7 +75,7 @@ ENTRY(__setcontext)
 	ldw	oR18(%r3), %r18
 	ldw	oR19(%r3), %r19
 	ldw	oR20(%r3), %r20
-	ldw	oR21(%r3), %r21
+	ldw	oR21(%r3), %r21 /* maybe clobbered by dyncall */
 	/* ldw	oR22(%r3), %r22 - dyncall arg.  */
 	ldw	oR23(%r3), %r23
 	ldw	oR24(%r3), %r24
@@ -85,6 +87,10 @@ ENTRY(__setcontext)
 	ldw	oR30(%r3), %sp
 	/* ldw	oR31(%r3), %r31 - dyncall scratch register */
 
+	/* Restore SAR register.  */
+	ldw	oSAR(%r3), %r22
+	mtsar	%r22
+
 	/* Restore floating-point registers.  */
 	ldo	 oFPREGS31(%r3), %r22
 	fldds	  0(%r22), %fr31
@@ -154,6 +160,7 @@ ENTRY(__setcontext)
 #ifdef PIC
 	ldw	-32(%r30), %r19
 #endif
+	ldw	-60(%r30), %ret1
 	bv	%r0(%r2)
 	ldwm	-64(%r30), %r3
 L(pseudo_end):
diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.S b/sysdeps/unix/sysv/linux/hppa/swapcontext.S
new file mode 100644
index 000000000..fbc22586d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/hppa/swapcontext.S
@@ -0,0 +1,72 @@
+/* Swap to new context.
+   Copyright (C) 2008-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "ucontext_i.h"
+
+	.text
+ENTRY(__swapcontext)
+
+	/* Copy rp to ret0 (r28).  */
+	copy %rp,%ret0
+
+	/* Create a frame.  */
+	ldo 64(%sp),%sp
+	.cfi_def_cfa_offset -64
+
+	/* Save the current machine context to oucp.  */
+	bl __getcontext,%rp
+
+	/* Copy oucp to register ret1 (r29).  __getcontext saves and
+	   restores it on a normal return.  It is restored from oR29
+	   on reactivation.  */
+	copy %r26,%ret1
+
+	/* Pop frame.  */
+	ldo -64(%sp),%sp
+	.cfi_def_cfa_offset 0
+
+	/* Load return pointer from oR28.  */
+	ldw oR28(%ret1),%rp
+
+	/* Return if error.  */
+	or,= %r0,%ret0,%r0
+	bv,n %r0(%rp)
+
+	/* Load sc_sar flag.  */
+	ldb oSAR(%ret1),%r20
+
+	/* Return if oucp context has been reactivated.  */
+	or,= %r0,%r20,%r0
+	bv,n %r0(%rp)
+
+	/* Mark sc_sar flag.  */
+	ldi 1,%r20
+	stb %r20,oSAR(%ret1)
+
+	/* Activate the machine context in ucp.  */
+	bl __setcontext,%rp
+	ldw oR25(%ret1),%r26
+
+	/* Load return pointer.  */
+	ldw oR28(%ret1),%rp
+	bv,n %r0(%rp)
+
+END(__swapcontext)
+
+weak_alias (__swapcontext, swapcontext)
diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.c b/sysdeps/unix/sysv/linux/hppa/swapcontext.c
deleted file mode 100644
index 5cbe00f1e..000000000
--- a/sysdeps/unix/sysv/linux/hppa/swapcontext.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Swap to new context.
-   Copyright (C) 2008-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <ucontext.h>
-
-extern int __getcontext (ucontext_t *ucp);
-extern int __setcontext (const ucontext_t *ucp);
-
-int
-__swapcontext (ucontext_t *oucp, const ucontext_t *ucp)
-{
-  /* Save the current machine context to oucp.  */
-  __getcontext (oucp);
-
-  /* mark sc_sar flag to skip the setcontext call on reactivation.  */
-  if (oucp->uc_mcontext.sc_sar == 0) {
-	oucp->uc_mcontext.sc_sar++;
-
-	/* Restore the machine context in ucp.  */
-	__setcontext (ucp);
-  }
-
-  return 0;
-}
-
-weak_alias (__swapcontext, swapcontext)
diff --git a/sysdeps/unix/sysv/linux/i386/Makefile b/sysdeps/unix/sysv/linux/i386/Makefile
index abd0009d5..e379a2e76 100644
--- a/sysdeps/unix/sysv/linux/i386/Makefile
+++ b/sysdeps/unix/sysv/linux/i386/Makefile
@@ -14,7 +14,7 @@ install-bin += lddlibc4
 endif
 
 ifeq ($(subdir),io)
-sysdep_routines += libc-do-syscall
+sysdep_routines += libc-do-syscall libc-do-syscall-int80
 endif
 
 ifeq ($(subdir),stdlib)
diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
index c86ccbda4..1998f0d76 100644
--- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
@@ -323,6 +323,7 @@
 #define __NR_sendmsg 370
 #define __NR_sendto 369
 #define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 311
 #define __NR_set_thread_area 243
 #define __NR_set_tid_address 258
diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S
new file mode 100644
index 000000000..2c472f255
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S
@@ -0,0 +1,25 @@
+/* Out-of-line syscall stub for six-argument syscalls from C.  For static PIE.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef SHARED
+# define I386_USE_SYSENTER 0
+# include <sysdep.h>
+
+# define __libc_do_syscall __libc_do_syscall_int80
+# include "libc-do-syscall.S"
+#endif
diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
index 04154f43e..3eea5f3a5 100644
--- a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
+++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S
@@ -18,8 +18,6 @@
 
 #include <sysdep.h>
 
-#ifndef OPTIMIZE_FOR_GCC_5
-
 /* %eax, %ecx, %edx and %esi contain the values expected by the kernel.
    %edi points to a structure with the values of %ebx, %edi and %ebp.  */
 
@@ -50,4 +48,3 @@ ENTRY (__libc_do_syscall)
 	cfi_restore (ebx)
 	ret
 END (__libc_do_syscall)
-#endif
diff --git a/sysdeps/unix/sysv/linux/i386/startup.h b/sysdeps/unix/sysv/linux/i386/startup.h
index aab8e26ca..213805d7d 100644
--- a/sysdeps/unix/sysv/linux/i386/startup.h
+++ b/sysdeps/unix/sysv/linux/i386/startup.h
@@ -1,5 +1,5 @@
 /* Linux/i386 definitions of functions used by static libc main startup.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,46 +16,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if BUILD_PIE_DEFAULT
-/* Can't use "call *%gs:SYSINFO_OFFSET" during statup in static PIE.  */
-# define I386_USE_SYSENTER 0
+/* Can't use "call *%gs:SYSINFO_OFFSET" during startup.  */
+#define I386_USE_SYSENTER 0
 
-# include <sysdep.h>
-# include <abort-instr.h>
-
-__attribute__ ((__noreturn__))
-static inline void
-_startup_fatal (const char *message __attribute__ ((unused)))
-{
-  /* This is only called very early during startup in static PIE.
-     FIXME: How can it be improved?  */
-  ABORT_INSTRUCTION;
-  __builtin_unreachable ();
-}
-
-static inline uid_t
-startup_getuid (void)
-{
-  return (uid_t) INTERNAL_SYSCALL_CALL (getuid32);
-}
-
-static inline uid_t
-startup_geteuid (void)
-{
-  return (uid_t) INTERNAL_SYSCALL_CALL (geteuid32);
-}
-
-static inline gid_t
-startup_getgid (void)
-{
-  return (gid_t) INTERNAL_SYSCALL_CALL (getgid32);
-}
-
-static inline gid_t
-startup_getegid (void)
-{
-  return (gid_t) INTERNAL_SYSCALL_CALL (getegid32);
-}
-#else
-# include_next <startup.h>
-#endif
+#include_next <startup.h>
diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h
index 4558ab66c..7085f7e19 100644
--- a/sysdeps/unix/sysv/linux/i386/sysdep.h
+++ b/sysdeps/unix/sysv/linux/i386/sysdep.h
@@ -42,6 +42,15 @@
 # endif
 #endif
 
+#if !I386_USE_SYSENTER && IS_IN (libc) && !defined SHARED
+/* Inside static libc, we have two versions.  For compilation units
+   with !I386_USE_SYSENTER, the vDSO entry mechanism cannot be
+   used. */
+# define I386_DO_SYSCALL_STRING "__libc_do_syscall_int80"
+#else
+# define I386_DO_SYSCALL_STRING "__libc_do_syscall"
+#endif
+
 #ifdef __ASSEMBLER__
 
 /* Linux uses a negative return value to indicate syscall errors,
@@ -301,7 +310,7 @@ struct libc_do_syscall_args
     };									\
     asm volatile (							\
     "movl %1, %%eax\n\t"						\
-    "call __libc_do_syscall"						\
+    "call " I386_DO_SYSCALL_STRING					\
     : "=a" (resultvar)							\
     : "i" (__NR_##name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \
     : "memory", "cc")
@@ -315,7 +324,7 @@ struct libc_do_syscall_args
     };									\
     asm volatile (							\
     "movl %1, %%eax\n\t"						\
-    "call __libc_do_syscall"						\
+    "call " I386_DO_SYSCALL_STRING					\
     : "=a" (resultvar)							\
     : "a" (name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv)	\
     : "memory", "cc")
diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile
index da85ba43e..c5cc41b36 100644
--- a/sysdeps/unix/sysv/linux/ia64/Makefile
+++ b/sysdeps/unix/sysv/linux/ia64/Makefile
@@ -1,3 +1,9 @@
+ifeq ($(subdir),elf)
+# ia64 does not support PT_GNU_RELRO.
+test-xfail-tst-relro-ldso = yes
+test-xfail-tst-relro-libc = yes
+endif
+
 ifeq ($(subdir),misc)
 sysdep_headers += sys/rse.h
 endif
diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
index d898bce40..b2eab1b93 100644
--- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
@@ -272,6 +272,7 @@
 #define __NR_sendmsg 1205
 #define __NR_sendto 1199
 #define __NR_set_mempolicy 1261
+#define __NR_set_mempolicy_home_node 1474
 #define __NR_set_robust_list 1298
 #define __NR_set_tid_address 1233
 #define __NR_setdomainname 1129
diff --git a/sysdeps/unix/sysv/linux/ia64/brk.c b/sysdeps/unix/sysv/linux/ia64/brk.c
index 65142aeae..d2135b74f 100644
--- a/sysdeps/unix/sysv/linux/ia64/brk.c
+++ b/sysdeps/unix/sysv/linux/ia64/brk.c
@@ -16,7 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <dl-sysdep.h>
-/* brk is used by statup before TCB is properly set.  */
-#undef USE_DL_SYSINFO
+/* brk is used by startup before TCB is properly set up.  */
+#define IA64_USE_NEW_STUB 0
 #include <sysdeps/unix/sysv/linux/brk.c>
diff --git a/sysdeps/unix/sysv/linux/ia64/startup.h b/sysdeps/unix/sysv/linux/ia64/startup.h
new file mode 100644
index 000000000..77f29f15a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/ia64/startup.h
@@ -0,0 +1,22 @@
+/* Linux/ia64 definitions of functions used by static libc main startup.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* This code is used before the TCB is set up.  */
+#define IA64_USE_NEW_STUB 0
+
+#include_next <startup.h>
diff --git a/sysdeps/unix/sysv/linux/ia64/sysdep.h b/sysdeps/unix/sysv/linux/ia64/sysdep.h
index 193ecee02..14adbdf4f 100644
--- a/sysdeps/unix/sysv/linux/ia64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/ia64/sysdep.h
@@ -44,12 +44,15 @@
 #undef SYS_ify
 #define SYS_ify(syscall_name)	__NR_##syscall_name
 
-#if defined USE_DL_SYSINFO \
-	&& (IS_IN (libc) \
-	    || IS_IN (libpthread) || IS_IN (librt))
-# define IA64_USE_NEW_STUB
-#else
-# undef IA64_USE_NEW_STUB
+#ifndef IA64_USE_NEW_STUB
+# if defined USE_DL_SYSINFO && IS_IN (libc)
+#  define IA64_USE_NEW_STUB 1
+# else
+#  define IA64_USE_NEW_STUB 0
+# endif
+#endif
+#if IA64_USE_NEW_STUB && !USE_DL_SYSINFO
+# error IA64_USE_NEW_STUB needs USE_DL_SYSINFO
 #endif
 
 #ifdef __ASSEMBLER__
@@ -101,7 +104,7 @@
 	mov r15=num;				\
 	break __IA64_BREAK_SYSCALL
 
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
 # ifdef SHARED
 #  define DO_CALL(num)				\
 	.prologue;				\
@@ -185,7 +188,7 @@
    (non-negative) errno on error or the return value on success.
  */
 
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
 
 # define INTERNAL_SYSCALL_NCS(name, nr, args...)			      \
 ({									      \
@@ -277,7 +280,7 @@
 #define ASM_OUTARGS_5	ASM_OUTARGS_4, "=r" (_out4)
 #define ASM_OUTARGS_6	ASM_OUTARGS_5, "=r" (_out5)
 
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
 #define ASM_ARGS_0
 #define ASM_ARGS_1	ASM_ARGS_0, "4" (_out0)
 #define ASM_ARGS_2	ASM_ARGS_1, "5" (_out1)
@@ -313,7 +316,7 @@
   /* Branch registers.  */						\
   "b6"
 
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
 # define ASM_CLOBBERS_6	ASM_CLOBBERS_6_COMMON
 #else
 # define ASM_CLOBBERS_6	ASM_CLOBBERS_6_COMMON , "b7"
diff --git a/sysdeps/unix/sysv/linux/ldsodefs.h b/sysdeps/unix/sysv/linux/ldsodefs.h
index 011756ddc..af108991f 100644
--- a/sysdeps/unix/sysv/linux/ldsodefs.h
+++ b/sysdeps/unix/sysv/linux/ldsodefs.h
@@ -24,16 +24,4 @@
 /* Get the real definitions.  */
 #include_next <ldsodefs.h>
 
-/* We can assume that the kernel always provides the AT_UID, AT_EUID,
-   AT_GID, and AT_EGID values in the auxiliary vector from 2.4.0 or so on.  */
-#define HAVE_AUX_XID
-
-/* We can assume that the kernel always provides the AT_SECURE value
-   in the auxiliary vector from 2.5.74 or so on.  */
-#define HAVE_AUX_SECURE
-
-/* Starting with one of the 2.4.0 pre-releases the Linux kernel passes
-   up the page size information.  */
-#define HAVE_AUX_PAGESIZE
-
 #endif /* ldsodefs.h */
diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
index fe721b809..5fc372377 100644
--- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
@@ -310,6 +310,7 @@
 #define __NR_sendmsg 367
 #define __NR_sendto 366
 #define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 304
 #define __NR_set_thread_area 334
 #define __NR_set_tid_address 253
diff --git a/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h b/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h
new file mode 100644
index 000000000..1844bbaf6
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h
@@ -0,0 +1,25 @@
+/* Private libc-internal arch-specific definitions.  m68k version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If
+   not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _LIBC_LOCK_ARCH_H
+#define _LIBC_LOCK_ARCH_H
+
+/* Linux enforces 4-bytes alignment on futex inputs.  */
+#define __LIBC_LOCK_ALIGNMENT __attribute__ ((__aligned__ (4)))
+
+#endif
diff --git a/sysdeps/unix/sysv/linux/m68k/sysdep.h b/sysdeps/unix/sysv/linux/m68k/sysdep.h
index 628e1be83..d87892a37 100644
--- a/sysdeps/unix/sysv/linux/m68k/sysdep.h
+++ b/sysdeps/unix/sysv/linux/m68k/sysdep.h
@@ -299,8 +299,6 @@ SYSCALL_ERROR_LABEL:							      \
 #define PTR_MANGLE(var) (void) (var)
 #define PTR_DEMANGLE(var) (void) (var)
 
-#if defined NEED_DL_SYSINFO || defined NEED_DL_SYSINFO_DSO
 /* M68K needs system-supplied DSO to access TLS helpers
    even when statically linked.  */
-# define NEED_STATIC_SYSINFO_DSO 1
-#endif
+#define NEED_STATIC_SYSINFO_DSO 1
diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
index 6e10c3661..b6e9b007e 100644
--- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
@@ -326,6 +326,7 @@
 #define __NR_sendmsg 360
 #define __NR_sendto 353
 #define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 311
 #define __NR_set_thread_area 243
 #define __NR_set_tid_address 258
diff --git a/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h b/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h
index 7747b3e47..71594e456 100644
--- a/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h
+++ b/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h
@@ -131,27 +131,30 @@ struct stat64
 
 struct stat
   {
+# ifdef __USE_TIME_BITS64
+#  include <bits/struct_stat_time64_helper.h>
+# else
     __dev_t st_dev;
     int	st_pad1[3];		/* Reserved for st_dev expansion  */
-# ifndef __USE_FILE_OFFSET64
+#  ifndef __USE_FILE_OFFSET64
     __ino_t st_ino;
-# else
+#  else
     __ino64_t st_ino;
-# endif
+#  endif
     __mode_t st_mode;
     __nlink_t st_nlink;
     __uid_t st_uid;
     __gid_t st_gid;
     __dev_t st_rdev;
-# if !defined __USE_FILE_OFFSET64
+#  if !defined __USE_FILE_OFFSET64
     unsigned int st_pad2[2];	/* Reserved for st_rdev expansion  */
     __off_t st_size;
     int st_pad3;
-# else
+#  else
     unsigned int st_pad2[3];	/* Reserved for st_rdev expansion  */
     __off64_t st_size;
-# endif
-# ifdef __USE_XOPEN2K8
+#  endif
+#  ifdef __USE_XOPEN2K8
     /* Nanosecond resolution timestamps are stored in a format
        equivalent to 'struct timespec'.  This is the type used
        whenever possible but the Unix namespace rules do not allow the
@@ -161,30 +164,34 @@ struct stat
     struct timespec st_atim;            /* Time of last access.  */
     struct timespec st_mtim;            /* Time of last modification.  */
     struct timespec st_ctim;            /* Time of last status change.  */
-#  define st_atime st_atim.tv_sec        /* Backward compatibility.  */
-#  define st_mtime st_mtim.tv_sec
-#  define st_ctime st_ctim.tv_sec
-# else
+#   define st_atime st_atim.tv_sec        /* Backward compatibility.  */
+#   define st_mtime st_mtim.tv_sec
+#   define st_ctime st_ctim.tv_sec
+#  else
     __time_t st_atime;			/* Time of last access.  */
     unsigned long int st_atimensec;	/* Nscecs of last access.  */
     __time_t st_mtime;			/* Time of last modification.  */
     unsigned long int st_mtimensec;	/* Nsecs of last modification.  */
     __time_t st_ctime;			/* Time of last status change.  */
     unsigned long int st_ctimensec;	/* Nsecs of last status change.  */
-# endif
+#  endif
     __blksize_t st_blksize;
     unsigned int st_pad4;
-# ifndef __USE_FILE_OFFSET64
+#  ifndef __USE_FILE_OFFSET64
     __blkcnt_t st_blocks;
-# else
+#  else
     __blkcnt64_t st_blocks;
-# endif
+#  endif
     int st_pad5[14];
+# endif
   };
 
 #ifdef __USE_LARGEFILE64
 struct stat64
   {
+# ifdef __USE_TIME_BITS64
+#  include <bits/struct_stat_time64_helper.h>
+# else
     __dev_t st_dev;
     unsigned int st_pad1[3];	/* Reserved for st_dev expansion  */
     __ino64_t st_ino;
@@ -217,6 +224,7 @@ struct stat64
     unsigned int st_pad3;
     __blkcnt64_t st_blocks;
     int st_pad4[14];
+# endif /* __USE_TIME_BITS64  */
 };
 #endif
 
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
index 26a6d594a..b3a3871f8 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
@@ -308,6 +308,7 @@
 #define __NR_sendmsg 4179
 #define __NR_sendto 4180
 #define __NR_set_mempolicy 4270
+#define __NR_set_mempolicy_home_node 4450
 #define __NR_set_robust_list 4309
 #define __NR_set_thread_area 4283
 #define __NR_set_tid_address 4252
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
index 83e0d49c5..b46218272 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
@@ -288,6 +288,7 @@
 #define __NR_sendmsg 6045
 #define __NR_sendto 6043
 #define __NR_set_mempolicy 6233
+#define __NR_set_mempolicy_home_node 6450
 #define __NR_set_robust_list 6272
 #define __NR_set_thread_area 6246
 #define __NR_set_tid_address 6213
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
index d6747c542..a9d6b9457 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
@@ -270,6 +270,7 @@
 #define __NR_sendmsg 5045
 #define __NR_sendto 5043
 #define __NR_set_mempolicy 5229
+#define __NR_set_mempolicy_home_node 5450
 #define __NR_set_robust_list 5268
 #define __NR_set_thread_area 5242
 #define __NR_set_tid_address 5212
diff --git a/sysdeps/unix/sysv/linux/mmap_call.h b/sysdeps/unix/sysv/linux/mmap_call.h
new file mode 100644
index 000000000..3547c99e1
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/mmap_call.h
@@ -0,0 +1,22 @@
+/* Generic definition of MMAP_CALL and MMAP_CALL_INTERNAL.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+  INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
+#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+  INTERNAL_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
diff --git a/sysdeps/unix/sysv/linux/mmap_internal.h b/sysdeps/unix/sysv/linux/mmap_internal.h
index 841b73139..aebf97d06 100644
--- a/sysdeps/unix/sysv/linux/mmap_internal.h
+++ b/sysdeps/unix/sysv/linux/mmap_internal.h
@@ -42,10 +42,6 @@ static uint64_t page_unit;
 /* Do not accept offset not multiple of page size.  */
 #define MMAP_OFF_LOW_MASK  (MMAP2_PAGE_UNIT - 1)
 
-/* An architecture may override this.  */
-#ifndef MMAP_CALL
-# define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
-  INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
-#endif
+#include <mmap_call.h>
 
 #endif /* MMAP_INTERNAL_LINUX_H  */
diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c
index 834cd7a48..5bf1e0a83 100644
--- a/sysdeps/unix/sysv/linux/mq_timedreceive.c
+++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c
@@ -41,7 +41,7 @@ ___mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len
     {
       int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len,
 			      msg_prio, abs_timeout);
-      if (r == 0 || errno != ENOSYS)
+      if (r >= 0 || errno != ENOSYS)
 	return r;
       __set_errno (EOVERFLOW);
       return -1;
diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
index 4ee209bc4..809a219ef 100644
--- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
@@ -250,6 +250,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/or1k/arch-syscall.h b/sysdeps/unix/sysv/linux/or1k/arch-syscall.h
index ddeaeceeb..1364f4cbc 100644
--- a/sysdeps/unix/sysv/linux/or1k/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/or1k/arch-syscall.h
@@ -251,6 +251,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c
index 107cf9878..dc6864852 100644
--- a/sysdeps/unix/sysv/linux/pathconf.c
+++ b/sysdeps/unix/sysv/linux/pathconf.c
@@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd)
 	      && strcmp (mntbuf.mnt_type, "ext4") != 0)
 	    continue;
 
-	  struct stat64 fsst;
-	  if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
+	  struct __stat64_t64 fsst;
+	  if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
 	      && st.st_dev == fsst.st_dev)
 	    {
 	      if (strcmp (mntbuf.mnt_type, "ext4") == 0)
diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h
index 594371940..ce2281cf1 100644
--- a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h
+++ b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h
@@ -16,15 +16,5 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <ldsodefs.h>
-
-#if IS_IN (libc) && !defined SHARED
-int GLRO(dl_cache_line_size);
-#endif
-
-/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it
-   to dl_cache_line_size.  */
-#define DL_PLATFORM_AUXV						      \
-      case AT_DCACHEBSIZE:						      \
-	GLRO(dl_cache_line_size) = av->a_un.a_val;			      \
-	break;
+#define DL_PLATFORM_AUXV \
+  GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE];
diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-support.c b/sysdeps/unix/sysv/linux/powerpc/dl-support.c
new file mode 100644
index 000000000..abe68a704
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/dl-support.c
@@ -0,0 +1,4 @@
+#include <elf/dl-support.c>
+
+/* Populated from the auxiliary vector.  */
+int _dl_cache_line_size;
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
index 497299fbc..627831eba 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
@@ -319,6 +319,7 @@
 #define __NR_sendmsg 341
 #define __NR_sendto 335
 #define __NR_set_mempolicy 261
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 300
 #define __NR_set_tid_address 232
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
index e840279f1..bae597199 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
@@ -298,6 +298,7 @@
 #define __NR_sendmsg 341
 #define __NR_sendto 335
 #define __NR_set_mempolicy 261
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 300
 #define __NR_set_tid_address 232
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
index 73ef74c00..202520ee2 100644
--- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
@@ -122,6 +122,7 @@
 #define __NR_mbind 235
 #define __NR_membarrier 283
 #define __NR_memfd_create 279
+#define __NR_memfd_secret 447
 #define __NR_migrate_pages 238
 #define __NR_mincore 232
 #define __NR_mkdirat 34
@@ -228,6 +229,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
index 919a79ee9..4e65f337d 100644
--- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
@@ -127,6 +127,7 @@
 #define __NR_mbind 235
 #define __NR_membarrier 283
 #define __NR_memfd_create 279
+#define __NR_memfd_secret 447
 #define __NR_migrate_pages 238
 #define __NR_mincore 232
 #define __NR_mkdirat 34
@@ -235,6 +236,7 @@
 #define __NR_sendmsg 211
 #define __NR_sendto 206
 #define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 99
 #define __NR_set_tid_address 96
 #define __NR_setdomainname 162
diff --git a/sysdeps/unix/sysv/linux/s390/mmap_internal.h b/sysdeps/unix/sysv/linux/s390/mmap_call.h
similarity index 78%
rename from sysdeps/unix/sysv/linux/s390/mmap_internal.h
rename to sysdeps/unix/sysv/linux/s390/mmap_call.h
index cc76ac973..f169b8bab 100644
--- a/sysdeps/unix/sysv/linux/s390/mmap_internal.h
+++ b/sysdeps/unix/sysv/linux/s390/mmap_call.h
@@ -16,9 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#ifndef MMAP_S390_INTERNAL_H
-# define MMAP_S390_INTERNAL_H
-
 #define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset)	\
   ({									\
     long int __args[6] = { (long int) (__addr), (long int) (__len),	\
@@ -26,7 +23,10 @@
 			   (long int) (__fd), (long int) (__offset) };	\
     INLINE_SYSCALL_CALL (__nr, __args);					\
   })
-
-#include_next <mmap_internal.h>
-
-#endif
+#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset)	\
+  ({									\
+    long int __args[6] = { (long int) (__addr), (long int) (__len),	\
+			   (long int) (__prot), (long int) (__flags),	\
+			   (long int) (__fd), (long int) (__offset) };	\
+    INTERNAL_SYSCALL_CALL (__nr, __args);				\
+  })
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
index 005c0ada7..57025107e 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
@@ -311,6 +311,7 @@
 #define __NR_sendmsg 370
 #define __NR_sendto 369
 #define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 304
 #define __NR_set_tid_address 252
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
index 9131fddcc..72e19c6d5 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
@@ -278,6 +278,7 @@
 #define __NR_sendmsg 370
 #define __NR_sendto 369
 #define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 304
 #define __NR_set_tid_address 252
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
index d8fb04156..d52b522d9 100644
--- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
@@ -303,6 +303,7 @@
 #define __NR_sendmsg 355
 #define __NR_sendto 349
 #define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 311
 #define __NR_set_tid_address 258
 #define __NR_setdomainname 121
diff --git a/sysdeps/unix/sysv/linux/sparc/brk.c b/sysdeps/unix/sysv/linux/sparc/brk.c
deleted file mode 100644
index c5c1ee028..000000000
--- a/sysdeps/unix/sysv/linux/sparc/brk.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Change data segment.  Linux SPARC version.
-   Copyright (C) 2021-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <errno.h>
-#include <unistd.h>
-#include <sysdep.h>
-
-/* This must be initialized data because commons can't have aliases.  */
-void *__curbrk = 0;
-
-#if HAVE_INTERNAL_BRK_ADDR_SYMBOL
-/* Old braindamage in GCC's crtstuff.c requires this symbol in an attempt
-   to work around different old braindamage in the old Linux ELF dynamic
-   linker.  */
-weak_alias (__curbrk, ___brk_addr)
-#endif
-
-#ifdef __arch64__
-# define SYSCALL_NUM "0x6d"
-#else
-# define SYSCALL_NUM "0x10"
-#endif
-
-int
-__brk (void *addr)
-{
-  register long int g1 asm ("g1") = __NR_brk;
-  register long int o0 asm ("o0") = (long int) addr;
-  asm volatile ("ta " SYSCALL_NUM
-		: "=r"(o0)
-		: "r"(g1), "0"(o0)
-		: "cc");
-  __curbrk = (void *) o0;
-
-  if (__curbrk < addr)
-    {
-      __set_errno (ENOMEM);
-      return -1;
-    }
-
-  return 0;
-}
-weak_alias (__brk, brk)
diff --git a/sysdeps/unix/sysv/linux/alpha/brk.c b/sysdeps/unix/sysv/linux/sparc/brk_call.h
similarity index 61%
rename from sysdeps/unix/sysv/linux/alpha/brk.c
rename to sysdeps/unix/sysv/linux/sparc/brk_call.h
index 32082a4fa..59ce52166 100644
--- a/sysdeps/unix/sysv/linux/alpha/brk.c
+++ b/sysdeps/unix/sysv/linux/sparc/brk_call.h
@@ -1,5 +1,5 @@
-/* Change data segment size.  Linux/Alpha.
-   Copyright (C) 2020-2022 Free Software Foundation, Inc.
+/* Invoke the brk system call.  Sparc version.
+   Copyright (C) 2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,23 +16,20 @@
    License along with the GNU C Library.  If not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <errno.h>
-#include <unistd.h>
-#include <sysdep.h>
+#ifdef __arch64__
+# define SYSCALL_NUM "0x6d"
+#else
+# define SYSCALL_NUM "0x10"
+#endif
 
-void *__curbrk = 0;
-
-int
-__brk (void *addr)
+static inline void *
+__brk_call (void *addr)
 {
-  /* Alpha brk returns -ENOMEM in case of failure.  */
-  __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr);
-  if ((unsigned long) __curbrk == -ENOMEM)
-    {
-      __set_errno (ENOMEM);
-      return -1;
-    }
-
-  return 0;
+  register long int g1 asm ("g1") = __NR_brk;
+  register long int o0 asm ("o0") = (long int) addr;
+  asm volatile ("ta " SYSCALL_NUM
+		: "=r"(o0)
+		: "r"(g1), "0"(o0)
+		: "cc");
+  return (void *) o0;
 }
-weak_alias (__brk, brk)
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
index 2bc014fe6..d3f4d8aa3 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
@@ -310,6 +310,7 @@
 #define __NR_sendmsg 114
 #define __NR_sendto 133
 #define __NR_set_mempolicy 305
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 300
 #define __NR_set_tid_address 166
 #define __NR_setdomainname 163
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
index 76dbbe595..2cc03d7a2 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
@@ -286,6 +286,7 @@
 #define __NR_sendmsg 114
 #define __NR_sendto 133
 #define __NR_set_mempolicy 305
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 300
 #define __NR_set_tid_address 166
 #define __NR_setdomainname 163
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index d703485e3..d6f5ca89c 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -409,7 +409,7 @@ __spawnix (pid_t * pid, const char *file,
 	__waitpid (new_pid, NULL, 0);
     }
   else
-    ec = -new_pid;
+    ec = errno;
 
   __munmap (stack, stack_size);
 
diff --git a/sysdeps/unix/sysv/linux/startup.h b/sysdeps/unix/sysv/linux/startup.h
new file mode 100644
index 000000000..39859b404
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/startup.h
@@ -0,0 +1,39 @@
+/* Linux definitions of functions used by static libc main startup.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef SHARED
+# include_next <startup.h>
+#else
+# include <sysdep.h>
+
+/* Avoid a run-time invocation of strlen.  */
+#define _startup_fatal(message)                                         \
+  do                                                                    \
+    {                                                                   \
+      size_t __message_length = __builtin_strlen (message);             \
+      if (! __builtin_constant_p (__message_length))                    \
+        {                                                               \
+          extern void _startup_fatal_not_constant (void);               \
+          _startup_fatal_not_constant ();                               \
+        }                                                               \
+      INTERNAL_SYSCALL_CALL (write, STDERR_FILENO, (message),           \
+                             __message_length);                         \
+      INTERNAL_SYSCALL_CALL (exit_group, 127);                          \
+    }                                                                   \
+  while (0)
+#endif  /* !SHARED */
diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
index 642180611..028ad3107 100644
--- a/sysdeps/unix/sysv/linux/syscall-names.list
+++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 
-# The list of system calls is current as of Linux 5.16.
-kernel 5.16
+# The list of system calls is current as of Linux 5.19.
+kernel 5.19
 
 FAST_atomic_update
 FAST_cmpxchg
@@ -524,6 +524,7 @@ sendmmsg
 sendmsg
 sendto
 set_mempolicy
+set_mempolicy_home_node
 set_robust_list
 set_thread_area
 set_tid_address
diff --git a/sysdeps/unix/sysv/linux/tst-getauxval.c b/sysdeps/unix/sysv/linux/tst-getauxval.c
new file mode 100644
index 000000000..c4b619574
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-getauxval.c
@@ -0,0 +1,74 @@
+/* Basic test for getauxval.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <sys/auxv.h>
+
+static int missing;
+static int mismatch;
+
+static void
+check_nonzero (unsigned long t, const char *s)
+{
+  unsigned long v = getauxval (t);
+  printf ("%s: %lu (0x%lx)\n", s, v, v);
+  if (v == 0)
+    missing++;
+}
+
+static void
+check_eq (unsigned long t, const char *s, unsigned long want)
+{
+  unsigned long v = getauxval (t);
+  printf ("%s: %lu want: %lu\n", s, v, want);
+  if (v != want)
+    mismatch++;
+}
+
+#define NZ(x) check_nonzero (x, #x)
+#define EQ(x, want) check_eq (x, #x, want)
+
+static int
+do_test (void)
+{
+  /* These auxv entries should be non-zero on Linux.  */
+  NZ (AT_PHDR);
+  NZ (AT_PHENT);
+  NZ (AT_PHNUM);
+  NZ (AT_PAGESZ);
+  NZ (AT_ENTRY);
+  NZ (AT_CLKTCK);
+  NZ (AT_RANDOM);
+  NZ (AT_EXECFN);
+  if (missing)
+    FAIL_EXIT1 ("Found %d missing auxv entries.\n", missing);
+
+  /* Check against syscalls.  */
+  EQ (AT_UID, getuid ());
+  EQ (AT_EUID, geteuid ());
+  EQ (AT_GID, getgid ());
+  EQ (AT_EGID, getegid ());
+  if (mismatch)
+    FAIL_EXIT1 ("Found %d mismatching auxv entries.\n", mismatch);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py
index fc0a8ff4a..fdc5b2bc0 100644
--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py
+++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py
@@ -33,7 +33,7 @@ def main():
                         help='C compiler (including options) to use')
     args = parser.parse_args()
     linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc)
-    linux_version_glibc = (5, 15)
+    linux_version_glibc = (5, 17)
     sys.exit(glibcextract.compare_macro_consts(
         '#define _GNU_SOURCE 1\n'
         '#include <sys/mman.h>\n',
diff --git a/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c b/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c
index 0ff1a214e..2b1feb476 100644
--- a/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c
+++ b/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c
@@ -22,6 +22,7 @@
 #include <support/xsocket.h>
 #include <support/xunistd.h>
 #include <stdbool.h>
+#include <socket-constants-time64.h>
 
 /* AF_INET socket and address used to receive data.  */
 static int srv;
@@ -88,7 +89,7 @@ do_test_large_buffer (bool mc)
   /* Enable 32 bit timeval precision and check if no 64 bit timeval stamp
      is created.  */
   {
-    int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1},
+    int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1},
 			sizeof (int));
     TEST_VERIFY_EXIT (r != -1);
 
@@ -103,10 +104,10 @@ do_test_large_buffer (bool mc)
       if (cmsg->cmsg_level != SOL_SOCKET)
 	continue;
 
-      if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW)
+      if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW)
 	found_timestamp = true;
       else
-	TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW);
+	TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW);
     }
 
     TEST_COMPARE (found_timestamp, sizeof (time_t) > 4);
@@ -114,7 +115,7 @@ do_test_large_buffer (bool mc)
 
   /* Same as before, but for timespec.  */
   {
-    int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1},
+    int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1},
 			sizeof (int));
     TEST_VERIFY_EXIT (r != -1);
 
@@ -129,10 +130,10 @@ do_test_large_buffer (bool mc)
       if (cmsg->cmsg_level != SOL_SOCKET)
 	continue;
 
-      if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
+      if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW)
 	found_timestamp = true;
       else
-	TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW);
+	TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW);
     }
 
     TEST_COMPARE (found_timestamp, sizeof (time_t) > 4);
@@ -151,7 +152,7 @@ do_test_small_buffer (bool mc)
   /* Enable 32 bit timeval precision and check if no 64 bit timeval stamp
      is created.  */
   {
-    int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1},
+    int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1},
 			sizeof (int));
     TEST_VERIFY_EXIT (r != -1);
 
@@ -172,10 +173,10 @@ do_test_small_buffer (bool mc)
       if (cmsg->cmsg_level != SOL_SOCKET)
 	continue;
 
-      if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW)
+      if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW)
 	found_timestamp = true;
       else
-	TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW);
+	TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW);
     }
 
     if (sizeof (time_t) > 4)
@@ -192,7 +193,7 @@ do_test_small_buffer (bool mc)
 
   /* Same as before, but for timespec.  */
   {
-    int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1},
+    int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1},
 			sizeof (int));
     TEST_VERIFY_EXIT (r != -1);
 
@@ -213,10 +214,10 @@ do_test_small_buffer (bool mc)
       if (cmsg->cmsg_level != SOL_SOCKET)
 	continue;
 
-      if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
+      if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW)
 	found_timestamp = true;
       else
-	TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW);
+	TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW);
     }
 
     if (sizeof (time_t) > 4)
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
index 28558279b..b4ab892ec 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
@@ -278,6 +278,7 @@
 #define __NR_sendmsg 46
 #define __NR_sendto 44
 #define __NR_set_mempolicy 238
+#define __NR_set_mempolicy_home_node 450
 #define __NR_set_robust_list 273
 #define __NR_set_thread_area 205
 #define __NR_set_tid_address 218
diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
index c1ab8ec45..772559c87 100644
--- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
@@ -270,6 +270,7 @@
 #define __NR_sendmsg 1073742342
 #define __NR_sendto 1073741868
 #define __NR_set_mempolicy 1073742062
+#define __NR_set_mempolicy_home_node 1073742274
 #define __NR_set_robust_list 1073742354
 #define __NR_set_thread_area 1073742029
 #define __NR_set_tid_address 1073742042
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 6cf708335..c6bee981f 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -99,7 +99,9 @@ tests += \
   tst-strcpy-rtm \
   tst-strlen-rtm \
   tst-strncmp-rtm \
-  tst-strrchr-rtm
+  tst-strrchr-rtm \
+  tst-wcsncmp-rtm \
+# tests
 
 CFLAGS-tst-memchr-rtm.c += -mrtm
 CFLAGS-tst-memcmp-rtm.c += -mrtm
@@ -109,8 +111,9 @@ CFLAGS-tst-memset-rtm.c += -mrtm
 CFLAGS-tst-strchr-rtm.c += -mrtm
 CFLAGS-tst-strcpy-rtm.c += -mrtm
 CFLAGS-tst-strlen-rtm.c += -mrtm
-CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
 CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
 endif
 
 ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index f64a2fb0b..e9f338210 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;
 
-  unsigned long int rep_movsb_stop_threshold;
-  /* ERMS feature is implemented from AMD Zen3 architecture and it is
-     performing poorly for data above L2 cache size. Henceforth, adding
-     an upper bound threshold parameter to limit the usage of Enhanced
-     REP MOVSB operations and setting its value to L2 cache size.  */
-  if (cpu_features->basic.kind == arch_kind_amd)
-    rep_movsb_stop_threshold = core;
-  /* Setting the upper bound of ERMS to the computed value of
-     non-temporal threshold for architectures other than AMD.  */
-  else
-    rep_movsb_stop_threshold = non_temporal_threshold;
-
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
 
@@ -943,14 +931,32 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 
   TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
+  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
+     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
+     if that operation cannot overflow. Minimum of 0x4040 (16448) because the
+     L(large_memset_4x) loops need 64-byte to cache align and enough space for
+     at least 1 iteration of 4x PAGE_SIZE unrolled loop.  Both values are
+     reflected in the manual.  */
   TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
-			   0, SIZE_MAX);
+			   0x4040, SIZE_MAX >> 4);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
 			   minimum_rep_movsb_threshold, SIZE_MAX);
   TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
 			   SIZE_MAX);
 #endif
 
+  unsigned long int rep_movsb_stop_threshold;
+  /* ERMS feature is implemented from AMD Zen3 architecture and it is
+     performing poorly for data above L2 cache size. Henceforth, adding
+     an upper bound threshold parameter to limit the usage of Enhanced
+     REP MOVSB operations and setting its value to L2 cache size.  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_stop_threshold = core;
+  /* Setting the upper bound of ERMS to the computed value of
+     non-temporal threshold for architectures other than AMD.  */
+  else
+    rep_movsb_stop_threshold = non_temporal_threshold;
+
   cpu_features->data_cache_size = data;
   cpu_features->shared_cache_size = shared;
   cpu_features->non_temporal_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c
index a6cb32b1b..09cd72ab2 100644
--- a/sysdeps/x86/isa-level.c
+++ b/sysdeps/x86/isa-level.c
@@ -47,7 +47,8 @@
 # endif
 
 # if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
-     && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE
+     && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
+     && defined __BMI__ && defined __BMI2__
 /* NB: ISAs in x86-64 ISA level v3 are used.  */
 #  define ISA_V3	GNU_PROPERTY_X86_ISA_1_V3
 # else
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 61f1255bf..007a1eb13 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -111,7 +111,8 @@ enum cf_protection_level
 /* Local label name for asm code. */
 #ifndef L
 /* ELF-like local names start with `.L'.  */
-# define L(name)	.L##name
+# define LOCAL_LABEL(name) .L##name
+# define L(name)	LOCAL_LABEL(name)
 #endif
 
 #define atom_text_section .section ".text.atom", "ax"
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 09ed6fa0d..a3b14e72f 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -16,20 +16,35 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
+#include <stdint.h>
 #include <tst-string-rtm.h>
 
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME "wcsncmp"
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME "strncmp"
+#endif /* !WIDE */
+
+
+
 #define LOOP 3000
 #define STRING_SIZE 1024
-char string1[STRING_SIZE];
-char string2[STRING_SIZE];
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
 
 __attribute__ ((noinline, noclone))
 static int
 prepare (void)
 {
-  memset (string1, 'a', STRING_SIZE - 1);
-  memset (string2, 'a', STRING_SIZE - 1);
-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  MEMSET (string1, 'a', STRING_SIZE - 1);
+  MEMSET (string2, 'a', STRING_SIZE - 1);
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
     return EXIT_SUCCESS;
   else
     return EXIT_FAILURE;
@@ -39,7 +54,27 @@ __attribute__ ((noinline, noclone))
 static int
 function (void)
 {
-  if (strncmp (string1, string2, STRING_SIZE) == 0)
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
     return 0;
   else
     return 1;
@@ -48,5 +83,14 @@ function (void)
 static int
 do_test (void)
 {
-  return do_test_1 ("strncmp", LOOP, prepare, function);
+  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+  if (status != EXIT_SUCCESS)
+    return status;
+  return status;
 }
diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
new file mode 100644
index 000000000..bad3b8637
--- /dev/null
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
@@ -0,0 +1,21 @@
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
deleted file mode 100644
index f96d567fd..000000000
--- a/sysdeps/x86_64/bzero.S
+++ /dev/null
@@ -1 +0,0 @@
-/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 155ca36bd..4f7b0a546 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -339,11 +339,13 @@ and creates an unsatisfiable circular dependency.\n",
 #  endif
 	  /* Set to symbol size plus addend.  */
 	  value = sym->st_size;
+	  *reloc_addr = value + reloc->r_addend;
+	  break;
 # endif
-	  /* Fall through.  */
+
 	case R_X86_64_GLOB_DAT:
 	case R_X86_64_JUMP_SLOT:
-	  *reloc_addr = value + reloc->r_addend;
+	  *reloc_addr = value;
 	  break;
 
 # ifndef RESOLVE_CONFLICT_FIND_MAP
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index e02a53ea1..5718a7da8 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -18,395 +18,561 @@
 
 #include <sysdep.h>
 
+#ifdef USE_AS_WMEMCMP
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define SIZE_OFFSET	(0)
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+#endif
+
+#ifdef USE_AS_MEMCMPEQ
+# define SIZE_OFFSET	(0)
+# define CHECK_CMP(x, y)	subl x, y
+#else
+# ifndef SIZE_OFFSET
+#  define SIZE_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+# define CHECK_CMP(x, y)	cmpl x, y
+#endif
+
+#define VEC_SIZE	16
+#define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+#ifndef MEMCMP
+# define MEMCMP	memcmp
+#endif
+
 	.text
-ENTRY (memcmp)
-#ifdef __ILP32__
+ENTRY(MEMCMP)
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
+# endif
+#ifdef USE_AS_WMEMCMP
+	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+	   in ecx for code size. This is preferable to using `incw` as
+	   it avoids partial register stalls on older hardware (pre
+	   SnB).  */
+	movl	$0xffff, %ecx
 #endif
-	test	%RDX_LP, %RDX_LP
-	jz	L(finz)
-	cmpq	$1, %rdx
-	jbe	L(finr1b)
-	subq	%rdi, %rsi
-	movq	%rdx, %r10
-	cmpq	$32, %r10
-	jae	L(gt32)
-	/* Handle small chunks and last block of less than 32 bytes.  */
-L(small):
-	testq	$1, %r10
-	jz	L(s2b)
-	movzbl	(%rdi),	%eax
-	movzbl	(%rdi, %rsi), %edx
-	subq    $1, %r10
-	je	L(finz1)
-	addq	$1, %rdi
-	subl	%edx, %eax
-	jnz	L(exit)
-L(s2b):
-	testq	$2, %r10
-	jz	L(s4b)
-	movzwl	(%rdi),	%eax
-	movzwl	(%rdi, %rsi), %edx
-	subq    $2, %r10
-#ifdef USE_AS_MEMCMPEQ
-	je	L(finz1)
+	cmpq	$CHAR_PER_VEC, %rdx
+	ja	L(more_1x_vec)
+
+#ifdef USE_AS_WMEMCMP
+	/* saves a byte of code keeping the fall through path n = [2, 4]
+	   in the initial cache line.  */
+	decl	%edx
+	jle	L(cmp_0_1)
+
+	movq	(%rsi), %xmm0
+	movq	(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_0)
+
+	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_end_0_adj)
 #else
-	je	L(fin2_7)
+	cmpl	$8, %edx
+	ja	L(cmp_9_16)
+
+	cmpl	$4, %edx
+	jb	L(cmp_0_3)
+
+# ifdef USE_AS_MEMCMPEQ
+	movl	(%rsi), %eax
+	subl	(%rdi), %eax
+
+	movl	-4(%rsi, %rdx), %esi
+	subl	-4(%rdi, %rdx), %esi
+
+	orl	%esi, %eax
+	ret
+# else
+	/* Combine comparisons for lo and hi 4-byte comparisons.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	-4(%rdi, %rdx), %eax
+	shlq	$32, %rcx
+	shlq	$32, %rax
+	movl	(%rsi), %esi
+	movl	(%rdi), %edi
+	orq	%rsi, %rcx
+	orq	%rdi, %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4,, 10
+L(cmp_9_16):
+# ifdef USE_AS_MEMCMPEQ
+	movq	(%rsi), %rax
+	subq	(%rdi), %rax
+
+	movq	-8(%rsi, %rdx), %rcx
+	subq	-8(%rdi, %rdx), %rcx
+	orq	%rcx, %rax
+	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
+	   return long).  */
+	setnz	%cl
+	movzbl	%cl, %eax
+# else
+	movq	(%rsi), %rcx
+	movq	(%rdi), %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+
+	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
+	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	xorl	%eax, %eax
+# endif
 #endif
-	addq	$2, %rdi
-	cmpl	%edx, %eax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+	ret
+
+	.p2align 4,, 8
+L(cmp_0_1):
+	/* Flag set by earlier comparison against 1.  */
+	jne	L(cmp_0_0)
+#ifdef USE_AS_WMEMCMP
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(cmp_0_0)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 #else
-	jnz	L(fin2_7)
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
 #endif
-L(s4b):
-	testq	$4, %r10
-	jz	L(s8b)
-	movl	(%rdi),	%eax
-	movl	(%rdi, %rsi), %edx
-	subq    $4, %r10
-#ifdef USE_AS_MEMCMPEQ
-	je	L(finz1)
+	ret
+
+	/* Fits in aligning bytes.  */
+L(cmp_0_0):
+	xorl	%eax, %eax
+	ret
+
+#ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(ret_nonzero_vec_start_0):
+	bsfl	%eax, %eax
+	movl	(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
 #else
-	je	L(fin2_7)
+
+# ifndef USE_AS_MEMCMPEQ
+	.p2align 4,, 14
+L(ret_nonzero):
+	/* Need to bswap to get proper return without branch.  */
+	bswapq	%rcx
+	bswapq	%rax
+	subq	%rcx, %rax
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+# endif
+
+	.p2align 4
+L(cmp_0_3):
+# ifdef USE_AS_MEMCMPEQ
+	/* No reason to add to dependency chain on rdx. Saving a the
+	   bytes here doesn't change number of fetch blocks.  */
+	cmpl	$1, %edx
+	jbe	L(cmp_0_1)
+# else
+	/* We need the code size to prevent taking an extra fetch block.
+	 */
+	decl	%edx
+	jle	L(cmp_0_1)
+# endif
+	movzwl	(%rsi), %ecx
+	movzwl	(%rdi), %eax
+
+# ifdef USE_AS_MEMCMPEQ
+	subl	%ecx, %eax
+
+	movzbl	-1(%rsi, %rdx), %esi
+	movzbl	-1(%rdi, %rdx), %edi
+	subl	%edi, %esi
+	orl	%esi, %eax
+# else
+	bswapl	%ecx
+	bswapl	%eax
+
+	/* Implicit right shift by one. We just need to displace the
+	   sign bits.  */
+	shrl	%ecx
+	shrl	%eax
+
+	/* Eat a partial register stall here. Saves code stopping
+	   L(cmp_0_3) from bleeding into the next fetch block and saves
+	   an ALU.  */
+	movb	(%rsi, %rdx), %cl
+	movzbl	(%rdi, %rdx), %edi
+	orl	%edi, %eax
+	subl	%ecx, %eax
+# endif
+	ret
 #endif
-	addq	$4, %rdi
-	cmpl	%edx, %eax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+
+	.p2align 5
+L(more_1x_vec):
+#ifndef USE_AS_WMEMCMP
+	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+	   in ecx for code size. This is preferable to using `incw` as
+	   it avoids partial register stalls on older hardware (pre
+	   SnB).  */
+	movl	$0xffff, %ecx
+#endif
+	movups	(%rsi), %xmm0
+	movups	(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_0)
+#if SIZE_OFFSET == 0
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
 #else
-	jnz	L(fin2_7)
+	/* Offset rdx. Saves just enough code size to keep the
+	   L(last_2x_vec) case and the non-zero return in a single
+	   cache line.  */
+	subq	$(CHAR_PER_VEC * 2), %rdx
 #endif
-L(s8b):
-	testq	$8, %r10
-	jz	L(s16b)
-	movq	(%rdi),	%rax
-	movq	(%rdi, %rsi), %rdx
-	subq    $8, %r10
-#ifdef USE_AS_MEMCMPEQ
-	je	L(sub_return8)
+	ja	L(more_2x_vec)
+
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+#ifndef USE_AS_MEMCMPEQ
+	/* Don't use `incw ax` as machines this code runs on are liable
+	   to have partial register stall.  */
+	jnz	L(ret_nonzero_vec_end_0)
 #else
-	je	L(fin2_7)
+	/* Various return targets for memcmpeq. Will always be hot in
+	   Icache and get short encoding.  */
+L(ret_nonzero_vec_start_1):
+L(ret_nonzero_vec_start_0):
+L(ret_nonzero_vec_end_0):
 #endif
-	addq	$8, %rdi
-	cmpq	%rdx, %rax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+	ret
+
+#ifndef USE_AS_MEMCMPEQ
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(ret_nonzero_vec_end_0_adj):
+	addl	$3, %edx
+# else
+	.p2align 4,, 8
+# endif
+L(ret_nonzero_vec_end_0):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leal	(%rax, %rdx, CHAR_SIZE), %eax
+	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addl	%edx, %eax
+	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+# ifndef USE_AS_WMEMCMP
+	.p2align 4,, 10
+L(ret_nonzero_vec_start_0):
+	bsfl	%eax, %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+	ret
+# endif
 #else
-	jnz	L(fin2_7)
 #endif
-L(s16b):
-	movdqu    (%rdi), %xmm1
-	movdqu    (%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
+
+	.p2align 5
+L(more_2x_vec):
+	movups	(VEC_SIZE * 1)(%rsi), %xmm0
+	movups	(VEC_SIZE * 1)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_1)
+
+	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
+	jbe	L(last_2x_vec)
+
+	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
+	ja	L(more_8x_vec)
+
+	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
+	   This can harm performance if non-zero return in [65, 80] or
+	   [97, 112] but helps performance otherwise. Generally zero-
+	   return is hotter.  */
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 2)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 3)(%rsi), %xmm2
+	movups	(VEC_SIZE * 3)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
+	jnz	L(ret_nonzero_vec_start_2_3)
+
+	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
+	jbe	L(last_2x_vec)
+
+	movups	(VEC_SIZE * 4)(%rsi), %xmm0
+	movups	(VEC_SIZE * 4)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 5)(%rsi), %xmm2
+	movups	(VEC_SIZE * 5)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
 #ifdef USE_AS_MEMCMPEQ
-	pmovmskb  %xmm1, %eax
-	subl      $0xffff, %eax
+	jz	L(last_2x_vec)
 	ret
 #else
-	pmovmskb  %xmm1, %edx
-	xorl	  %eax, %eax
-	subl      $0xffff, %edx
-	jz	  L(finz)
-	bsfl      %edx, %ecx
-	leaq	 (%rdi, %rcx), %rcx
-	movzbl	 (%rcx), %eax
-	movzbl	 (%rsi, %rcx), %edx
-	jmp	 L(finz1)
+	jnz	L(ret_nonzero_vec_start_4_5)
 #endif
-	.p2align 4,, 4
-L(finr1b):
-	movzbl	(%rdi), %eax
-	movzbl  (%rsi), %edx
-L(finz1):
-	subl	%edx, %eax
-L(exit):
-	ret
+	.p2align 4
+L(last_2x_vec):
+	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	subl	%ecx, %eax
 #ifdef USE_AS_MEMCMPEQ
-	.p2align 4,, 4
-L(sub_return8):
-	subq	%rdx, %rax
-	movl	%eax, %edx
-	shrq	$32, %rax
-	orl	%edx, %eax
+	/* Various return targets for memcmpeq. Will always be hot in
+	   Icache and get short encoding.  */
+L(ret_nonzero_vec_start_2_3):
+L(ret_nonzero_vec_start_4_5):
 	ret
 #else
-	.p2align 4,, 4
-L(fin2_7):
-	cmpq	%rdx, %rax
-	jz	L(finz)
-	movq	%rax, %r11
-	subq	%rdx, %r11
-	bsfq	%r11, %rcx
-	sarq	$3, %rcx
-	salq	$3, %rcx
-	sarq	%cl, %rax
-	movzbl  %al, %eax
-	sarq	%cl, %rdx
-	movzbl  %dl, %edx
-	subl	%edx, %eax
+	jnz	L(ret_nonzero_vec_end_1)
 	ret
-#endif
-	.p2align 4,, 4
-L(finz):
-	xorl	%eax, %eax
+
+	.p2align 4,, 8
+L(ret_nonzero_vec_end_1):
+	pmovmskb %xmm1, %ecx
+	/* High 16 bits of eax guranteed to be all ones. Rotate them in
+	   to we can do `or + not` with just `xor`.  */
+	rorl	$16, %eax
+	xorl	%ecx, %eax
+	/* Partial register stall.  */
+
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leal	(%rax, %rdx, CHAR_SIZE), %eax
+	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addl	%edx, %eax
+	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
-#ifdef USE_AS_MEMCMPEQ
-	.p2align 4,, 4
-L(neq_early):
-	movl	$1, %eax
+
+	.p2align 4
+L(ret_nonzero_vec_start_4_5):
+	pmovmskb %xmm1, %edx
+	sall	$16, %eax
+	leal	1(%rax, %rdx), %eax
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(ret_nonzero_vec_start_1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 #endif
-	/* For blocks bigger than 32 bytes
-	   1. Advance one of the addr pointer to be 16B aligned.
-	   2. Treat the case of both addr pointers aligned to 16B
-	      separately to avoid movdqu.
-	   3. Handle any blocks of greater than 64 consecutive bytes with
-	      unrolling to reduce branches.
-	   4. At least one addr pointer is 16B aligned, use memory version
-	      of pcmbeqb.
-	*/
-	.p2align 4,, 4
-L(gt32):
-	movq	%rdx, %r11
-	addq	%rdi, %r11
-	movq	%rdi, %r8
-
-	andq	$15, %r8
-	jz	L(16am)
-	/* Both pointers may be misaligned.  */
-	movdqu	(%rdi),	%xmm1
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
-	pmovmskb  %xmm1, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	neg	 %r8
-	leaq    16(%rdi, %r8), %rdi
-L(16am):
-	/* Handle two 16B aligned pointers separately.  */
-	testq   $15, %rsi
-	jz      L(ATR)
-	testq	$16, %rdi
-	jz	L(A32)
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq	$16, %rdi
-L(A32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-	/* Pre-unroll to be ready for unrolled 64B loop.  */
-	testq	$32, %rdi
-	jz	L(A64)
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(A64):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt32)
-
-L(A64main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A64main)
-
-L(mt32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-
-L(A32main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A32main)
-L(mt16):
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-
-	.p2align 4,, 4
-L(neq):
-#ifdef USE_AS_MEMCMPEQ
-	movl	$1, %eax
-    ret
-#else
-	bsfl      %edx, %ecx
-	movzbl	 (%rdi, %rcx), %eax
-	addq	 %rdi, %rsi
-	movzbl	 (%rsi,%rcx), %edx
-	jmp	 L(finz1)
+
+	.p2align 4
+L(more_8x_vec):
+	subq	%rdi, %rsi
+	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
+	andq	$(VEC_SIZE * -1), %rdi
+	addq	%rdi, %rsi
+	.p2align 4
+L(loop_4x):
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 3)(%rsi), %xmm1
+
+	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
+	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1
+
+	movups	(VEC_SIZE * 4)(%rsi), %xmm2
+	movups	(VEC_SIZE * 5)(%rsi), %xmm3
+
+	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
+	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3
+
+	pand	%xmm0, %xmm1
+	pand	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_loop)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+	cmpq	%rdi, %rdx
+	ja	L(loop_4x)
+	/* Get remaining length in edx.  */
+	subl	%edi, %edx
+	/* Restore offset so we can reuse L(last_2x_vec).  */
+	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
+#ifdef USE_AS_WMEMCMP
+	shrl	$2, %edx
 #endif
+	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
+	jbe	L(last_2x_vec)
+
 
-	.p2align 4,, 4
-L(ATR):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-	testq	$16, %rdi
-	jz	L(ATR32)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	je       L(mt16)
-
-L(ATR32):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	testq	$32, %rdi
-	jz	L(ATR64)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(ATR64):
-	cmpq       %rdi, %r10
-	je	   L(mt32)
-
-L(ATR64main):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	jne       L(ATR64main)
-
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-
-L(ATR32res):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq	  %r10, %rdi
-	jne       L(ATR32res)
-
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-	/* Align to 16byte to improve instruction fetch.  */
-	.p2align 4,, 4
-END(memcmp)
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 2)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 3)(%rsi), %xmm2
+	movups	(VEC_SIZE * 3)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
 
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
+	jz	L(last_2x_vec)
 #ifdef USE_AS_MEMCMPEQ
-libc_hidden_def (memcmp)
+L(ret_nonzero_loop):
+	ret
 #else
-# undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+
+	.p2align 4
+L(ret_nonzero_vec_start_2_3):
+	pmovmskb %xmm1, %edx
+	sall	$16, %eax
+	leal	1(%rax, %rdx), %eax
+
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(ret_nonzero_loop):
+	pmovmskb %xmm0, %ecx
+	pmovmskb %xmm1, %edx
+	sall	$(VEC_SIZE * 1), %edx
+	leal	1(%rcx, %rdx), %edx
+	pmovmskb %xmm2, %ecx
+	/* High 16 bits of eax guranteed to be all ones. Rotate them in
+	   to we can do `or + not` with just `xor`.  */
+	rorl	$16, %eax
+	xorl	%ecx, %eax
+
+	salq	$32, %rax
+	orq	%rdx, %rax
+
+	bsfq	%rax, %rax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+#endif
+END(MEMCMP)
+
+#ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_MEMCMPEQ
+libc_hidden_def (MEMCMP)
+# else
+#  undef bcmp
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
+# endif
 #endif
diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S
index 2cee881fe..80c5e912a 100644
--- a/sysdeps/x86_64/memcmpeq.S
+++ b/sysdeps/x86_64/memcmpeq.S
@@ -16,6 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define memcmp	__memcmpeq
+#define MEMCMP	__memcmpeq
 #define USE_AS_MEMCMPEQ	1
 #include "multiarch/memcmp-sse2.S"
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index d1a9f4791..b0dffd2ae 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -18,362 +18,333 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard gurantees any pointer have
+	   at least one-valid byte this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
+	   into 2-cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   it adds more frontend uops (even if the moves can be eliminated) and
+	   some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
+	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
+	   lines.  Naturally aligned % 16 to 8-bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in aliging bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 65c09bd0a..a6eea61a4 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -1,4 +1,4 @@
-/* memset/bzero -- set memory area to CH/0
+/* memset -- set memory area to CH/0
    Optimized version for x86-64.
    Copyright (C) 2002-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -28,17 +28,23 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 044778585..3cc1a7e0d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,127 +1,199 @@
 ifeq ($(subdir),string)
 
-sysdep_routines += strncat-c stpncpy-c strncpy-c \
-		   strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3  \
-		   strcmp-sse4_2 strcmp-avx2 \
-		   strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
-		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
-		   memrchr-sse2 memrchr-avx2 \
-		   memcmp-sse2 \
-		   memcmpeq-sse2 \
-		   memcmp-avx2-movbe \
-		   memcmpeq-avx2 \
-		   memcmp-sse4 memcpy-ssse3 \
-		   memmove-ssse3 \
-		   memcpy-ssse3-back \
-		   memmove-ssse3-back \
-		   memmove-avx512-no-vzeroupper \
-		   strcasecmp_l-sse2 strcasecmp_l-ssse3 \
-		   strcasecmp_l-sse4_2 strcasecmp_l-avx \
-		   strncase_l-sse2 strncase_l-ssse3 \
-		   strncase_l-sse4_2 strncase_l-avx \
-		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
-		   strrchr-sse2 strrchr-avx2 \
-		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
-		   strcat-avx2 strncat-avx2 \
-		   strcat-ssse3 strncat-ssse3\
-		   strcpy-avx2 strncpy-avx2 \
-		   strcpy-sse2 stpcpy-sse2 \
-		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
-		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
-		   stpcpy-avx2 stpncpy-avx2 \
-		   strcat-sse2 \
-		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
-		   strcspn-sse2 strpbrk-sse2 strspn-sse2 \
-		   strcspn-c strpbrk-c strspn-c varshift \
-		   memset-avx512-no-vzeroupper \
-		   memmove-sse2-unaligned-erms \
-		   memmove-avx-unaligned-erms \
-		   memmove-avx512-unaligned-erms \
-		   memset-sse2-unaligned-erms \
-		   memset-avx2-unaligned-erms \
-		   memset-avx512-unaligned-erms \
-		   memchr-avx2-rtm \
-		   memcmp-avx2-movbe-rtm \
-		   memcmpeq-avx2-rtm \
-		   memmove-avx-unaligned-erms-rtm \
-		   memrchr-avx2-rtm \
-		   memset-avx2-unaligned-erms-rtm \
-		   rawmemchr-avx2-rtm \
-		   strchr-avx2-rtm \
-		   strcmp-avx2-rtm \
-		   strchrnul-avx2-rtm \
-		   stpcpy-avx2-rtm \
-		   stpncpy-avx2-rtm \
-		   strcat-avx2-rtm \
-		   strcpy-avx2-rtm \
-		   strlen-avx2-rtm \
-		   strncat-avx2-rtm \
-		   strncmp-avx2-rtm \
-		   strncpy-avx2-rtm \
-		   strnlen-avx2-rtm \
-		   strrchr-avx2-rtm \
-		   memchr-evex \
-		   memcmp-evex-movbe \
-		   memcmpeq-evex \
-		   memmove-evex-unaligned-erms \
-		   memrchr-evex \
-		   memset-evex-unaligned-erms \
-		   rawmemchr-evex \
-		   stpcpy-evex \
-		   stpncpy-evex \
-		   strcat-evex \
-		   strchr-evex \
-		   strchrnul-evex \
-		   strcmp-evex \
-		   strcpy-evex \
-		   strlen-evex \
-		   strncat-evex \
-		   strncmp-evex \
-		   strncpy-evex \
-		   strnlen-evex \
-		   strrchr-evex \
-		   memchr-evex-rtm \
-		   rawmemchr-evex-rtm
+sysdep_routines += \
+  memchr-avx2 \
+  memchr-avx2-rtm \
+  memchr-evex \
+  memchr-evex-rtm \
+  memchr-sse2 \
+  memcmp-avx2-movbe \
+  memcmp-avx2-movbe-rtm \
+  memcmp-evex-movbe \
+  memcmp-sse2 \
+  memcmp-ssse3 \
+  memcmpeq-avx2 \
+  memcmpeq-avx2-rtm \
+  memcmpeq-evex \
+  memcmpeq-sse2 \
+  memcpy-ssse3 \
+  memcpy-ssse3-back \
+  memmove-avx-unaligned-erms \
+  memmove-avx-unaligned-erms-rtm \
+  memmove-avx512-no-vzeroupper \
+  memmove-avx512-unaligned-erms \
+  memmove-erms \
+  memmove-evex-unaligned-erms \
+  memmove-sse2-unaligned-erms \
+  memmove-ssse3 \
+  memmove-ssse3-back \
+  memrchr-avx2 \
+  memrchr-avx2-rtm \
+  memrchr-evex \
+  memrchr-sse2 \
+  memset-avx2-unaligned-erms \
+  memset-avx2-unaligned-erms-rtm \
+  memset-avx512-no-vzeroupper \
+  memset-avx512-unaligned-erms \
+  memset-erms \
+  memset-evex-unaligned-erms \
+  memset-sse2-unaligned-erms \
+  rawmemchr-avx2 \
+  rawmemchr-avx2-rtm \
+  rawmemchr-evex \
+  rawmemchr-evex-rtm \
+  rawmemchr-sse2 \
+  stpcpy-avx2 \
+  stpcpy-avx2-rtm \
+  stpcpy-evex \
+  stpcpy-sse2 \
+  stpcpy-sse2-unaligned \
+  stpcpy-ssse3 \
+  stpncpy-avx2 \
+  stpncpy-avx2-rtm \
+  stpncpy-c \
+  stpncpy-evex \
+  stpncpy-sse2-unaligned \
+  stpncpy-ssse3 \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
+  strcasecmp_l-sse2 \
+  strcasecmp_l-sse4_2 \
+  strcasecmp_l-ssse3 \
+  strcat-avx2 \
+  strcat-avx2-rtm \
+  strcat-evex \
+  strcat-sse2 \
+  strcat-sse2-unaligned \
+  strcat-ssse3 \
+  strchr-avx2 \
+  strchr-avx2-rtm \
+  strchr-evex \
+  strchr-sse2 \
+  strchr-sse2-no-bsf \
+  strchrnul-avx2 \
+  strchrnul-avx2-rtm \
+  strchrnul-evex \
+  strchrnul-sse2 \
+  strcmp-avx2 \
+  strcmp-avx2-rtm \
+  strcmp-evex \
+  strcmp-sse2 \
+  strcmp-sse2-unaligned \
+  strcmp-sse4_2 \
+  strcmp-ssse3 \
+  strcpy-avx2 \
+  strcpy-avx2-rtm \
+  strcpy-evex \
+  strcpy-sse2 \
+  strcpy-sse2-unaligned \
+  strcpy-ssse3 \
+  strcspn-c \
+  strcspn-sse2 \
+  strlen-avx2 \
+  strlen-avx2-rtm \
+  strlen-evex \
+  strlen-evex512 \
+  strlen-sse2 \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
+  strncase_l-evex \
+  strncase_l-sse2 \
+  strncase_l-sse4_2 \
+  strncase_l-ssse3 \
+  strncat-avx2 \
+  strncat-avx2-rtm \
+  strncat-c \
+  strncat-evex \
+  strncat-sse2-unaligned \
+  strncat-ssse3 \
+  strncmp-avx2 \
+  strncmp-avx2-rtm \
+  strncmp-evex \
+  strncmp-sse2 \
+  strncmp-sse4_2 \
+  strncmp-ssse3 \
+  strncpy-avx2 \
+  strncpy-avx2-rtm \
+  strncpy-c \
+  strncpy-evex \
+  strncpy-sse2-unaligned \
+  strncpy-ssse3 \
+  strnlen-avx2 \
+  strnlen-avx2-rtm \
+  strnlen-evex \
+  strnlen-evex512 \
+  strnlen-sse2 \
+  strpbrk-c \
+  strpbrk-sse2 \
+  strrchr-avx2 \
+  strrchr-avx2-rtm \
+  strrchr-evex \
+  strrchr-sse2 \
+  strspn-c \
+  strspn-sse2 \
+  strstr-avx512 \
+  strstr-sse2-unaligned \
+  varshift \
+# sysdep_routines
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif
 
 ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
-		   wmemcmp-avx2-movbe \
-		   wmemchr-sse2 wmemchr-avx2 \
-		   wcscmp-sse2 wcscmp-avx2 \
-		   wcsncmp-sse2 wcsncmp-avx2 \
-		   wcscpy-ssse3 wcscpy-c \
-		   wcschr-sse2 wcschr-avx2 \
-		   wcsrchr-sse2 wcsrchr-avx2 \
-		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
-		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
-		   wcschr-avx2-rtm \
-		   wcscmp-avx2-rtm \
-		   wcslen-avx2-rtm \
-		   wcsncmp-avx2-rtm \
-		   wcsnlen-avx2-rtm \
-		   wcsrchr-avx2-rtm \
-		   wmemchr-avx2-rtm \
-		   wmemcmp-avx2-movbe-rtm \
-		   wcschr-evex \
-		   wcscmp-evex \
-		   wcslen-evex \
-		   wcsncmp-evex \
-		   wcsnlen-evex \
-		   wcsrchr-evex \
-		   wmemchr-evex \
-		   wmemcmp-evex-movbe \
-		   wmemchr-evex-rtm
+sysdep_routines += \
+  wcschr-avx2 \
+  wcschr-avx2-rtm \
+  wcschr-evex \
+  wcschr-sse2 \
+  wcscmp-avx2 \
+  wcscmp-avx2-rtm \
+  wcscmp-evex \
+  wcscmp-sse2 \
+  wcscpy-c \
+  wcscpy-ssse3 \
+  wcslen-avx2 \
+  wcslen-avx2-rtm \
+  wcslen-evex \
+  wcslen-evex512 \
+  wcslen-sse2 \
+  wcslen-sse4_1 \
+  wcsncmp-avx2 \
+  wcsncmp-avx2-rtm \
+  wcsncmp-evex \
+  wcsncmp-sse2 \
+  wcsnlen-avx2 \
+  wcsnlen-avx2-rtm \
+  wcsnlen-c \
+  wcsnlen-evex \
+  wcsnlen-evex512 \
+  wcsnlen-sse4_1 \
+  wcsrchr-avx2 \
+  wcsrchr-avx2-rtm \
+  wcsrchr-evex \
+  wcsrchr-sse2 \
+  wmemchr-avx2 \
+  wmemchr-avx2-rtm \
+  wmemchr-evex \
+  wmemchr-evex-rtm \
+  wmemchr-sse2 \
+  wmemcmp-avx2-movbe \
+  wmemcmp-avx2-movbe-rtm \
+  wmemcmp-evex-movbe \
+  wmemcmp-sse2 \
+  wmemcmp-ssse3 \
+# sysdep_routines
 endif
 
 ifeq ($(subdir),debug)
-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
-		   memmove_chk-nonshared memset_chk-nonshared \
-		   wmemset_chk-nonshared
+sysdep_routines += \
+  memcpy_chk-nonshared \
+  memmove_chk-nonshared \
+  mempcpy_chk-nonshared \
+  memset_chk-nonshared \
+  wmemset_chk-nonshared \
+# sysdep_routines
 endif
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
new file mode 100644
index 000000000..6ca9f5e6b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX_RTM_VECS_H
+#define _AVX_RTM_VECS_H			1
+
+#define COND_VZEROUPPER			COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM			1
+#include "avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p)				p##.avx.rtm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
new file mode 100644
index 000000000..89680f5db
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "vec-macros.h"
+
+#define USE_WITH_AVX		1
+#define SECTION(p)			p##.avx
+
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VEC_xmm				VEC_any_xmm
+#define VEC					VEC_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
deleted file mode 100644
index 639f02bde..000000000
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <sysdep.h>
-
-	.text
-ENTRY(bcopy)
-	xchg	%rdi, %rsi
-	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
-END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
new file mode 100644
index 000000000..99806ebcd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX_VECS_COMMON_H
+#define _EVEX_VECS_COMMON_H			1
+
+#include "vec-macros.h"
+
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+#define VEC_xmm				VEC_hi_xmm
+#define VEC_ymm				VEC_hi_ymm
+#define VEC_zmm				VEC_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
new file mode 100644
index 000000000..222ba46dc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			32
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX256	1
+#define SECTION(p)			p##.evex
+
+#define VEC					VEC_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
new file mode 100644
index 000000000..d1784d536
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			64
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX512	1
+#define SECTION(p)			p##.evex512
+
+#define VEC					VEC_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 68a56797d..e97218f62 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -337,6 +335,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -355,6 +358,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -395,8 +403,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_avx)
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_sse42)
@@ -407,9 +423,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_l_sse42)
@@ -535,8 +559,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_avx)
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_sse42)
@@ -548,9 +580,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_l_sse42)
@@ -611,6 +651,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
+              IFUNC_IMPL_ADD (array, i, strstr,
+                              (CPU_FEATURE_USABLE (AVX512VL)
+                               && CPU_FEATURE_USABLE (AVX512BW)
+                               && CPU_FEATURE_USABLE (AVX512DQ)
+                               && CPU_FEATURE_USABLE (BMI2)),
+                              __strstr_avx512)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
@@ -702,6 +748,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -723,6 +774,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
@@ -768,8 +824,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
@@ -996,6 +1050,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wmemset_chk_avx2_unaligned_rtm)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd1261369..4518b0f98 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
 	return OPTIMIZE (avx2_movbe);
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
-    return OPTIMIZE (sse4_1);
-
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac..766539c24 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -22,15 +22,28 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
-    return OPTIMIZE (avx);
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
index 87b076c7c..c4d71938c 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMCHR __memchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 75bd7262e..c5a256eb3 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -57,7 +57,7 @@
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 #  ifdef __ILP32__
@@ -87,12 +87,14 @@ ENTRY (MEMCHR)
 # endif
 	testl	%eax, %eax
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
 
 # ifndef USE_AS_RAWMEMCHR
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
@@ -100,58 +102,31 @@ L(first_vec_x0):
 	/* NB: Multiply length by 4 to get byte count.  */
 	sall	$2, %edx
 #  endif
-	xorl	%ecx, %ecx
+    COND_VZEROUPPER
+	/* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+	   block. branch here as opposed to cmovcc is not that costly. Common
+	   usage of memchr is to check if the return was NULL (if string was
+	   known to contain CHAR user would use rawmemchr). This branch will be
+	   highly correlated with the user branch and can be used by most
+	   modern branch predictors to predict the user branch.  */
 	cmpl	%eax, %edx
-	leaq	(%rdi, %rax), %rax
-	cmovle	%rcx, %rax
-	VZEROUPPER_RETURN
-
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
-	   and rdi for rawmemchr.  */
-	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
-	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Calculate length until end of page (length checked for a
-	   match).  */
-	leaq	1(%ALGN_PTR_REG), %rsi
-	subq	%RRAW_PTR_REG, %rsi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get wchar_t count.  */
-	shrl	$2, %esi
-#  endif
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+    jle  L(null)
+	addq	%rdi, %rax
+    ret
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	addq	%RRAW_PTR_REG, %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+	/* First in aligning bytes here.  */
+L(null):
+	xorl	%eax, %eax
+	ret
+# endif
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
@@ -340,7 +315,7 @@ L(first_vec_x1_check):
 	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
-	.p2align 4
+	.p2align 4,, 6
 L(set_zero_end):
 	xorl	%eax, %eax
 	VZEROUPPER_RETURN
@@ -428,5 +403,39 @@ L(last_vec_x3):
 	VZEROUPPER_RETURN
 # endif
 
+	.p2align 4
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary for
+	   computer return address if byte is found or adjusting length if it
+	   is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
+# endif
+	/* Remove the leading bytes.  */
+	sarxl	%ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	bsfl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+	VZEROUPPER_RETURN
+
+
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index cfaf02907..0fd11b763 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 5
+	.p2align 4
 L(first_vec_x0):
-	/* Check if first match was before length.  */
-	tzcntl	%eax, %eax
-	xorl	%ecx, %ecx
-	cmpl	%eax, %edx
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-	cmovle	%rcx, %rax
+	/* Check if first match was before length. NB: tzcnt has false data-
+	   dependency on destination. eax already had a data-dependency on esi
+	   so this should have no affect here.  */
+	tzcntl	%eax, %esi
+#  ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
+#  else
+	addq	%rsi, %rdi
+#  endif
+	xorl	%eax, %eax
+	cmpl	%esi, %edx
+	cmovg	%rdi, %rax
 	ret
-# else
-	/* NB: first_vec_x0 is 17 bytes which will leave
-	   cross_page_boundary (which is relatively cold) close enough
-	   to ideal alignment. So only realign L(cross_page_boundary) if
-	   rawmemchr.  */
-	.p2align 4
 # endif
+
+	.p2align 4
 L(cross_page_boundary):
 	/* Save pointer before aligning as its original value is
 	   necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
 L(zero_end):
 	ret
 
+L(set_zero_end):
+	xorl	%eax, %eax
+	ret
 
 	.p2align 4
 L(first_vec_x1_check):
-	tzcntl	%eax, %eax
+	/* eax must be non-zero. Use bsfl to save code size.  */
+	bsfl	%eax, %eax
 	/* Adjust length.  */
 	subl	$-(CHAR_PER_VEC * 4), %edx
 	/* Check if match within remaining length.  */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 	ret
-L(set_zero_end):
-	xorl	%eax, %eax
-	ret
 
 	.p2align 4
 L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
 # endif
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 # if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
 # endif
 
 # ifndef USE_AS_RAWMEMCHR
+	.p2align 4,, 5
 L(last_4x_vec_or_less_cmpeq):
 	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 	kmovd	%k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
 #  endif
 	andl	%ecx, %eax
 	jz	L(zero_end2)
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 L(zero_end2):
 	ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 # endif
-
+	/* 7 bytes from next cache line.  */
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index a34ea1645..210c9925b 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jb	L(between_2_3)
 
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
 	/* No ymm register was touched.  */
 	ret
 
@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched.  */
 	ret
 
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
 	.p2align 4
 L(between_8_15):
-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
+# endif
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
 
 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else
 
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
 	/* No ymm register was touched.  */
 	ret
 # endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S
index e10555638..4080fc187 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S
@@ -17,8 +17,8 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# ifndef memcmp
-#  define memcmp __memcmp_sse2
+# ifndef MEMCMP
+#  define MEMCMP __memcmp_sse2
 # endif
 
 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index cd57c1e2c..000000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,803 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ	pcmpeqd
-# define CHAR_SIZE	4
-#else
-# define CMPEQ	pcmpeqb
-# define CHAR_SIZE	1
-#endif
-
-
-/* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-	.section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	cmp	$79, %RDX_LP
-	ja	L(79bytesormore)
-
-	cmp	$CHAR_SIZE, %RDX_LP
-	jbe	L(firstbyte)
-
-	/* N in (CHAR_SIZE, 79) bytes.  */
-	cmpl	$32, %edx
-	ja	L(more_32_bytes)
-
-	cmpl	$16, %edx
-	jae	L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
-	cmpl	$8, %edx
-	jae	L(8_to_16_bytes)
-
-	cmpl	$4, %edx
-	jb	L(2_to_3_bytes)
-
-	movl	(%rdi), %eax
-	movl	(%rsi), %ecx
-
-	bswap	%eax
-	bswap	%ecx
-
-	shlq	$32, %rax
-	shlq	$32, %rcx
-
-	movl	-4(%rdi, %rdx), %edi
-	movl	-4(%rsi, %rdx), %esi
-
-	bswap	%edi
-	bswap	%esi
-
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(2_to_3_bytes):
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	subl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(8_to_16_bytes):
-	movq	(%rdi), %rax
-	movq	(%rsi), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-	jne	L(8_to_16_bytes_done)
-
-	movq	-8(%rdi, %rdx), %rax
-	movq	-8(%rsi, %rdx), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-
-L(8_to_16_bytes_done):
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-# else
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	4(%rdi), %ecx
-	cmpl	4(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	-4(%rdi, %rdx), %ecx
-	cmpl	-4(%rsi, %rdx), %ecx
-	jne	L(8_to_16_bytes_done)
-	ret
-# endif
-
-	.p2align 4,, 3
-L(ret_zero):
-	xorl	%eax, %eax
-L(zero):
-	ret
-
-	.p2align 4,, 8
-L(firstbyte):
-	jb	L(ret_zero)
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-L(8_to_16_bytes_done):
-	setg	%al
-	leal	-1(%rax, %rax), %eax
-# else
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	sub	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_48):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin_32):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	32(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	32(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	32(%rsi, %rax), %ecx
-	movzbl	32(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_16):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rsi, %rax), %ecx
-	movzbl	(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_end_16):
-	subl	$16, %edx
-L(vec_return_end):
-	bsfl	%eax, %eax
-	addl	%edx, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	-16(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	-16(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	-16(%rsi, %rax), %ecx
-	movzbl	-16(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4,, 8
-L(more_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm0
-	movdqu	16(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	cmpl	$64, %edx
-	jbe	L(32_to_64_bytes)
-	movdqu	32(%rdi), %xmm0
-	movdqu	32(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	.p2align 4,, 6
-L(32_to_64_bytes):
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(16_to_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-
-	.p2align 4
-L(79bytesormore):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-
-	mov	%rsi, %rcx
-	and	$-16, %rsi
-	add	$16, %rsi
-	sub	%rsi, %rcx
-
-	sub	%rcx, %rdi
-	add	%rcx, %rdx
-	test	$0xf, %rdi
-	jz	L(2aligned)
-
-	cmp	$128, %rdx
-	ja	L(128bytesormore)
-
-	.p2align 4,, 6
-L(less128bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(last_64_bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormore):
-	cmp	$256, %rdx
-	ja	L(unaligned_loop)
-L(less256bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytes)
-
-	cmp	$32, %rdx
-	ja	L(last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_unaligned)
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loop):
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loop)
-
-	.p2align 4,, 6
-L(loop_tail):
-	addq	%rdx, %rdi
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	addq	%rdx, %rsi
-	movdqu	(%rsi), %xmm4
-	movdqu	16(%rsi), %xmm5
-	movdqu	32(%rsi), %xmm6
-	movdqu	48(%rsi), %xmm7
-
-	CMPEQ	%xmm4, %xmm0
-	CMPEQ	%xmm5, %xmm1
-	CMPEQ	%xmm6, %xmm2
-	CMPEQ	%xmm7, %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	ret
-
-L(L2_L3_cache_unaligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(L2_L3_unaligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-
-	/* This case is for machines which are sensitive for unaligned
-	 * instructions.  */
-	.p2align 4
-L(2aligned):
-	cmp	$128, %rdx
-	ja	L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(aligned_last_64_bytes):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormorein2aligned):
-	cmp	$256, %rdx
-	ja	L(aligned_loop)
-L(less256bytesin2alinged):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	ja	L(aligned_last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_aligned)
-
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loopin2aligned):
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loopin2aligned)
-	jmp	L(loop_tail)
-
-L(L2_L3_cache_aligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_aligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	addq	$64, %rsi
-	addq	$64, %rdi
-	subq	$64, %rdx
-	ja	L(L2_L3_aligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-	.p2align 4
-L(64bytesormore_loop_end):
-	pmovmskb %xmm0, %ecx
-	incw	%cx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm1, %ecx
-	notw	%cx
-	sall	$16, %ecx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm2, %ecx
-	notw	%cx
-	shlq	$32, %rcx
-	jnz	L(loop_end_ret)
-
-	addq	$48, %rdi
-	addq	$48, %rsi
-	movq	%rax, %rcx
-
-	.p2align 4,, 6
-L(loop_end_ret):
-	bsfq	%rcx, %rcx
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rcx), %eax
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rcx), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
index b80a29d4b..9d991e5c7 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
@@ -16,8 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#ifndef memcmp
-# define memcmp	__memcmpeq_sse2
+#if IS_IN (libc)
+# define MEMCMP	__memcmpeq_sse2
+#else
+# define MEMCMP	__memcmpeq
 #endif
 #define USE_AS_MEMCMPEQ	1
 #include "memcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 000000000..2d3a6ccb7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,72 @@
+/* memcpy/mempcpy/memmove implement with rep movsb
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	.text
+ENTRY (__mempcpy_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_erms)
+
+/* Only used to measure performance of REP MOVSB.  */
+ENTRY (__mempcpy_erms)
+	mov	%RDI_LP, %RAX_LP
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	2f
+	add	%RDX_LP, %RAX_LP
+	jmp	L(start_movsb)
+END (__mempcpy_erms)
+
+ENTRY (__memmove_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_erms)
+
+ENTRY (__memmove_erms)
+	movq	%rdi, %rax
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	2f
+L(start_movsb):
+	mov	%RDX_LP, %RCX_LP
+	cmp	%RSI_LP, %RDI_LP
+	jb	1f
+	/* Source == destination is less common.  */
+	je	2f
+	lea	(%rsi,%rcx), %RDX_LP
+	cmp	%RDX_LP, %RDI_LP
+	jb	L(movsb_backward)
+1:
+	rep movsb
+2:
+	ret
+L(movsb_backward):
+	leaq	-1(%rdi,%rcx), %rdi
+	leaq	-1(%rsi,%rcx), %rsi
+	std
+	rep movsb
+	cld
+	ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index af51177d5..04747133b 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -118,7 +118,13 @@
 # define LARGE_LOAD_SIZE (VEC_SIZE * 4)
 #endif
 
-/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+/* Amount to shift __x86_shared_non_temporal_threshold by for
+   bound for memcpy_large_4x. This is essentially use to to
+   indicate that the copy is far beyond the scope of L3
+   (assuming no user config x86_non_temporal_threshold) and to
+   use a more aggressively unrolled loop.  NB: before
+   increasing the value also update initialization of
+   x86_non_temporal_threshold.  */
 #ifndef LOG_4X_MEMCPY_THRESH
 # define LOG_4X_MEMCPY_THRESH 4
 #endif
@@ -233,56 +239,6 @@ L(start):
 #endif
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__mempcpy_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_erms)
-
-/* Only used to measure performance of REP MOVSB.  */
-ENTRY (__mempcpy_erms)
-	mov	%RDI_LP, %RAX_LP
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	2f
-	add	%RDX_LP, %RAX_LP
-	jmp	L(start_movsb)
-END (__mempcpy_erms)
-
-ENTRY (__memmove_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_erms)
-
-ENTRY (__memmove_erms)
-	movq	%rdi, %rax
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jz	2f
-L(start_movsb):
-	mov	%RDX_LP, %RCX_LP
-	cmp	%RSI_LP, %RDI_LP
-	jb	1f
-	/* Source == destination is less common.  */
-	je	2f
-	lea	(%rsi,%rcx), %RDX_LP
-	cmp	%RDX_LP, %RDI_LP
-	jb	L(movsb_backward)
-1:
-	rep movsb
-2:
-	ret
-L(movsb_backward):
-	leaq	-1(%rdi,%rcx), %rdi
-	leaq	-1(%rsi,%rcx), %rsi
-	std
-	rep movsb
-	cld
-	ret
-END (__memmove_erms)
-strong_alias (__memmove_erms, __memcpy_erms)
-strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
-# endif
 
 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
@@ -724,9 +680,14 @@ L(skip_short_movsb_check):
 	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 L(large_memcpy_2x_check):
-	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
-	jb	L(more_8x_vec_check)
+	/* Entry from L(large_memcpy_2x) has a redundant load of
+	   __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x)
+	   is only use for the non-erms memmove which is generally less
+	   common.  */
 L(large_memcpy_2x):
+	mov	__x86_shared_non_temporal_threshold(%rip), %R11_LP
+	cmp	%R11_LP, %RDX_LP
+	jb	L(more_8x_vec_check)
 	/* To reach this point it is impossible for dst > src and
 	   overlap. Remaining to check is src > dst and overlap. rcx
 	   already contains dst - src. Negate rcx to get src - dst. If
@@ -774,18 +735,21 @@ L(large_memcpy_2x):
 	/* ecx contains -(dst - src). not ecx will return dst - src - 1
 	   which works for testing aliasing.  */
 	notl	%ecx
+	movq	%rdx, %r10
 	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 	jz	L(large_memcpy_4x)
 
-	movq	%rdx, %r10
-	shrq	$LOG_4X_MEMCPY_THRESH, %r10
-	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
+	   by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.
+	 */
+	shlq	$LOG_4X_MEMCPY_THRESH, %r11
+	cmp	%r11, %rdx
 	jae	L(large_memcpy_4x)
 
 	/* edx will store remainder size for copying tail.  */
 	andl	$(PAGE_SIZE * 2 - 1), %edx
 	/* r10 stores outer loop counter.  */
-	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	shrq	$(LOG_PAGE_SIZE + 1), %r10
 	/* Copy 4x VEC at a time from 2 pages.  */
 	.p2align 4
 L(loop_large_memcpy_2x_outer):
@@ -850,7 +814,6 @@ L(large_memcpy_2x_end):
 
 	.p2align 4
 L(large_memcpy_4x):
-	movq	%rdx, %r10
 	/* edx will store remainder size for copying tail.  */
 	andl	$(PAGE_SIZE * 4 - 1), %edx
 	/* r10 stores outer loop counter.  */
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72..5e9beeeef 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ba2ce7cb0..f300d7daf 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR				__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
-# define VEC_SIZE 32
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will gurantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecissary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 0b99709c6..91329b18d 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will guarantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0..c0bf2875d 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f849..5241216a7 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 000000000..e83cccc73
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,44 @@
+/* memset implement with rep stosb
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	.text
+ENTRY (__memset_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	 L(stosb_return_zero)
+	mov	%RDX_LP, %RCX_LP
+	movzbl	%sil, %eax
+	mov	%RDI_LP, %RDX_LP
+	rep stosb
+	mov	%RDX_LP, %RAX_LP
+	ret
+L(stosb_return_zero):
+	movq	%rdi, %rax
+	ret
+END (__memset_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77c..637002150 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 8a6f0c561..3d92f6993 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -30,9 +30,7 @@
 # endif
 
 # undef weak_alias
-# define weak_alias(original, alias) \
-	.weak bzero; bzero = __bzero
-
+# define weak_alias(original, alias)
 # undef strong_alias
 # define strong_alias(ignored1, ignored2)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1e0511c79..905d0fa46 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -1,4 +1,4 @@
-/* memset/bzero with unaligned store and rep stosb
+/* memset with unaligned store and rep stosb
    Copyright (C) 2016-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,29 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
+#endif
+
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64	rcx
+# define SET_REG32	ecx
+# define SET_REG16	cx
+# define SET_REG8	cl
+#else
+# define SET_REG64	rsi
+# define SET_REG32	esi
+# define SET_REG16	si
+# define SET_REG8	sil
 #endif
 
 #define PAGE_SIZE 4096
@@ -88,18 +110,7 @@
 # error SECTION is not defined!
 #endif
 
-	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
-	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	xorl	%esi, %esi
-	pxor	%XMM0, %XMM0
-	jmp	L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
-#endif
-
+	.section SECTION(.text), "ax", @progbits
 #if IS_IN (libc)
 # if defined SHARED
 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
@@ -110,8 +121,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_from_wmemset)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,14 +138,15 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
-L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -140,37 +156,6 @@ L(entry_from_bzero):
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB.  */
-ENTRY (__memset_erms)
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jnz	 L(stosb)
-	movq	%rdi, %rax
-	ret
-# else
-/* Provide a hidden symbol to debugger.  */
-	.hidden	MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
-	mov	%RDX_LP, %RCX_LP
-	movzbl	%sil, %eax
-	mov	%RDI_LP, %RDX_LP
-	rep stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -179,27 +164,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +197,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_from_wmemset):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +248,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +268,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +340,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %SET_REG64
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%SET_REG8, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%SET_REG64, (%LESS_VEC_REG)
+	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%SET_REG32, (%LESS_VEC_REG)
+	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%SET_REG8, (%rdi)
+	movb	%SET_REG8, 1(%rdi)
+	movb	%SET_REG8, -1(%rdi, %rdx)
+#else
+	movw	%SET_REG16, (%LESS_VEC_REG)
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
new file mode 100644
index 000000000..2b77a59d5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H			1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE			16
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+
+#define VEC_xmm				VEC_any_xmm
+#define VEC					VEC_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 000000000..09957fc3c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
similarity index 87%
rename from sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
rename to sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
index 7ec7c21b5..e2762f2a2 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -1,4 +1,4 @@
-/* strcasecmp_l optimized with AVX.
+/* strcasecmp_l optimized with AVX2.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define STRCMP_SSE42 __strcasecmp_l_avx
-#define USE_AVX 1
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
 #define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
similarity index 83%
rename from sysdeps/x86_64/multiarch/strncase_l-avx.S
rename to sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
index b51b86d22..58642db74 100644
--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -1,4 +1,4 @@
-/* strncasecmp_l optimized with AVX.
+/* strcasecmp_l optimized with EVEX.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,7 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define STRCMP_SSE42 __strncasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 086cabf76..1a916cc95 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -48,13 +48,13 @@
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	VPBROADCAST	%xmm0, %ymm0
-	vpxor	%xmm9, %xmm9, %xmm9
+	vpxor	%xmm1, %xmm1, %xmm1
 
 	/* Check if we cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -62,37 +62,29 @@ ENTRY (STRCHR)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqu	(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rdi, %rax), %CHAR_REG
-	jne	L(zero)
-# endif
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
+	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
 	jne	L(zero)
 # endif
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 # ifndef USE_AS_STRCHRNUL
 L(zero):
@@ -103,7 +95,8 @@ L(zero):
 
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -113,9 +106,10 @@ L(first_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -125,9 +119,10 @@ L(first_vec_x2):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x3):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -137,6 +132,21 @@ L(first_vec_x3):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x4):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE - 1. This is the same number of
@@ -146,90 +156,92 @@ L(aligned_more):
 L(cross_page_continue):
 	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	1(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
-	/* Align data to VEC_SIZE * 4 - 1.	*/
-	addq	$(VEC_SIZE * 4 + 1), %rdi
-	andq	$-(VEC_SIZE * 4), %rdi
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	vmovdqa	1(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
 
 	/* Leaves only CHARS matching esi as 0.	 */
-	vpxor	%ymm5, %ymm0, %ymm1
 	vpxor	%ymm6, %ymm0, %ymm2
 	vpxor	%ymm7, %ymm0, %ymm3
-	vpxor	%ymm8, %ymm0, %ymm4
 
-	VPMINU	%ymm1, %ymm5, %ymm1
 	VPMINU	%ymm2, %ymm6, %ymm2
 	VPMINU	%ymm3, %ymm7, %ymm3
-	VPMINU	%ymm4, %ymm8, %ymm4
 
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+	vpxor	%ymm6, %ymm0, %ymm4
+	vpxor	%ymm7, %ymm0, %ymm5
+
+	VPMINU	%ymm4, %ymm6, %ymm4
+	VPMINU	%ymm5, %ymm7, %ymm5
 
-	VPMINU	%ymm5, %ymm6, %ymm6
+	VPMINU	%ymm2, %ymm3, %ymm6
+	VPMINU	%ymm4, %ymm5, %ymm7
 
-	VPCMPEQ	%ymm6, %ymm9, %ymm6
-	vpmovmskb %ymm6, %ecx
+	VPMINU	%ymm6, %ymm7, %ymm7
+
+	VPCMPEQ	%ymm7, %ymm1, %ymm7
+	vpmovmskb %ymm7, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-
-	VPCMPEQ	%ymm1, %ymm9, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x0)
 
 
-	VPCMPEQ	%ymm5, %ymm9, %ymm2
-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm3, %ymm1, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMPEQ	%ymm3, %ymm9, %ymm3
-	vpmovmskb %ymm3, %eax
+	VPCMPEQ	%ymm4, %ymm1, %ymm4
+	vpmovmskb %ymm4, %eax
 	/* rcx has combined result from all 4 VEC. It will only be used
 	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
-	subq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -239,10 +251,11 @@ L(loop_4x_vec):
 	VZEROUPPER_RETURN
 
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x0):
-	tzcntl	%eax, %eax
-	addq	$-(VEC_SIZE * 4), %rdi
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$-(VEC_SIZE * 4 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -251,16 +264,11 @@ L(last_vec_x0):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1):
 	tzcntl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdi
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -269,18 +277,23 @@ L(last_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod edx.  */
 	sarxl	%edx, %eax, %eax
@@ -291,13 +304,10 @@ L(cross_page_boundary):
 	xorl	%ecx, %ecx
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdx, %rax), %CHAR_REG
-	leaq	(%rdx, %rax), %rax
-	cmovne	%rcx, %rax
-# else
-	addq	%rdx, %rax
+	jne	L(zero_end)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	addq	%rdx, %rax
+	VZEROUPPER_RETURN
 
 END (STRCHR)
-# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index f62cd9d14..ec739fb8f 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -30,6 +30,7 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
 #  define SHIFT_REG	ecx
@@ -37,6 +38,7 @@
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
 #  define SHIFT_REG	edx
@@ -61,13 +63,11 @@
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	VPBROADCAST	%esi, %YMM0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
 	/* Check if we cross page boundary with one vector load.
 	   Otherwise it is safe to use an unaligned load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -81,49 +81,35 @@ ENTRY (STRCHR)
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
+	jne	L(zero)
+# endif
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
 	 */
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rax), %CHAR_REG
-	jne	L(zero)
 # endif
 	ret
 
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x4):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -144,9 +130,18 @@ L(first_vec_x4):
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -158,7 +153,7 @@ L(first_vec_x1):
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -179,6 +174,21 @@ L(first_vec_x2):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE.  */
@@ -195,7 +205,7 @@ L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
@@ -206,7 +216,7 @@ L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x2)
 
@@ -215,7 +225,7 @@ L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
@@ -224,7 +234,7 @@ L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x4)
 
@@ -265,33 +275,33 @@ L(loop_4x_vec):
 	VPMINU	%YMM3, %YMM4, %YMM4
 	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
 
-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	VPTESTN	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
-	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	VPTESTN	%YMM3, %YMM3, %k0
 	kmovd	%k0, %eax
 	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
 	sall	$8, %ecx
 	orl	%ecx, %eax
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # else
 	salq	$32, %rcx
 	orq	%rcx, %rax
-	tzcntq	%rax, %rax
+	bsfq	%rax, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
@@ -303,28 +313,28 @@ L(loop_4x_vec):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	ret
+	.p2align 4,, 8
+L(last_vec_x1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	   */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(%rax), %CHAR_REG
 	jne	L(zero_end)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(last_vec_x2):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -336,7 +346,7 @@ L(last_vec_x2):
 	ret
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi.  */
@@ -346,9 +356,9 @@ L(cross_page_boundary):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
-	/* Remove the leading bits.	 */
+	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
 	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
@@ -360,20 +370,24 @@ L(cross_page_boundary):
 	/* If eax is zero continue.  */
 	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if match was CHAR or null.  */
-	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero_end)
-# endif
+	bsfl	%eax, %eax
+
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of
 	   bytes.  */
 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rax), %CHAR_REG
+	je	L(cross_page_ret)
+L(zero_end):
+	xorl	%eax, %eax
+L(cross_page_ret):
 # endif
 	ret
 
 END (STRCHR)
-# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 9c73b5899..8da09bd86 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,45 +20,146 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
 
 # define PAGE_SIZE	4096
 
-/* VEC_SIZE = Number of bytes in a ymm register */
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
 # define VEC_SIZE	32
 
-/* Shift for dividing by (VEC_SIZE * 4).  */
-# define DIVIDE_BY_VEC_4_SHIFT	7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
+# define VMOVU	vmovdqu
+# define VMOVA	vmovdqa
 
 # ifdef USE_AS_WCSCMP
-/* Compare packed dwords.  */
+	/* Compare packed dwords.  */
 #  define VPCMPEQ	vpcmpeqd
-/* Compare packed dwords and store minimum.  */
+	/* Compare packed dwords and store minimum.  */
 #  define VPMINU	vpminud
-/* 1 dword char == 4 bytes.  */
+	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
-/* Compare packed bytes.  */
+	/* Compare packed bytes.  */
 #  define VPCMPEQ	vpcmpeqb
-/* Compare packed bytes and store minimum.  */
+	/* Compare packed bytes and store minimum.  */
 #  define VPMINU	vpminub
-/* 1 byte char == 1 byte.  */
+	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
 
+# ifdef USE_AS_STRNCMP
+#  define LOOP_REG	r9d
+#  define LOOP_REG64	r9
+
+#  define OFFSET_REG8	r9b
+#  define OFFSET_REG	r9d
+#  define OFFSET_REG64	r9
+# else
+#  define LOOP_REG	edx
+#  define LOOP_REG64	rdx
+
+#  define OFFSET_REG8	dl
+#  define OFFSET_REG	edx
+#  define OFFSET_REG64	rdx
+# endif
+
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# if defined USE_AS_STRNCMP
+#  define VEC_OFFSET	0
+# else
+#  define VEC_OFFSET	(-VEC_SIZE)
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define xmmZERO	xmm15
+# define ymmZERO	ymm15
+
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never use elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -79,783 +180,1142 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCMP)
+	.section SECTION(.text), "ax", @progbits
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
-	/* Check for simple cases (0 or 1) in offset.  */
+	/* Don't overwrite LOCALE_REG (rcx) until we have pass
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
 	cmp	$1, %RDX_LP
-	je	L(char0)
-	jb	L(zero)
+	/* Signed comparison intentional. We use this branch to also
+	   test cases where length >= 2^63. These very large sizes can be
+	   handled with strcmp as there is no way for that length to
+	   actually bound the buffer.  */
+	jle	L(one_or_less)
 #  ifdef USE_AS_WCSCMP
-#  ifndef __ILP32__
 	movq	%rdx, %rcx
-	/* Check if length could overflow when multiplied by
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
-	   overflow cases as well as redirect cases where its impossible to
-	   length to bound a valid memory region. In these cases just use
-	   'wcscmp'.  */
+
+	/* Multiplying length by sizeof(wchar_t) can result in overflow.
+	   Check if that is possible. All cases where overflow are possible
+	   are cases where length is large enough that it can never be a
+	   bound on valid memory so just use wcscmp.  */
 	shrq	$56, %rcx
-	jnz	__wcscmp_avx2
-#  endif
-	/* Convert units: from wide to byte char.  */
-	shl	$2, %RDX_LP
+	jnz	OVERFLOW_STRCMP
+
+	leaq	(, %rdx, 4), %rdx
 #  endif
-	/* Register %r11 tracks the maximum offset.  */
-	mov	%RDX_LP, %R11_LP
+# endif
+	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
 # endif
 	movl	%edi, %eax
-	xorl	%edx, %edx
-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
-	vpxor	%xmm7, %xmm7, %xmm7
 	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
-	jg	L(cross_page)
-	/* Start comparing 4 vectors.  */
-	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	(%rsi), %ymm1, %ymm0
-	VPMINU	%ymm1, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	je	L(next_3_vectors)
-	tzcntl	%ecx, %edx
+	sall	$20, %eax
+	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+	ja	L(page_cross)
+
+L(no_page_cross):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	(%rdi), %ymm0
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
+	   Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+	   scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+	/* 1s at null CHAR.  */
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	/* 1s where s1 and s2 equal AND not null CHAR.  */
+	vpandn	%ymm1, %ymm2, %ymm1
+
+	/* All 1s -> keep going, any 0s -> return.  */
+	vpmovmskb %ymm1, %ecx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
-	   offset (%r11).   */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(vec_0_test_len)
 # endif
+
+	/* All 1s represents all equals. incl will overflow to zero in
+	   all equals case. Otherwise 1s will carry until position of first
+	   mismatch.  */
+	incl	%ecx
+	jz	L(more_3x_vec)
+
+	.p2align 4,, 4
+L(return_vec_0):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	je	L(return)
-L(wcscmp_return):
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret0)
 	setl	%al
 	negl	%eax
 	orl	$1, %eax
-L(return):
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
+L(ret0):
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(return_vec_size):
-	tzcntl	%ecx, %edx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
-	   the maximum offset (%r11).  */
-	addq	$VEC_SIZE, %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	.p2align 4,, 8
+L(vec_0_test_len):
+	notl	%ecx
+	bzhil	%edx, %ecx, %eax
+	jnz	L(return_vec_0)
+	/* Align if will cross fetch block.  */
+	.p2align 4,, 2
+L(ret_zero):
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 5
+L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
 #  endif
-# else
+	jb	L(ret_zero)
+	/* 'nbe' covers the case where length is negative (large
+	   unsigned).  */
+	jnbe	OVERFLOW_STRCMP
 #  ifdef USE_AS_WCSCMP
+	movl	(%rdi), %edx
 	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(%rsi), %edx
+	je	L(ret1)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
+L(ret1):
+	ret
 # endif
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_2_vec_size):
-	tzcntl	%ecx, %edx
+	.p2align 4,, 10
+L(return_vec_1):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 2), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
+	   overflow.  */
+	addq	$-VEC_SIZE, %rdx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_SIZE(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	je	L(ret2)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
+L(ret2):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_3_vec_size):
-	tzcntl	%ecx, %edx
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 3), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+L(return_vec_3):
+	salq	$32, %rcx
+# endif
+
+L(return_vec_2):
+# ifndef USE_AS_STRNCMP
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	je	L(ret3)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 # else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret3):
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	je	L(ret4)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
-# endif
+L(ret4):
 	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 10
+L(more_3x_vec):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	VEC_SIZE(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_1)
 
-	.p2align 4
-L(next_3_vectors):
-	vmovdqu	VEC_SIZE(%rdi), %ymm6
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
-	VPMINU	%ymm6, %ymm3, %ymm3
-	VPCMPEQ	%ymm7, %ymm3, %ymm3
-	vpmovmskb %ymm3, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_vec_size)
-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
-	VPMINU	%ymm5, %ymm2, %ymm2
-	VPCMPEQ	%ymm4, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm2, %ymm2
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_2_vec_size)
-	VPMINU	%ymm4, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_3_vec_size)
-L(main_loop_header):
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	movl	$PAGE_SIZE, %ecx
-	/* Align load via RAX.  */
-	andq	$-(VEC_SIZE * 4), %rdx
-	subq	%rdi, %rdx
-	leaq	(%rdi, %rdx), %rax
 # ifdef USE_AS_STRNCMP
-	/* Starting from this point, the maximum offset, or simply the
-	   'offset', DECREASES by the same amount when base pointers are
-	   moved forward.  Return 0 when:
-	     1) On match: offset <= the matched vector index.
-	     2) On mistmach, offset is before the mistmatched index.
+	subq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_2)
+
+	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	/* any non-zero positive value that doesn't inference with 0x1.
 	 */
-	subq	%rdx, %r11
-	jbe	L(zero)
-# endif
-	addq	%rsi, %rdx
-	movq	%rdx, %rsi
-	andl	$(PAGE_SIZE - 1), %esi
-	/* Number of bytes before page crossing.  */
-	subq	%rsi, %rcx
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
-	movl	%ecx, %esi
-	jmp	L(loop_start)
+	movl	$2, %r8d
+
+# else
+	xorl	%r8d, %r8d
+# endif
+
+	/* The prepare labels are various entry points from the page
+	   cross logic.  */
+L(prepare_loop):
+
+# ifdef USE_AS_STRNCMP
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
+	   the loop.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
+# endif
+L(prepare_loop_no_len):
+
+	/* Align s1 and adjust s2 accordingly.  */
+	subq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 4), %rdi
+	addq	%rdi, %rsi
+
+# ifdef USE_AS_STRNCMP
+	subq	%rdi, %rdx
+# endif
 
+L(prepare_loop_aligned):
+	/* eax stores distance from rsi to next page cross. These cases
+	   need to be handled specially as the 4x loop could potentially
+	   read memory past the length of s1 or s2 and across a page
+	   boundary.  */
+	movl	$-(VEC_SIZE * 4), %eax
+	subl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+
+	/* Loop 4x comparisons at a time.  */
 	.p2align 4
 L(loop):
+
+	/* End condition for strncmp.  */
 # ifdef USE_AS_STRNCMP
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
-	   the maximum offset (%r11) by the same amount.  */
-	subq	$(VEC_SIZE * 4), %r11
-	jbe	L(zero)
-# endif
-	addq	$(VEC_SIZE * 4), %rax
-	addq	$(VEC_SIZE * 4), %rdx
-L(loop_start):
-	testl	%esi, %esi
-	leal	-1(%esi), %esi
-	je	L(loop_cross_page)
-L(back_to_loop):
-	/* Main loop, comparing 4 vectors are a time.  */
-	vmovdqa	(%rax), %ymm0
-	vmovdqa	VEC_SIZE(%rax), %ymm3
-	VPCMPEQ	(%rdx), %ymm0, %ymm4
-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
-	VPMINU	%ymm0, %ymm4, %ymm4
-	VPMINU	%ymm3, %ymm1, %ymm1
-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
-	VPMINU	%ymm1, %ymm4, %ymm0
-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
-	VPMINU	%ymm2, %ymm5, %ymm5
-	VPMINU	%ymm3, %ymm6, %ymm6
-	VPMINU	%ymm5, %ymm0, %ymm0
-	VPMINU	%ymm6, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-
-	/* Test each mask (32 bits) individually because for VEC_SIZE
-	   == 32 is not possible to OR the four masks and keep all bits
-	   in a 64-bit integer register, differing from SSE2 strcmp
-	   where ORing is possible.  */
-	vpmovmskb %ymm0, %ecx
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+
+	/* Check if rsi loads will cross a page boundary.  */
+	addl	$-(VEC_SIZE * 4), %eax
+	jnb	L(page_cross_during_loop)
+
+	/* Loop entry after handling page cross during loop.  */
+L(loop_skip_page_cross_check):
+	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
+	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+
+	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+
+	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
+	   zero.  */
+	vpand	%ymm0, %ymm1, %ymm1
+
+
+	vpand	%ymm2, %ymm3, %ymm3
+	vpand	%ymm4, %ymm5, %ymm5
+	vpand	%ymm6, %ymm7, %ymm7
+
+	VPMINU	%ymm1, %ymm3, %ymm3
+	VPMINU	%ymm5, %ymm7, %ymm7
+
+	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
+	VPMINU	%ymm3, %ymm7, %ymm7
+
+	/* If any 0 CHAR then done.  */
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
+	vpmovmskb %ymm7, %LOOP_REG
+	testl	%LOOP_REG, %LOOP_REG
+	jz	L(loop)
+
+	/* Find which VEC has the mismatch of end of string.  */
+	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
+	vpmovmskb %ymm1, %ecx
 	testl	%ecx, %ecx
-	je	L(loop)
-	VPCMPEQ	%ymm7, %ymm4, %ymm0
-	vpmovmskb %ymm0, %edi
-	testl	%edi, %edi
-	je	L(test_vec)
-	tzcntl	%edi, %ecx
+	jnz	L(return_vec_0_end)
+
+
+	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
+	vpmovmskb %ymm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_1_end)
+
+L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	subq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
+	vpmovmskb %ymm5, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_2_end)
+
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
+	   must entirely be from VEC 3 which is fully represented by
+	   LOOP_REG.  */
+	tzcntl	%LOOP_REG, %LOOP_REG
+
+# ifdef USE_AS_STRNCMP
+	subl	$-(VEC_SIZE), %LOOP_REG
+	cmpq	%LOOP_REG64, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	je	L(ret5)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret5):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(test_vec):
 # ifdef USE_AS_STRNCMP
-	/* The first vector matched.  Return 0 if the maximum offset
-	   (%r11) <= VEC_SIZE.  */
-	cmpq	$VEC_SIZE, %r11
-	jbe	L(zero)
+	.p2align 4,, 2
+L(ret_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
-	vpmovmskb %ymm1, %ecx
-	testl	%ecx, %ecx
-	je	L(test_2_vec)
-	tzcntl	%ecx, %edi
+
+
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
+	   they use the value of `r8` to negate the return value. This is
+	   because the page cross logic can swap `rdi` and `rsi`.  */
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	addq	$VEC_SIZE, %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+L(return_vec_1_end):
+	salq	$32, %rcx
+# endif
+L(return_vec_0_end):
+# ifndef USE_AS_STRNCMP
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret6)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret6):
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_1_end):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	movl	VEC_SIZE(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	je	L(ret7)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 #  else
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
-	subl	%edx, %eax
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 #  endif
-# endif
+L(ret7):
 	VZEROUPPER_RETURN
+# endif
 
-	.p2align 4
-L(test_2_vec):
+	.p2align 4,, 10
+L(return_vec_2_end):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 2 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 2), %r11
-	jbe	L(zero)
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_page_cross)
 # endif
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
-	vpmovmskb %ymm5, %ecx
-	testl	%ecx, %ecx
-	je	L(test_3_vec)
-	tzcntl	%ecx, %edi
-# ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	je	L(ret11)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret11):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(test_3_vec):
+
+	/* Page cross in rsi in next 4x VEC.  */
+
+	/* TODO: Improve logic here.  */
+	.p2align 4,, 10
+L(page_cross_during_loop):
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+
+	/* Optimistically rsi and rdi and both aligned inwhich case we
+	   don't need any logic here.  */
+	cmpl	$-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to loop and we will
+	   never hit page cross case again.  */
+	je	L(loop_skip_page_cross_check)
+
+	/* Check if we can safely load a VEC.  */
+	cmpl	$-(VEC_SIZE * 3), %eax
+	jle	L(less_1x_vec_till_page_cross)
+
+	VMOVA	(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_0_end)
+
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
+	cmpl	$-(VEC_SIZE * 2), %eax
+	jg	L(more_2x_vec_till_page_cross)
+
+	.p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+	subl	$-(VEC_SIZE * 4), %eax
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
+	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+
+	/* Mask of potentially valid bits. The lower bits can be out of
+	   range comparisons (but safe regarding page crosses).  */
+	movl	$-1, %r10d
+	shlxl	%esi, %r10d, %r10d
+	notl	%ecx
+
 # ifdef USE_AS_STRNCMP
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 3 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 3), %r11
-	jbe	L(zero)
-# endif
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
-	vpmovmskb %ymm6, %esi
-	tzcntl	%esi, %ecx
+	cmpq	%rax, %rdx
+	jbe	L(return_page_cross_end_check)
+# endif
+	movl	%eax, %OFFSET_REG
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+	andl	%r10d, %ecx
+	jz	L(loop_skip_page_cross_check)
+
+	.p2align 4,, 3
+L(return_page_cross_end):
+	tzcntl	%ecx, %ecx
+
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 3), %rcx
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %esi
-	cmpl	(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
+L(return_page_cross_cmp_mem):
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	addl	%OFFSET_REG, %ecx
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret8)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret8):
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(loop_cross_page):
-	xorl	%r10d, %r10d
-	movq	%rdx, %rcx
-	/* Align load via RDX.  We load the extra ECX bytes which should
-	   be ignored.  */
-	andl	$((VEC_SIZE * 4) - 1), %ecx
-	/* R10 is -RCX.  */
-	subq	%rcx, %r10
-
-	/* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-#  error (VEC_SIZE * 2) != 64
-# endif
-
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
-	cmpl	$(VEC_SIZE * 2), %ecx
-	jge	L(loop_cross_page_2_vec)
-
-	vmovdqu	(%rax, %r10), %ymm2
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
-	VPMINU	%ymm2, %ymm0, %ymm0
-	VPMINU	%ymm3, %ymm1, %ymm1
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
-
-	vpmovmskb %ymm0, %edi
-	vpmovmskb %ymm1, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
-
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
-	shrq	%cl, %rdi
-
-	testq	%rdi, %rdi
-	je	L(loop_cross_page_2_vec)
-	tzcntq	%rdi, %rcx
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	.p2align 4,, 10
+L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+	tzcntl	%ecx, %ecx
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
+	cmpl	%ecx, %edx
+	ja	L(return_page_cross_cmp_mem)
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# endif
 	VZEROUPPER_RETURN
+# endif
 
-	.p2align 4
-L(loop_cross_page_2_vec):
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
-	VPMINU	%ymm2, %ymm5, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
-	VPMINU	%ymm3, %ymm6, %ymm6
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
-
-	vpmovmskb %ymm5, %edi
-	vpmovmskb %ymm6, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
 
-	xorl	%r8d, %r8d
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
-	subl	$(VEC_SIZE * 2), %ecx
-	jle	1f
-	/* Skip ECX bytes.  */
-	shrq	%cl, %rdi
-	/* R8 has number of bytes skipped.  */
-	movl	%ecx, %r8d
-1:
-	/* Before jumping back to the loop, set ESI to the number of
-	   VEC_SIZE * 4 blocks before page crossing.  */
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
-
-	testq	%rdi, %rdi
+	.p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If more 2x vec till cross we will complete a full loop
+	   iteration here.  */
+
+	VMOVU	VEC_SIZE(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_1_end)
+
 # ifdef USE_AS_STRNCMP
-	/* At this point, if %rdi value is 0, it already tested
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
-	   checks whether strncmp maximum offset reached or not.  */
-	je	L(string_nbyte_offset_check)
-# else
-	je	L(back_to_loop)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
 # endif
-	tzcntq	%rdi, %rcx
-	addq	%r10, %rcx
-	/* Adjust for number of bytes skipped.  */
-	addq	%r8, %rcx
+
+	subl	$-(VEC_SIZE * 4), %eax
+
+	/* Safe to include comparisons from lower bytes.  */
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_page_cross_0)
+
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_page_cross_1)
+
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rcx
-	subq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	/* Must check length here as length might proclude reading next
+	   page.  */
+	cmpq	%rax, %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
+# endif
+
+	/* Finish the loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+	vpand	%ymm4, %ymm5, %ymm5
+	vpand	%ymm6, %ymm7, %ymm7
+	VPMINU	%ymm5, %ymm7, %ymm7
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
+	vpmovmskb %ymm7, %LOOP_REG
+	testl	%LOOP_REG, %LOOP_REG
+	jnz	L(return_vec_2_3_end)
+
+	/* Best for code size to include ucond-jmp here. Would be faster
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
+	   as fall-through and have jump back to loop on mismatch
+	   comparison.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	VZEROUPPER_RETURN
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	jmp	L(loop_skip_page_cross_check)
 # endif
-	VZEROUPPER_RETURN
 
+
+	.p2align 4,, 10
+L(return_vec_page_cross_0):
+	addl	$-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-L(string_nbyte_offset_check):
-	leaq	(VEC_SIZE * 4)(%r10), %r10
-	cmpq	%r10, %r11
-	jbe	L(zero)
-	jmp	L(back_to_loop)
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
+# else
+	addl	%eax, %ecx
 # endif
 
-	.p2align 4
-L(cross_page_loop):
-	/* Check one byte/dword at a time.  */
 # ifdef USE_AS_WCSCMP
-	cmpl	%ecx, %eax
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret9)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
-	jne	L(different)
-	addl	$SIZE_OF_CHAR, %edx
-	cmpl	$(VEC_SIZE * 4), %edx
-	je	L(main_loop_header)
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
+L(ret9):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+	/* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where stop condition is guranteed to be
+	   reachable by just reading memory.  */
+	testl	$((VEC_SIZE - 1) << 20), %eax
+	jz	L(no_page_cross)
 # endif
+
+	movl	%edi, %eax
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %eax
+	andl	$(PAGE_SIZE - 1), %ecx
+
+	xorl	%OFFSET_REG, %OFFSET_REG
+
+	/* Check which is closer to page cross, s1 or s2.  */
+	cmpl	%eax, %ecx
+	jg	L(page_cross_s2)
+
+	/* The previous page cross check has false positives. Check for
+	   true positive as page cross logic is very expensive.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+	jbe	L(no_page_cross)
+
+	/* Set r8 to not interfere with normal return value (rdi and rsi
+	   did not swap).  */
 # ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
+	xorl	%r8d, %r8d
 # endif
-	/* Check null char.  */
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
-	   comparisons.  */
-	subl	%ecx, %eax
-# ifndef USE_AS_WCSCMP
-L(different):
+
+	/* Check if less than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jg	L(less_1x_vec_till_page)
+
+	/* If more than 1x VEC till page cross, loop throuh safely
+	   loadable memory until within 1x VEC of page cross.  */
+
+	.p2align 4,, 10
+L(page_cross_loop):
+
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+
+	jnz	L(check_ret_vec_page_cross)
+	addl	$VEC_SIZE, %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross)
 # endif
-	VZEROUPPER_RETURN
+	addl	$VEC_SIZE, %eax
+	jl	L(page_cross_loop)
+
+	subl	%eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guranteed to be
+	   safe.  */
+
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+
+# ifdef USE_AS_STRNCMP
+	leal	VEC_SIZE(%OFFSET_REG64), %eax
+	cmpq	%rax, %rdx
+	jbe	L(check_ret_vec_page_cross2)
+	addq	%rdi, %rdx
+# endif
+	incl	%ecx
+	jz	L(prepare_loop_no_len)
 
+	.p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
+# endif
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
-	.p2align 4
-L(different):
-	/* Use movl to avoid modifying EFLAGS.  */
-	movl	$0, %eax
+	movl	(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret12)
 	setl	%al
 	negl	%eax
-	orl	$1, %eax
-	VZEROUPPER_RETURN
+	xorl	%r8d, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret12):
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCMP
-	.p2align 4
-L(zero):
+	.p2align 4,, 10
+L(check_ret_vec_page_cross2):
+	incl	%ecx
+L(check_ret_vec_page_cross):
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_page_cross_cont)
+	.p2align 4,, 2
+L(ret_zero_page_cross):
 	xorl	%eax, %eax
 	VZEROUPPER_RETURN
+# endif
 
-	.p2align 4
-L(char0):
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-#  endif
-	VZEROUPPER_RETURN
+	.p2align 4,, 4
+L(page_cross_s2):
+	/* Ensure this is a true page cross.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
+	jbe	L(no_page_cross)
+
+
+	movl	%ecx, %eax
+	movq	%rdi, %rcx
+	movq	%rsi, %rdi
+	movq	%rcx, %rsi
+
+	/* set r8 to negate return value as rdi and rsi swapped.  */
+# ifdef USE_AS_WCSCMP
+	movl	$-4, %r8d
+# else
+	movl	$-1, %r8d
 # endif
+	xorl	%OFFSET_REG, %OFFSET_REG
 
-	.p2align 4
-L(last_vector):
-	addq	%rdx, %rdi
-	addq	%rdx, %rsi
+	/* Check if more than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jle	L(page_cross_loop)
+
+	.p2align 4,, 6
+L(less_1x_vec_till_page):
+	/* Find largest load size we can use.  */
+	cmpl	$16, %eax
+	ja	L(less_16_till_page)
+
+	VMOVU	(%rdi), %xmm0
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incw	%cx
+	jnz	L(check_ret_vec_page_cross)
+	movl	$16, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
-	subq	%rdx, %r11
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subl	%eax, %OFFSET_REG
+# else
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
 # endif
-	tzcntl	%ecx, %edx
+
+	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incw	%cx
+	jnz	L(check_ret_vec_page_cross)
+
 # ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addl	$16, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
 # endif
-# ifdef USE_AS_WCSCMP
+	jmp	L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	ret
 # endif
-	VZEROUPPER_RETURN
 
-	/* Comparing on page boundary region requires special treatment:
-	   It must done one vector at the time, starting with the wider
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
-	   (xmm) still passes the boundary, byte comparison must be done.
-	 */
-	.p2align 4
-L(cross_page):
-	/* Try one ymm vector at a time.  */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_1_vector)
-L(loop_1_vector):
-	vmovdqu	(%rdi, %rdx), %ymm1
-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
-	VPMINU	%ymm1, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
 
-	addl	$VEC_SIZE, %edx
+	.p2align 4,, 10
+L(less_16_till_page):
+	/* Find largest load size we can use.  */
+	cmpl	$24, %eax
+	ja	L(less_8_till_page)
 
-	addl	$VEC_SIZE, %eax
-# ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jle	L(loop_1_vector)
-L(cross_page_1_vector):
-	/* Less than 32 bytes to check, try one xmm vector.  */
-	cmpl	$(PAGE_SIZE - 16), %eax
-	jg	L(cross_page_1_xmm)
-	vmovdqu	(%rdi, %rdx), %xmm1
-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incb	%cl
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$16, %edx
-# ifndef USE_AS_WCSCMP
-	addl	$16, %eax
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$8, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
+	movl	$24, %OFFSET_REG
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+
+
+
+	vmovq	(%rdi, %OFFSET_REG64), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incb	%cl
+	jnz	L(check_ret_vec_page_cross)
+
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
-	cmpl	$(PAGE_SIZE - 8), %eax
-	jg	L(cross_page_8bytes)
-	vmovq	(%rdi, %rdx), %xmm1
-	vmovq	(%rsi, %rdx), %xmm0
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	/* Only last 8 bits are valid.  */
-	andl	$0xff, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
+	addl	$8, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# endif
+	jmp	L(prepare_loop_aligned)
+
 
-	addl	$8, %edx
-	addl	$8, %eax
+	.p2align 4,, 10
+L(less_8_till_page):
+# ifdef USE_AS_WCSCMP
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addq	%rdi, %rdx
+	/* We already checked for len <= 1 so cannot hit that case here.
+	 */
 #  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop_no_len)
+	ret
 
-L(cross_page_8bytes):
-	/* Less than 8 bytes to check, try 4 byte vector.  */
-	cmpl	$(PAGE_SIZE - 4), %eax
-	jg	L(cross_page_4bytes)
-	vmovd	(%rdi, %rdx), %xmm1
-	vmovd	(%rsi, %rdx), %xmm0
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	/* Only last 4 bits are valid.  */
-	andl	$0xf, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
+# else
+
+	/* Find largest load size we can use.  */
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$4, %edx
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
+	movl	$28, %OFFSET_REG
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
 
-L(cross_page_4bytes):
-# endif
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
+
+
+	vmovd	(%rdi, %OFFSET_REG64), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
+
+#  ifdef USE_AS_STRNCMP
+	addl	$4, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
+	ret
+#  endif
+
+	.p2align 4,, 10
+L(less_4_till_page):
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* end condition is reach page boundary (rdi is aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(VEC_SIZE * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+	ret
 # endif
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	subl	%ecx, %eax
-	VZEROUPPER_RETURN
-END (STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 0cd939d5a..2a5b3ce03 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -26,54 +29,165 @@
 
 # define PAGE_SIZE	4096
 
-/* VEC_SIZE = Number of bytes in a ymm register */
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
 # define VEC_SIZE	32
+# define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-/* Shift for dividing by (VEC_SIZE * 4).  */
-# define DIVIDE_BY_VEC_4_SHIFT	7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
-
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-/* Compare packed dwords.  */
-#  define VPCMP		vpcmpd
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
+	/* Compare packed dwords.  */
+#  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
-#  define SHIFT_REG32	r8d
-#  define SHIFT_REG64	r8
-/* 1 dword char == 4 bytes.  */
+#  define VPTESTNM	vptestnmd
+	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
-/* Compare packed bytes.  */
-#  define VPCMP		vpcmpb
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
+#  define TESTEQ	incl
+	/* Compare packed bytes.  */
+#  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
-#  define SHIFT_REG32	ecx
-#  define SHIFT_REG64	rcx
-/* 1 byte char == 1 byte.  */
+#  define VPTESTNM	vptestnmb
+	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
 
-# define XMMZERO	xmm16
-# define XMM0		xmm17
-# define XMM1		xmm18
+# ifdef USE_AS_STRNCMP
+#  define LOOP_REG	r9d
+#  define LOOP_REG64	r9
+
+#  define OFFSET_REG8	r9b
+#  define OFFSET_REG	r9d
+#  define OFFSET_REG64	r9
+# else
+#  define LOOP_REG	edx
+#  define LOOP_REG64	rdx
+
+#  define OFFSET_REG8	dl
+#  define OFFSET_REG	edx
+#  define OFFSET_REG64	rdx
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+#  define VEC_OFFSET	0
+# else
+#  define VEC_OFFSET	(-VEC_SIZE)
+# endif
+
+# define XMM0	xmm17
+# define XMM1	xmm18
+
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
+# define YMM0	ymm17
+# define YMM1	ymm18
+# define YMM2	ymm19
+# define YMM3	ymm20
+# define YMM4	ymm21
+# define YMM5	ymm22
+# define YMM6	ymm23
+# define YMM7	ymm24
+# define YMM8	ymm25
+# define YMM9	ymm26
+# define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
 
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-# define YMM9		ymm26
-# define YMM10		ymm27
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -96,985 +210,1208 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRCMP)
-# ifdef USE_AS_STRNCMP
-	/* Check for simple cases (0 or 1) in offset.  */
-	cmp	$1, %RDX_LP
-	je	L(char0)
-	jb	L(zero)
-#  ifdef USE_AS_WCSCMP
-#  ifndef __ILP32__
-	movq	%rdx, %rcx
-	/* Check if length could overflow when multiplied by
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
-	   overflow cases as well as redirect cases where its impossible to
-	   length to bound a valid memory region. In these cases just use
-	   'wcscmp'.  */
-	shrq	$56, %rcx
-	jnz	__wcscmp_evex
+	.section .text.evex, "ax", @progbits
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
 #  endif
-	/* Convert units: from wide to byte char.  */
-	shl	$2, %RDX_LP
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
+# ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have pass
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
 #  endif
-	/* Register %r11 tracks the maximum offset.  */
-	mov	%RDX_LP, %R11_LP
+	cmp	$1, %RDX_LP
+	/* Signed comparison intentional. We use this branch to also
+	   test cases where length >= 2^63. These very large sizes can be
+	   handled with strcmp as there is no way for that length to
+	   actually bound the buffer.  */
+	jle	L(one_or_less)
+# endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
 # endif
+
 	movl	%edi, %eax
-	xorl	%edx, %edx
-	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
-	jg	L(cross_page)
-	/* Start comparing 4 vectors.  */
+	/* Shift out the bits irrelivant to page boundary ([63:12]).  */
+	sall	$20, %eax
+	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+	ja	L(page_cross)
+
+L(no_page_cross):
+	/* Safe to compare 4x vectors.  */
 	VMOVU	(%rdi), %YMM0
-
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 	VPTESTM	%YMM0, %YMM0, %k2
-
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
-# else
-	incl	%ecx
-# endif
-	je	L(next_3_vectors)
-	tzcntl	%ecx, %edx
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edx
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
-	   offset (%r11).   */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(vec_0_test_len)
 # endif
+
+	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
+	   wcscmp/wcsncmp.  */
+
+	/* All 1s represents all equals. TESTEQ will overflow to zero in
+	   all equals case. Otherwise 1s will carry until position of first
+	   mismatch.  */
+	TESTEQ	%ecx
+	jz	L(more_3x_vec)
+
+	.p2align 4,, 4
+L(return_vec_0):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	je	L(return)
-L(wcscmp_return):
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret0)
 	setl	%al
 	negl	%eax
 	orl	$1, %eax
-L(return):
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
+L(ret0):
 	ret
 
-L(return_vec_size):
-	tzcntl	%ecx, %edx
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edx
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
-	   the maximum offset (%r11).  */
-	addq	$VEC_SIZE, %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	.p2align 4,, 4
+L(vec_0_test_len):
+	notl	%ecx
+	bzhil	%edx, %ecx, %eax
+	jnz	L(return_vec_0)
+	/* Align if will cross fetch block.  */
+	.p2align 4,, 2
+L(ret_zero):
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	ret
+
+	.p2align 4,, 5
+L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
 #  endif
-# else
+	jb	L(ret_zero)
+	/* 'nbe' covers the case where length is negative (large
+	   unsigned).  */
+	jnbe	OVERFLOW_STRCMP
 #  ifdef USE_AS_WCSCMP
+	movl	(%rdi), %edx
 	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(%rsi), %edx
+	je	L(ret1)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
-# endif
+L(ret1):
 	ret
+# endif
 
-L(return_2_vec_size):
-	tzcntl	%ecx, %edx
+	.p2align 4,, 10
+L(return_vec_1):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_STRNCMP
+	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
+	   worrying about underflow.  */
+	addq	$-CHAR_PER_VEC, %rdx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
 # ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edx
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret2)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
+L(ret2):
+	ret
+
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 2), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+L(return_vec_3):
+#  if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %ecx
 #  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	salq	$CHAR_PER_VEC, %rcx
 #  endif
+# endif
+L(return_vec_2):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+	tzcntl	%ecx, %ecx
 # else
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	tzcntq	%rcx, %rcx
 # endif
-	ret
 
-L(return_3_vec_size):
-	tzcntl	%ecx, %edx
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edx
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 3), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret3)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 # else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret3):
+	ret
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret4)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
-# endif
+L(ret4):
 	ret
+# endif
 
-	.p2align 4
-L(next_3_vectors):
-	VMOVU	VEC_SIZE(%rdi), %YMM0
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
+	/* 32 byte align here ensures the main loop is ideally aligned
+	   for DSB.  */
+	.p2align 5
+L(more_3x_vec):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
-# else
-	incl	%ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1)
+
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero)
 # endif
-	jne	L(return_vec_size)
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
-# else
-	incl	%ecx
-# endif
-	jne	L(return_2_vec_size)
+	TESTEQ	%ecx
+	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
 	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+
 # ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
+
 # else
-	incl	%ecx
-# endif
-	jne	L(return_3_vec_size)
-L(main_loop_header):
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	movl	$PAGE_SIZE, %ecx
-	/* Align load via RAX.  */
-	andq	$-(VEC_SIZE * 4), %rdx
-	subq	%rdi, %rdx
-	leaq	(%rdi, %rdx), %rax
+	xorl	%r8d, %r8d
+# endif
+
+	/* The prepare labels are various entry points from the page
+	   cross logic.  */
+L(prepare_loop):
+
 # ifdef USE_AS_STRNCMP
-	/* Starting from this point, the maximum offset, or simply the
-	   'offset', DECREASES by the same amount when base pointers are
-	   moved forward.  Return 0 when:
-	     1) On match: offset <= the matched vector index.
-	     2) On mistmach, offset is before the mistmatched index.
-	 */
-	subq	%rdx, %r11
-	jbe	L(zero)
-# endif
-	addq	%rsi, %rdx
-	movq	%rdx, %rsi
-	andl	$(PAGE_SIZE - 1), %esi
-	/* Number of bytes before page crossing.  */
-	subq	%rsi, %rcx
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
-	movl	%ecx, %esi
-	jmp	L(loop_start)
+#  ifdef USE_AS_WCSCMP
+L(prepare_loop_no_len):
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	shrl	$2, %ecx
+	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
+#  else
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
+	   the loop.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
+L(prepare_loop_no_len):
+#  endif
+# else
+L(prepare_loop_no_len):
+# endif
+
+	/* Align s1 and adjust s2 accordingly.  */
+	subq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 4), %rdi
+L(prepare_loop_readj):
+	addq	%rdi, %rsi
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
+	subq	%rdi, %rdx
+# endif
+
+L(prepare_loop_aligned):
+	/* eax stores distance from rsi to next page cross. These cases
+	   need to be handled specially as the 4x loop could potentially
+	   read memory past the length of s1 or s2 and across a page
+	   boundary.  */
+	movl	$-(VEC_SIZE * 4), %eax
+	subl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
 
+
+	/* Loop 4x comparisons at a time.  */
 	.p2align 4
 L(loop):
+
+	/* End condition for strncmp.  */
 # ifdef USE_AS_STRNCMP
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
-	   the maximum offset (%r11) by the same amount.  */
-	subq	$(VEC_SIZE * 4), %r11
-	jbe	L(zero)
-# endif
-	addq	$(VEC_SIZE * 4), %rax
-	addq	$(VEC_SIZE * 4), %rdx
-L(loop_start):
-	testl	%esi, %esi
-	leal	-1(%esi), %esi
-	je	L(loop_cross_page)
-L(back_to_loop):
-	/* Main loop, comparing 4 vectors are a time.  */
-	VMOVA	(%rax), %YMM0
-	VMOVA	VEC_SIZE(%rax), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+
+	/* Check if rsi loads will cross a page boundary.  */
+	addl	$-(VEC_SIZE * 4), %eax
+	jnb	L(page_cross_during_loop)
+
+	/* Loop entry after handling page cross during loop.  */
+L(loop_skip_page_cross_check):
+	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
+	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 
 	VPMINU	%YMM0, %YMM2, %YMM8
 	VPMINU	%YMM4, %YMM6, %YMM9
 
-	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM8
+	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
+	VPMINU	%YMM8, %YMM9, %YMM9
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
+	VPTESTM	%YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
+	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
+	   oring with YMM1. Result is stored in YMM6.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
+	/* Or together YMM3, YMM5, and YMM6.  */
+	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
-	VPTESTM	%YMM8, %YMM8, %k1
 
-	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
-	vpxorq	(%rdx), %YMM0, %YMM1
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+	/* A non-zero CHAR in YMM6 represents a mismatch.  */
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	kmovd	%k0, %LOOP_REG
 
-	vporq	%YMM1, %YMM3, %YMM9
-	vporq	%YMM5, %YMM7, %YMM10
+	TESTEQ	%LOOP_REG
+	jz	L(loop)
 
-	/* A non-zero CHAR in YMM9 represents a mismatch.  */
-	vporq	%YMM9, %YMM10, %YMM9
 
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
-	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
-	kmovd   %k0, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
-# else
-	incl	%ecx
+	/* Find which VEC has the mismatch of end of string.  */
+	VPTESTM	%YMM0, %YMM0, %k1
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
+	kmovd	%k0, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_0_end)
+
+	VPTESTM	%YMM2, %YMM2, %k1
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
+	kmovd	%k0, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1_end)
+
+
+	/* Handle VEC 2 and 3 without branches.  */
+L(return_vec_2_3_end):
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero_end)
 # endif
-	je	 L(loop)
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
-	   in YMM0 and (%rdx).  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTM	%YMM4, %YMM4, %k1
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	TESTEQ	%ecx
+# if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %LOOP_REG
+	orl	%ecx, %LOOP_REG
 # else
-	incl	%ecx
-# endif
-	je	L(test_vec)
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %ecx
+	salq	$CHAR_PER_VEC, %LOOP_REG64
+	orq	%rcx, %LOOP_REG64
+# endif
+L(return_vec_3_end):
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
+	   must entirely be from VEC 3 which is fully represented by
+	   LOOP_REG.  */
+# if CHAR_PER_VEC <= 16
+	tzcntl	%LOOP_REG, %LOOP_REG
+# else
+	tzcntq	%LOOP_REG64, %LOOP_REG64
 # endif
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	cmpq	%LOOP_REG64, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	je	L(ret5)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret5):
 	ret
 
-	.p2align 4
-L(test_vec):
 # ifdef USE_AS_STRNCMP
-	/* The first vector matched.  Return 0 if the maximum offset
-	   (%r11) <= VEC_SIZE.  */
-	cmpq	$VEC_SIZE, %r11
-	jbe	L(zero)
-# endif
-	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
-	VPTESTM	%YMM2, %YMM2, %k1
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
-	   in YMM2 and VEC_SIZE(%rdx).  */
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
-	kmovd	%k0, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
-# else
-	incl	%ecx
-# endif
-	je	L(test_2_vec)
-	tzcntl	%ecx, %edi
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edi
+	.p2align 4,, 2
+L(ret_zero_end):
+	xorl	%eax, %eax
+	ret
 # endif
+
+
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
+	   they use the value of `r8` to negate the return value. This is
+	   because the page cross logic can swap `rdi` and `rsi`.  */
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	addq	$VEC_SIZE, %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
+L(return_vec_1_end):
+#  if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %ecx
 #  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
+	salq	$CHAR_PER_VEC, %rcx
 #  endif
+# endif
+L(return_vec_0_end):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+	tzcntl	%ecx, %ecx
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	tzcntq	%rcx, %rcx
 # endif
-	ret
 
-	.p2align 4
-L(test_2_vec):
 # ifdef USE_AS_STRNCMP
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 2 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 2), %r11
-	jbe	L(zero)
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
 # endif
-	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
-	VPTESTM	%YMM4, %YMM4, %k1
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
-	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
-	kmovd	%k0, %ecx
+
 # ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret6)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   flip is `rdi` and `rsi` where swapped.  */
+	xorl	%r8d, %eax
 # else
-	incl	%ecx
-# endif
-	je	L(test_3_vec)
-	tzcntl	%ecx, %edi
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edi
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
+	   logic. Subtract `r8d` after xor for zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
-# ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
+L(ret6):
+	ret
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_1_end):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret7)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 #  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 #  endif
+L(ret7):
+	ret
+# endif
+
+
+	/* Page cross in rsi in next 4x VEC.  */
+
+	/* TODO: Improve logic here.  */
+	.p2align 4,, 10
+L(page_cross_during_loop):
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+
+	/* Optimistically rsi and rdi and both aligned in which case we
+	   don't need any logic here.  */
+	cmpl	$-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to loop and we will
+	   never hit page cross case again.  */
+	je	L(loop_skip_page_cross_check)
+
+	/* Check if we can safely load a VEC.  */
+	cmpl	$-(VEC_SIZE * 3), %eax
+	jle	L(less_1x_vec_till_page_cross)
+
+	VMOVA	(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_0_end)
+
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
+	cmpl	$-(VEC_SIZE * 2), %eax
+	jg	L(more_2x_vec_till_page_cross)
+
+	.p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+	subl	$-(VEC_SIZE * 4), %eax
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
+	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	/* Mask of potentially valid bits. The lower bits can be out of
+	   range comparisons (but safe regarding page crosses).  */
+
+# ifdef USE_AS_WCSCMP
+	movl	$-1, %r10d
+	movl	%esi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	shrl	$2, %ecx
+	shlxl	%ecx, %r10d, %ecx
+	movzbl	%cl, %r10d
 # else
+	movl	$-1, %ecx
+	shlxl	%esi, %ecx, %r10d
+# endif
+
+	kmovd	%k1, %ecx
+	notl	%ecx
+
+
+# ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
+	movl	%eax, %r11d
+	shrl	$2, %r11d
+	cmpq	%r11, %rdx
 #  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
-	subl	%edx, %eax
+	cmpq	%rax, %rdx
 #  endif
+	jbe	L(return_page_cross_end_check)
 # endif
-	ret
+	movl	%eax, %OFFSET_REG
 
-	.p2align 4
-L(test_3_vec):
-# ifdef USE_AS_STRNCMP
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 3 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 3), %r11
-	jbe	L(zero)
-# endif
-	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
-	VPTESTM	%YMM6, %YMM6, %k1
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
-	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
-	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
-	kmovd	%k0, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	/* Readjust eax before potentially returning to the loop.  */
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+	andl	%r10d, %ecx
+	jz	L(loop_skip_page_cross_check)
+
+	.p2align 4,, 3
+L(return_page_cross_end):
+	tzcntl	%ecx, %ecx
+
+# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
+L(return_page_cross_cmp_mem):
 # else
-	incl	%ecx
+	addl	%OFFSET_REG, %ecx
 # endif
-	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %ecx
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret8)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret8):
+	ret
+
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 3), %rcx
-	cmpq	%rcx, %r11
-	jbe	L(zero)
+	.p2align 4,, 10
+L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+	tzcntl	%ecx, %ecx
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %esi
-	cmpl	(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
+	sall	$2, %edx
 #  endif
-# else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	cmpl	%ecx, %edx
+	ja	L(return_page_cross_cmp_mem)
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# endif
 	ret
-
-	.p2align 4
-L(loop_cross_page):
-	xorl	%r10d, %r10d
-	movq	%rdx, %rcx
-	/* Align load via RDX.  We load the extra ECX bytes which should
-	   be ignored.  */
-	andl	$((VEC_SIZE * 4) - 1), %ecx
-	/* R10 is -RCX.  */
-	subq	%rcx, %r10
-
-	/* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-#  error (VEC_SIZE * 2) != 64
 # endif
 
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
-	cmpl	$(VEC_SIZE * 2), %ecx
-	jge	L(loop_cross_page_2_vec)
 
-	VMOVU	(%rax, %r10), %YMM2
-	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
+	.p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If more 2x vec till cross we will complete a full loop
+	   iteration here.  */
 
-	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
-	VPTESTM	%YMM2, %YMM2, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM2 and 32 bytes at (%rdx, %r10).  */
-	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
-	kmovd	%k1, %r9d
-	/* Don't use subl since it is the lower 16/32 bits of RDI
-	   below.  */
-	notl	%r9d
-# ifdef USE_AS_WCSCMP
-	/* Only last 8 bits are valid.  */
-	andl	$0xff, %r9d
-# endif
-
-	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
-	VPTESTM	%YMM3, %YMM3, %k4
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
-	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
-	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
-	kmovd	%k3, %edi
-    /* Must use notl %edi here as lower bits are for CHAR
-	   comparisons potentially out of range thus can be 0 without
-	   indicating mismatch.  */
-	notl	%edi
-# ifdef USE_AS_WCSCMP
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
-	andl	$0xff, %edi
+	VMOVA	VEC_SIZE(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1_end)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
 # endif
 
-# ifdef USE_AS_WCSCMP
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
-	sall	$8, %edi
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG32
-	sarl	$2, %SHIFT_REG32
-
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
-	orl	%r9d, %edi
-# else
-	salq	$32, %rdi
+	subl	$-(VEC_SIZE * 4), %eax
 
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
-	orq	%r9, %rdi
-# endif
+	/* Safe to include comparisons from lower bytes.  */
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_page_cross_0)
+
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_page_cross_1)
 
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
-	shrxq	%SHIFT_REG64, %rdi, %rdi
-	testq	%rdi, %rdi
-	je	L(loop_cross_page_2_vec)
-	tzcntq	%rdi, %rcx
-# ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %ecx
-# endif
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
+	/* Must check length here as length might proclude reading next
+	   page.  */
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
+	movl	%eax, %r11d
+	shrl	$2, %r11d
+	cmpq	%r11, %rdx
 #  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
+	cmpq	%rax, %rdx
 #  endif
+	jbe	L(ret_zero_in_loop_page_cross)
+# endif
+
+	/* Finish the loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VPMINU	%YMM4, %YMM6, %YMM9
+	VPTESTM	%YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	kmovd	%k0, %LOOP_REG
+	TESTEQ	%LOOP_REG
+	jnz	L(return_vec_2_3_end)
+
+	/* Best for code size to include ucond-jmp here. Would be faster
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
+	   as fall-through and have jump back to loop on mismatch
+	   comparison.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# endif
 	ret
+# else
+	jmp	L(loop_skip_page_cross_check)
+# endif
 
-	.p2align 4
-L(loop_cross_page_2_vec):
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
-	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
-	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
-	kmovd	%k1, %r9d
-	/* Don't use subl since it is the lower 16/32 bits of RDI
-	   below.  */
-	notl	%r9d
-# ifdef USE_AS_WCSCMP
-	/* Only last 8 bits are valid.  */
-	andl	$0xff, %r9d
-# endif
-
-	VPTESTM	%YMM1, %YMM1, %k4
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
-	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
-	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
-	kmovd	%k3, %edi
-	/* Must use notl %edi here as lower bits are for CHAR
-	   comparisons potentially out of range thus can be 0 without
-	   indicating mismatch.  */
-	notl	%edi
-# ifdef USE_AS_WCSCMP
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
-	andl	$0xff, %edi
+	.p2align 4,, 10
+L(return_vec_page_cross_0):
+	addl	$-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+	tzcntl	%ecx, %ecx
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+#  ifdef USE_AS_STRNCMP
+#   ifdef USE_AS_WCSCMP
+	/* Must divide ecx instead of multiply rdx due to overflow.  */
+	movl	%ecx, %eax
+	shrl	$2, %eax
+	cmpq	%rax, %rdx
+#   else
+	cmpq	%rcx, %rdx
+#   endif
+	jbe	L(ret_zero_in_loop_page_cross)
+#  endif
+# else
+	addl	%eax, %ecx
 # endif
 
 # ifdef USE_AS_WCSCMP
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
-	sall	$8, %edi
-
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
-	orl	%r9d, %edi
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret9)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-	salq	$32, %rdi
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret9):
+	ret
 
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
-	orq	%r9, %rdi
+
+	.p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+	/* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where stop condition is guranteed to be
+	   reachable by just reading memory.  */
+	testl	$((VEC_SIZE - 1) << 20), %eax
+	jz	L(no_page_cross)
 # endif
 
-	xorl	%r8d, %r8d
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
-	subl	$(VEC_SIZE * 2), %ecx
-	jle	1f
-	/* R8 has number of bytes skipped.  */
-	movl	%ecx, %r8d
+	movl	%edi, %eax
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %eax
+	andl	$(PAGE_SIZE - 1), %ecx
+
+	xorl	%OFFSET_REG, %OFFSET_REG
+
+	/* Check which is closer to page cross, s1 or s2.  */
+	cmpl	%eax, %ecx
+	jg	L(page_cross_s2)
+
+	/* The previous page cross check has false positives. Check for
+	   true positive as page cross logic is very expensive.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+	jbe	L(no_page_cross)
+
+
+	/* Set r8 to not interfere with normal return value (rdi and rsi
+	   did not swap).  */
 # ifdef USE_AS_WCSCMP
-	/* NB: Divide shift count by 4 since each bit in RDI represent 4
-	   bytes.  */
-	sarl	$2, %ecx
-	/* Skip ECX bytes.  */
-	shrl	%cl, %edi
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
 # else
-	/* Skip ECX bytes.  */
-	shrq	%cl, %rdi
+	xorl	%r8d, %r8d
 # endif
-1:
-	/* Before jumping back to the loop, set ESI to the number of
-	   VEC_SIZE * 4 blocks before page crossing.  */
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
 
-	testq	%rdi, %rdi
+	/* Check if less than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jg	L(less_1x_vec_till_page)
+
+
+	/* If more than 1x VEC till page cross, loop throuh safely
+	   loadable memory until within 1x VEC of page cross.  */
+	.p2align 4,, 8
+L(page_cross_loop):
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(check_ret_vec_page_cross)
+	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
-	/* At this point, if %rdi value is 0, it already tested
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
-	   checks whether strncmp maximum offset reached or not.  */
-	je	L(string_nbyte_offset_check)
-# else
-	je	L(back_to_loop)
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross)
 # endif
-	tzcntq	%rdi, %rcx
+	addl	$VEC_SIZE, %eax
+	jl	L(page_cross_loop)
+
 # ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %ecx
+	shrl	$2, %eax
 # endif
-	addq	%r10, %rcx
-	/* Adjust for number of bytes skipped.  */
-	addq	%r8, %rcx
+
+
+	subl	%eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
+	 */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+
+	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rcx
-	subq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# else
+	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
+	cmpq	%rax, %rdx
+	jbe	L(check_ret_vec_page_cross2)
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
+	addq	$-(CHAR_PER_VEC * 2), %rdx
 #  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
-	subl	%edx, %eax
+	addq	%rdi, %rdx
 #  endif
 # endif
-	ret
+	TESTEQ	%ecx
+	jz	L(prepare_loop_no_len)
 
-# ifdef USE_AS_STRNCMP
-L(string_nbyte_offset_check):
-	leaq	(VEC_SIZE * 4)(%r10), %r10
-	cmpq	%r10, %r11
-	jbe	L(zero)
-	jmp	L(back_to_loop)
-# endif
-
-	.p2align 4
-L(cross_page_loop):
-	/* Check one byte/dword at a time.  */
-# ifdef USE_AS_WCSCMP
-	cmpl	%ecx, %eax
-# else
-	subl	%ecx, %eax
-# endif
-	jne	L(different)
-	addl	$SIZE_OF_CHAR, %edx
-	cmpl	$(VEC_SIZE * 4), %edx
-	je	L(main_loop_header)
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	.p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
 # endif
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret12)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
-# endif
-	/* Check null CHAR.  */
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
-	   comparisons.  */
+	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
+	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
-# ifndef USE_AS_WCSCMP
-L(different):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret12):
 	ret
 
-# ifdef USE_AS_WCSCMP
-	.p2align 4
-L(different):
-	/* Use movl to avoid modifying EFLAGS.  */
-	movl	$0, %eax
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
-	ret
-# endif
 
 # ifdef USE_AS_STRNCMP
-	.p2align 4
-L(zero):
+	.p2align 4,, 10
+L(check_ret_vec_page_cross2):
+	TESTEQ	%ecx
+L(check_ret_vec_page_cross):
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_page_cross_cont)
+	.p2align 4,, 2
+L(ret_zero_page_cross):
 	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
-L(char0):
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-#  endif
-	ret
+	.p2align 4,, 4
+L(page_cross_s2):
+	/* Ensure this is a true page cross.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
+	jbe	L(no_page_cross)
+
+
+	movl	%ecx, %eax
+	movq	%rdi, %rcx
+	movq	%rsi, %rdi
+	movq	%rcx, %rsi
+
+	/* set r8 to negate return value as rdi and rsi swapped.  */
+# ifdef USE_AS_WCSCMP
+	movl	$-4, %r8d
+# else
+	movl	$-1, %r8d
 # endif
+	xorl	%OFFSET_REG, %OFFSET_REG
 
-	.p2align 4
-L(last_vector):
-	addq	%rdx, %rdi
-	addq	%rdx, %rsi
-# ifdef USE_AS_STRNCMP
-	subq	%rdx, %r11
+	/* Check if more than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jle	L(page_cross_loop)
+
+	.p2align 4,, 6
+L(less_1x_vec_till_page):
+# ifdef USE_AS_WCSCMP
+	shrl	$2, %eax
 # endif
-	tzcntl	%ecx, %edx
+	/* Find largest load size we can use.  */
+	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	ja	L(less_16_till_page)
+
+	/* Use 16 byte comparison.  */
+	vmovdqu	(%rdi), %xmm0
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %edx
+	subl	$0xf, %ecx
+# else
+	incw	%cx
 # endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subl	%eax, %OFFSET_REG
+# else
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
 # endif
+	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	subl	$0xf, %ecx
+# else
+	incw	%cx
+# endif
+	jnz	L(check_ret_vec_page_cross)
+# ifdef USE_AS_STRNCMP
+	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
 # endif
+	jmp	L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	/* Comparing on page boundary region requires special treatment:
-	   It must done one vector at the time, starting with the wider
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
-	   (xmm) still passes the boundary, byte comparison must be done.
-	 */
-	.p2align 4
-L(cross_page):
-	/* Try one ymm vector at a time.  */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_1_vector)
-L(loop_1_vector):
-	VMOVU	(%rdi, %rdx), %YMM0
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+	.p2align 4,, 10
+L(less_16_till_page):
+	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	ja	L(less_8_till_page)
+
+	/* Use 8 byte comparison.  */
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	subl	$0x3, %ecx
 # else
-	incl	%ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$VEC_SIZE, %edx
 
-	addl	$VEC_SIZE, %eax
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$(8 / SIZE_OF_CHAR), %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jle	L(loop_1_vector)
-L(cross_page_1_vector):
-	/* Less than 32 bytes to check, try one xmm vector.  */
-	cmpl	$(PAGE_SIZE - 16), %eax
-	jg	L(cross_page_1_xmm)
-	VMOVU	(%rdi, %rdx), %XMM0
+	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xf, %ecx
+	subl	$0x3, %ecx
 # else
-	subl	$0xffff, %ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+
 
-	addl	$16, %edx
-# ifndef USE_AS_WCSCMP
-	addl	$16, %eax
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
 # endif
+	jmp	L(prepare_loop_aligned)
 
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
-	cmpl	$(PAGE_SIZE - 8), %eax
-	jg	L(cross_page_8bytes)
-	vmovq	(%rdi, %rdx), %XMM0
-	vmovq	(%rsi, %rdx), %XMM1
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
-	kmovb	%k1, %ecx
+
+
+	.p2align 4,, 10
+L(less_8_till_page):
 # ifdef USE_AS_WCSCMP
-	subl	$0x3, %ecx
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
+#  ifdef USE_AS_STRNCMP
+	addq	$-(CHAR_PER_VEC * 2), %rdx
+	/* We already checked for len <= 1 so cannot hit that case here.
+	 */
+#  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop)
+	ret
+
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
 # else
-	subl	$0xff, %ecx
-# endif
-	jne	L(last_vector)
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$8, %edx
-	addl	$8, %eax
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
+	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-L(cross_page_8bytes):
-	/* Less than 8 bytes to check, try 4 byte vector.  */
-	cmpl	$(PAGE_SIZE - 4), %eax
-	jg	L(cross_page_4bytes)
-	vmovd	(%rdi, %rdx), %XMM0
-	vmovd	(%rsi, %rdx), %XMM1
-
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0x1, %ecx
-# else
 	subl	$0xf, %ecx
-# endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
 
-	addl	$4, %edx
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
+	ret
 #  endif
 
-L(cross_page_4bytes):
-# endif
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
-# endif
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	subl	%ecx, %eax
+	.p2align 4,, 10
+L(less_4_till_page):
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* end condition is reach page boundary (rdi is aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 	ret
-END (STRCMP)
+# endif
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e..a9178ad25 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -41,13 +41,8 @@
 # define UPDATE_STRNCMP_COUNTER
 #endif
 
-#ifdef USE_AVX
-# define SECTION	avx
-# define GLABEL(l)	l##_avx
-#else
-# define SECTION	sse4.2
-# define GLABEL(l)	l##_sse42
-#endif
+#define SECTION	sse4.2
+#define GLABEL(l)	l##_sse42
 
 #define LABEL(l)	.L##l
 
@@ -88,9 +83,8 @@ ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -99,29 +93,14 @@ ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
 
 
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
+#define arg arg
 
 STRCMP_SSE42:
 	cfi_startproc
@@ -169,27 +148,22 @@ STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -198,43 +172,26 @@ LABEL(touppermask):
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-#  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
-# else
-#  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
-# endif
+# define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
+
 	TOLOWER (%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)/* If not, find different value or null char */
@@ -258,7 +215,7 @@ LABEL(crosscache):
 	xor	%r8d, %r8d
 	and	$0xf, %ecx		/* offset of rsi */
 	and	$0xf, %eax		/* offset of rdi */
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
 	cmp	%eax, %ecx
 	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
 	ja	LABEL(bigger)
@@ -272,7 +229,7 @@ LABEL(bigger):
 	sub	%rcx, %r9
 	lea	LABEL(unaligned_table)(%rip), %r10
 	movslq	(%r10, %r9,4), %r9
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	lea	(%r10, %r9), %r10
 	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
 
@@ -285,15 +242,15 @@ LABEL(bigger):
 LABEL(ashr_0):
 
 	movdqa	(%rsi), %xmm1
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
-	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
+	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
 #else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
+	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
 #endif
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -373,10 +330,10 @@ LABEL(ashr_0_exit_use):
  */
 	.p2align 4
 LABEL(ashr_1):
-	pslldq	$15, D(%xmm2)		/* shift first string to align with second */
+	pslldq	$15, %xmm2		/* shift first string to align with second */
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
-	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results*/
+	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -404,7 +361,7 @@ LABEL(loop_ashr_1_use):
 
 LABEL(nibble_ashr_1_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -423,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use):
 	jg	LABEL(nibble_ashr_1_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -443,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use):
 LABEL(nibble_ashr_1_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$1, D(%xmm0)
+	psrldq	$1, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -461,10 +418,10 @@ LABEL(nibble_ashr_1_use):
  */
 	.p2align 4
 LABEL(ashr_2):
-	pslldq	$14, D(%xmm2)
+	pslldq	$14, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -492,7 +449,7 @@ LABEL(loop_ashr_2_use):
 
 LABEL(nibble_ashr_2_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -511,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use):
 	jg	LABEL(nibble_ashr_2_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -531,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use):
 LABEL(nibble_ashr_2_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$2, D(%xmm0)
+	psrldq	$2, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -549,10 +506,10 @@ LABEL(nibble_ashr_2_use):
  */
 	.p2align 4
 LABEL(ashr_3):
-	pslldq	$13, D(%xmm2)
+	pslldq	$13, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -580,7 +537,7 @@ LABEL(loop_ashr_3_use):
 
 LABEL(nibble_ashr_3_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -599,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use):
 	jg	LABEL(nibble_ashr_3_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -619,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use):
 LABEL(nibble_ashr_3_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$3, D(%xmm0)
+	psrldq	$3, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -637,10 +594,10 @@ LABEL(nibble_ashr_3_use):
  */
 	.p2align 4
 LABEL(ashr_4):
-	pslldq	$12, D(%xmm2)
+	pslldq	$12, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -669,7 +626,7 @@ LABEL(loop_ashr_4_use):
 
 LABEL(nibble_ashr_4_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -688,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use):
 	jg	LABEL(nibble_ashr_4_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -708,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use):
 LABEL(nibble_ashr_4_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$4, D(%xmm0)
+	psrldq	$4, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -726,10 +683,10 @@ LABEL(nibble_ashr_4_use):
  */
 	.p2align 4
 LABEL(ashr_5):
-	pslldq	$11, D(%xmm2)
+	pslldq	$11, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -758,7 +715,7 @@ LABEL(loop_ashr_5_use):
 
 LABEL(nibble_ashr_5_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -778,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use):
 
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -798,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use):
 LABEL(nibble_ashr_5_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$5, D(%xmm0)
+	psrldq	$5, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -816,10 +773,10 @@ LABEL(nibble_ashr_5_use):
  */
 	.p2align 4
 LABEL(ashr_6):
-	pslldq	$10, D(%xmm2)
+	pslldq	$10, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -848,7 +805,7 @@ LABEL(loop_ashr_6_use):
 
 LABEL(nibble_ashr_6_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -867,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use):
 	jg	LABEL(nibble_ashr_6_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -887,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use):
 LABEL(nibble_ashr_6_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$6, D(%xmm0)
+	psrldq	$6, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -905,10 +862,10 @@ LABEL(nibble_ashr_6_use):
  */
 	.p2align 4
 LABEL(ashr_7):
-	pslldq	$9, D(%xmm2)
+	pslldq	$9, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -937,7 +894,7 @@ LABEL(loop_ashr_7_use):
 
 LABEL(nibble_ashr_7_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -956,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use):
 	jg	LABEL(nibble_ashr_7_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -976,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use):
 LABEL(nibble_ashr_7_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$7, D(%xmm0)
+	psrldq	$7, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -994,10 +951,10 @@ LABEL(nibble_ashr_7_use):
  */
 	.p2align 4
 LABEL(ashr_8):
-	pslldq	$8, D(%xmm2)
+	pslldq	$8, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1026,7 +983,7 @@ LABEL(loop_ashr_8_use):
 
 LABEL(nibble_ashr_8_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1045,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use):
 	jg	LABEL(nibble_ashr_8_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1065,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use):
 LABEL(nibble_ashr_8_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$8, D(%xmm0)
+	psrldq	$8, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1083,10 +1040,10 @@ LABEL(nibble_ashr_8_use):
  */
 	.p2align 4
 LABEL(ashr_9):
-	pslldq	$7, D(%xmm2)
+	pslldq	$7, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1116,7 +1073,7 @@ LABEL(loop_ashr_9_use):
 LABEL(nibble_ashr_9_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1135,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use):
 	jg	LABEL(nibble_ashr_9_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1155,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use):
 LABEL(nibble_ashr_9_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$9, D(%xmm0)
+	psrldq	$9, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1173,10 +1130,10 @@ LABEL(nibble_ashr_9_use):
  */
 	.p2align 4
 LABEL(ashr_10):
-	pslldq	$6, D(%xmm2)
+	pslldq	$6, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1205,7 +1162,7 @@ LABEL(loop_ashr_10_use):
 
 LABEL(nibble_ashr_10_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1224,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use):
 	jg	LABEL(nibble_ashr_10_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1244,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use):
 LABEL(nibble_ashr_10_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$10, D(%xmm0)
+	psrldq	$10, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1262,10 +1219,10 @@ LABEL(nibble_ashr_10_use):
  */
 	.p2align 4
 LABEL(ashr_11):
-	pslldq	$5, D(%xmm2)
+	pslldq	$5, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1294,7 +1251,7 @@ LABEL(loop_ashr_11_use):
 
 LABEL(nibble_ashr_11_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1313,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use):
 	jg	LABEL(nibble_ashr_11_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1333,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use):
 LABEL(nibble_ashr_11_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$11, D(%xmm0)
+	psrldq	$11, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1351,10 +1308,10 @@ LABEL(nibble_ashr_11_use):
  */
 	.p2align 4
 LABEL(ashr_12):
-	pslldq	$4, D(%xmm2)
+	pslldq	$4, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1383,7 +1340,7 @@ LABEL(loop_ashr_12_use):
 
 LABEL(nibble_ashr_12_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1402,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use):
 	jg	LABEL(nibble_ashr_12_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1422,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use):
 LABEL(nibble_ashr_12_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$12, D(%xmm0)
+	psrldq	$12, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1440,10 +1397,10 @@ LABEL(nibble_ashr_12_use):
  */
 	.p2align 4
 LABEL(ashr_13):
-	pslldq	$3, D(%xmm2)
+	pslldq	$3, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1473,7 +1430,7 @@ LABEL(loop_ashr_13_use):
 
 LABEL(nibble_ashr_13_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1492,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use):
 	jg	LABEL(nibble_ashr_13_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1512,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use):
 LABEL(nibble_ashr_13_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$13, D(%xmm0)
+	psrldq	$13, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1530,10 +1487,10 @@ LABEL(nibble_ashr_13_use):
  */
 	.p2align 4
 LABEL(ashr_14):
-	pslldq  $2, D(%xmm2)
+	pslldq  $2, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1563,7 +1520,7 @@ LABEL(loop_ashr_14_use):
 
 LABEL(nibble_ashr_14_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1582,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use):
 	jg	LABEL(nibble_ashr_14_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1602,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use):
 LABEL(nibble_ashr_14_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$14, D(%xmm0)
+	psrldq	$14, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1620,10 +1577,10 @@ LABEL(nibble_ashr_14_use):
  */
 	.p2align 4
 LABEL(ashr_15):
-	pslldq	$1, D(%xmm2)
+	pslldq	$1, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1655,7 +1612,7 @@ LABEL(loop_ashr_15_use):
 
 LABEL(nibble_ashr_15_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1674,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use):
 	jg	LABEL(nibble_ashr_15_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1694,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use):
 LABEL(nibble_ashr_15_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$15, D(%xmm0)
+	psrldq	$15, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baa..08638e263 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -29,6 +29,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -53,6 +54,10 @@ IFUNC_SELECTOR (void)
 	return OPTIMIZE (avx2);
     }
 
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+    return OPTIMIZE (sse42);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 013aebf79..c312fab8b 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
     RETURN (NULL, strlen (s));
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return STRCSPN_SSE2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return STRCSPN_SSE2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return STRCSPN_SSE2 (s, a);
     }
 
-  offset = (int) ((size_t) s & 15);
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       value = __m128i_shift_right (value, offset);
 
-      int length = _mm_cmpistri (mask, value, 0x2);
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
       /* No need to check ZFlag since ZFlag is always 1.  */
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (s + length), length);
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
       if (index < 16 - offset)
 	RETURN (NULL, index);
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x2);
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
-      int zflag = _mm_cmpistrz (mask, value, 0x2);
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
       if (zflag)
diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c
similarity index 86%
rename from sysdeps/x86_64/multiarch/strspn-sse2.S
rename to sysdeps/x86_64/multiarch/strcspn-sse2.c
index e0a095f25..3a04bb39f 100644
--- a/sysdeps/x86_64/multiarch/strspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
@@ -1,4 +1,4 @@
-/* strspn optimized with SSE2.
+/* strcspn.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,10 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strspn __strspn_sse2
+# define STRCSPN __strcspn_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strspn)
+# define libc_hidden_builtin_def(STRCSPN)
 #endif
 
-#include <sysdeps/x86_64/strspn.S>
+#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 000000000..278c89969
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,302 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMOVA		vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMOVA		vmovdqa32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+	/* At this point vector max length reached.  */
+# ifdef USE_AS_STRNLEN
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RDX
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	subq	%rsi, %rdx
+	jae	L(ret_max)
+	negq	%rdx
+	/* At this point rdx contains number of w[char] needs to go.
+	   Now onwards rdx will keep decrementing with each compare.  */
+# endif
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMP	$0, (%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+# endif
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RCX
+#  endif
+	/* rcx contains number of [w]char will be recompared due to
+	   alignment fixes.  rdx must be incremented by rcx to offset
+	   alignment adjustment.  */
+	addq	%rcx, %rdx
+	/* Need jump as we don't want to add/subtract rdx for first
+	   iteration of 4 x VEC_SIZE aligned loop.  */
+	jmp	L(loop_entry)
+# endif
+
+	.p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+L(loop_entry):
+# endif
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+	VPTESTN	%VMM2, %VMM2, %k0
+	VPTESTN	%VMM4, %VMM4, %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+	jz	L(loop)
+
+	VPTESTN	%VMM1, %VMM1, %k2
+	KMOV	%k2, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %RCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM3, %VMM3, %k3
+	KMOV	%k3, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %RCX
+
+	/* Fourth, third, second vector terminating are pretty much
+	   same, implemented this way to avoid branching and reuse code
+	   from pre loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x3):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSLEN
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains number of w[char] to be skipped as a result
+	   of address alignment.  */
+	xorq	%rdi, %rax
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+	KMOV	%k0, %RAX
+	/* Ignore number of character for alignment adjustment.  */
+	SHR	%cl, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 000000000..116f8981c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN		__strlen_evex512
+#endif
+
+#define VEC_SIZE	64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 42b6124df..874123d60 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -28,6 +28,10 @@
 # define SHIFT_RETURN
 #endif
 
+#ifndef SECTION
+# define SECTION(p)	p
+#endif
+
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
 	%xmm3 - zero
@@ -37,7 +41,7 @@
 */
 
 
-.text
+	.section SECTION(.text),"ax",@progbits
 ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 000000000..58c05dcfb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 000000000..48c0aa21f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 000000000..8a5af3695
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
index 37d1224bb..68bad365b 100644
--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -1,3 +1,4 @@
 #define STRCMP	__strncmp_avx2_rtm
 #define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
 #include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
index 1678bcc23..f138e9f1f 100644
--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -1,3 +1,4 @@
 #define STRCMP	__strncmp_avx2
 #define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2
 #include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
index 9773e5fc0..310a6dbe7 100644
--- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define STRCMP_SSE42 __strncmp_sse42
-#define USE_AS_STRNCMP
-#include "strcmp-sse42.S"
+#if IS_IN (libc)
+# define STRCMP_SSE42 __strncmp_sse42
+# define USE_AS_STRNCMP
+# include "strcmp-sse42.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 000000000..0b7f22021
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
similarity index 85%
rename from sysdeps/x86_64/multiarch/strcspn-sse2.S
rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c
index f97e856e1..d03214c4f 100644
--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
+++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
@@ -1,4 +1,4 @@
-/* strcspn optimized with SSE2.
+/* strpbrk.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,10 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strcspn __strcspn_sse2
+# define STRPBRK __strpbrk_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcspn)
+# define libc_hidden_builtin_def(STRPBRK)
 #endif
 
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad..bd26ba80d 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweights loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot patch. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed3..8014c285b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
 
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask of
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer incase YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path persay.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage interms
+	   of output dependency chance in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index db1b44c23..866396e94 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index 8fb3aba64..6124033ce 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
     return 0;
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return __strspn_sse2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return __strspn_sse2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_sse2 (s, a);
     }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
 
-  offset = (int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
 
-      value = __m128i_shift_right (value, offset);
-
-      int length = _mm_cmpistri (mask, value, 0x12);
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
       /* No need to check CFlag since it is always 1.  */
       if (length < 16 - offset)
 	return length;
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
 	return length;
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x12);
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
       if (cflag)
 	return (size_t) (aligned + index - s);
       aligned += 16;
diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c
similarity index 84%
rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S
rename to sysdeps/x86_64/multiarch/strspn-sse2.c
index d537b6c27..61cc6cb0a 100644
--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
+++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
@@ -1,4 +1,4 @@
-/* strpbrk optimized with SSE2.
+/* strspn.
    Copyright (C) 2017-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -19,11 +19,10 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
-# define strcspn __strpbrk_sse2
+# define STRSPN __strspn_sse2
 
 # undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strpbrk)
+# define libc_hidden_builtin_def(STRSPN)
 #endif
 
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
+#include <string/strspn.c>
diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 000000000..e44c1a05d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,218 @@
+/* strstr optimized with 512-bit AVX-512 instructions
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+#define PAGESIZE 4096
+
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
+#define kshiftri_mask64(x, y) ((x) >> (y))
+#define kand_mask64(x, y) ((x) & (y))
+
+/*
+ Returns the index of the first edge within the needle, returns 0 if no edge
+ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+  size_t ind = 0;
+  while (ned[ind + 1] != '\0')
+    {
+      if (ned[ind] != ned[ind + 1])
+        return ind;
+      else
+        ind = ind + 1;
+    }
+  return 0;
+}
+
+/*
+ Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+                     size_t ind)
+{
+  while (ned[ind] != '\0')
+    {
+      if (ned[ind] != hay[hay_index + ind])
+        return false;
+      ind = ind + 1;
+    }
+  return true;
+}
+
+/*
+ Compare needle with haystack at specified location. The first 64 bytes are
+ compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+                            const char *ned, const __mmask64 ned_mask,
+                            const __m512i ned_zmm)
+{
+  /* check first 64 bytes using zmm and then scalar */
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+  if (match != 0x0) // failed the first few chars
+    return false;
+  else if (ned_mask == FULL_MMASK64)
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+  return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+  char first = ned[0];
+  if (first == '\0')
+    return (char *)haystack;
+  if (ned[1] == '\0')
+    return (char *)strchr (haystack, ned[0]);
+
+  size_t edge = find_edge_in_needle (ned);
+
+  /* ensure haystack is as long as the pos of edge in needle */
+  for (int ii = 0; ii < edge; ++ii)
+    {
+      if (haystack[ii] == '\0')
+        return NULL;
+    }
+
+  /*
+   Load 64 bytes of the needle and save it to a zmm register
+   Read one cache line at a time to avoid loading across a page boundary
+   */
+  __mmask64 ned_load_mask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+  __mmask64 ned_nullmask
+      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
+
+  if (__glibc_unlikely (ned_nullmask == 0x0))
+    {
+      ned_zmm = _mm512_loadu_si512 (ned);
+      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      if (ned_nullmask != 0x0)
+        ned_load_mask = ned_load_mask >> 1;
+    }
+  else
+    {
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      ned_load_mask = ned_load_mask >> 1;
+    }
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+  /*
+   Read the bytes of haystack in the current cache line
+   */
+  size_t hay_index = edge;
+  __mmask64 loadmask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+  /* First load is a partial cache line */
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+  /* Search for NULL and compare only till null char */
+  uint64_t nullmask
+      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
+  /* Search for the 2 charaters of needle */
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+  k1 = kshiftri_mask64 (k1, 1);
+  /* k2 masks tell us if both chars from needle match */
+  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+  /* For every match, search for the entire needle for a full match */
+  while (k2)
+    {
+      uint64_t bitcount = _tzcnt_u64 (k2);
+      k2 = _blsr_u64 (k2);
+      size_t match_pos = hay_index + bitcount - edge;
+      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+        {
+          /*
+           * Use vector compare as long as you are not crossing a page
+           */
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                          ned_load_mask, ned_zmm))
+            return (char *)haystack + match_pos;
+        }
+      else
+        {
+          if (verify_string_match (haystack, match_pos, ned, 0))
+            return (char *)haystack + match_pos;
+        }
+    }
+  /* We haven't checked for potential match at the last char yet */
+  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
+  hay_index = 0;
+
+  /*
+   Loop over one cache line at a time to prevent reading over page
+   boundary
+   */
+  __m512i hay1;
+  while (nullmask == 0)
+    {
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
+      hay1 = _mm512_load_si512 (haystack + hay_index
+                                + 1); // Always 64 byte aligned
+      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+      /* Compare only till null char */
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+      /* k2 masks tell us if both chars from needle match */
+      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+      /* For every match, compare full strings for potential match */
+      while (k2)
+        {
+          uint64_t bitcount = _tzcnt_u64 (k2);
+          k2 = _blsr_u64 (k2);
+          size_t match_pos = hay_index + bitcount - edge;
+          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+            {
+              /*
+               * Use vector compare as long as you are not crossing a page
+               */
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                              ned_load_mask, ned_zmm))
+                return (char *)haystack + match_pos;
+            }
+          else
+            {
+              /* Compare byte by byte */
+              if (verify_string_match (haystack, match_pos, ned, 0))
+                return (char *)haystack + match_pos;
+            }
+        }
+      hay_index += ZMM_SIZE_IN_BYTES;
+    }
+  return NULL;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 95600a9de..2fb8b169b 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@
 
 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
 
 #include "init-arch.h"
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)
 
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index c8210f054..d27767520 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -16,9 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include "varshift.h"
+#include <stdint.h>
 
-const int8_t ___m128i_shift_right[31] attribute_hidden =
+const int8_t ___m128i_shift_right[31] attribute_hidden
+    __attribute__((aligned(32))) =
   {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index af3069448..ffd12d79e 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -19,7 +19,8 @@
 #include <stdint.h>
 #include <tmmintrin.h>
 
-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
+extern const int8_t ___m128i_shift_right[31] attribute_hidden
+    __attribute__ ((aligned (32)));
 
 static __inline__ __m128i
 __m128i_shift_right (__m128i value, unsigned long int offset)
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
new file mode 100644
index 000000000..9f3ffeced
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _VEC_MACROS_H
+#define _VEC_MACROS_H			1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+   VEC(N) values.  */
+#define VEC_hi_xmm0				xmm16
+#define VEC_hi_xmm1				xmm17
+#define VEC_hi_xmm2				xmm18
+#define VEC_hi_xmm3				xmm19
+#define VEC_hi_xmm4				xmm20
+#define VEC_hi_xmm5				xmm21
+#define VEC_hi_xmm6				xmm22
+#define VEC_hi_xmm7				xmm23
+#define VEC_hi_xmm8				xmm24
+#define VEC_hi_xmm9				xmm25
+#define VEC_hi_xmm10			xmm26
+#define VEC_hi_xmm11			xmm27
+#define VEC_hi_xmm12			xmm28
+#define VEC_hi_xmm13			xmm29
+#define VEC_hi_xmm14			xmm30
+#define VEC_hi_xmm15			xmm31
+
+#define VEC_hi_ymm0				ymm16
+#define VEC_hi_ymm1				ymm17
+#define VEC_hi_ymm2				ymm18
+#define VEC_hi_ymm3				ymm19
+#define VEC_hi_ymm4				ymm20
+#define VEC_hi_ymm5				ymm21
+#define VEC_hi_ymm6				ymm22
+#define VEC_hi_ymm7				ymm23
+#define VEC_hi_ymm8				ymm24
+#define VEC_hi_ymm9				ymm25
+#define VEC_hi_ymm10			ymm26
+#define VEC_hi_ymm11			ymm27
+#define VEC_hi_ymm12			ymm28
+#define VEC_hi_ymm13			ymm29
+#define VEC_hi_ymm14			ymm30
+#define VEC_hi_ymm15			ymm31
+
+#define VEC_hi_zmm0				zmm16
+#define VEC_hi_zmm1				zmm17
+#define VEC_hi_zmm2				zmm18
+#define VEC_hi_zmm3				zmm19
+#define VEC_hi_zmm4				zmm20
+#define VEC_hi_zmm5				zmm21
+#define VEC_hi_zmm6				zmm22
+#define VEC_hi_zmm7				zmm23
+#define VEC_hi_zmm8				zmm24
+#define VEC_hi_zmm9				zmm25
+#define VEC_hi_zmm10			zmm26
+#define VEC_hi_zmm11			zmm27
+#define VEC_hi_zmm12			zmm28
+#define VEC_hi_zmm13			zmm29
+#define VEC_hi_zmm14			zmm30
+#define VEC_hi_zmm15			zmm31
+
+#define PRIMITIVE_VEC(vec, num)		vec##num
+
+#define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
+#define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
+#define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
+
+#define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
+#define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
+#define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 000000000..f59c372b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
index 7e62621af..e306a77f5 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -1,4 +1,5 @@
 #define AS_WCSLEN
 #define strlen	__wcslen_sse4_1
+#define SECTION(p)	p##.sse4.1
 
 #include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
index 4e88c70cc..f467582cb 100644
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -1,5 +1,5 @@
 #define STRCMP __wcsncmp_avx2_rtm
 #define USE_AS_STRNCMP 1
 #define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
 #include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
index 4fa1de4d3..e9ede522b 100644
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
@@ -1,5 +1,5 @@
 #define STRCMP __wcsncmp_avx2
 #define USE_AS_STRNCMP 1
 #define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP	__wcscmp_avx2
 #include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 000000000..73dcf2f21
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index 5fa51fe07..d2f7dd6e2 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -1,5 +1,6 @@
 #define AS_WCSLEN
 #define AS_STRNLEN
 #define strlen	__wcsnlen_sse4_1
+#define SECTION(p)	p##.sse4.1
 
 #include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 78d1ca655..69d2f3cdb 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
deleted file mode 100644
index 46b6715e1..000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP  __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
new file mode 100644
index 000000000..f09192ed7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
@@ -0,0 +1,21 @@
+/* wmemcmp optimized with SSE2.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_WMEMCMP	1
+#define MEMCMP	__wmemcmp_sse2
+#include "../memcmp.S"
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e2ab59c55..99d8b36f1 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strcasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strcasecmp, strcasecmp)
@@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strncasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strncasecmp, strncasecmp)
@@ -146,22 +144,22 @@ ENTRY (STRCMP)
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-.Lbelowupper:
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-.Ltopupper:
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+.Lcase_add:
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	.Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
-	movdqa	.Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
-	movdqa	.Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+	movdqa	.Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+	movdqa	.Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+	movdqa	.Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
@@ -172,22 +170,18 @@ ENTRY (STRCMP)
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm8;					\
-	movdqa	UCHIGH_reg, %xmm9;				\
-	movdqa	reg2, %xmm10;					\
-	movdqa	UCHIGH_reg, %xmm11;				\
-	pcmpgtb	UCLOW_reg, %xmm8;				\
-	pcmpgtb	reg1, %xmm9;					\
-	pcmpgtb	UCLOW_reg, %xmm10;				\
-	pcmpgtb	reg2, %xmm11;					\
-	pand	%xmm9, %xmm8;					\
-	pand	%xmm11, %xmm10;					\
-	pand	LCQWORD_reg, %xmm8;				\
-	pand	LCQWORD_reg, %xmm10;				\
-	por	%xmm8, reg1;					\
-	por	%xmm10, reg2
-	TOLOWER (%xmm1, %xmm2)
+#  define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	movdqa	LCASE_MIN_reg, %xmm9;					\
+	paddb	reg1, %xmm8;					\
+	paddb	reg2, %xmm9;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	pandn	CASE_ADD_reg, %xmm9;					\
+	paddb	%xmm8, reg1;					\
+	paddb	%xmm9, reg2
+	TOLOWER	(%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
deleted file mode 100644
index f3cd86c60..000000000
--- a/sysdeps/x86_64/strcspn.S
+++ /dev/null
@@ -1,119 +0,0 @@
-/* strcspn (str, ss) -- Return the length of the initial segment of STR
-			which contains no characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-	.text
-ENTRY (strcspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup skipset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from skipset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 1(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 2(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 3(%rax), %cl	/* get byte from skipset */
-	addq $4, %rax		/* increment skipset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from skipset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the skipset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the skipset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(4)			/* yes => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(5)			/* yes => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* yes => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jne L(3)		/* no => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove skipset */
-	cfi_adjust_cfa_offset(-256)
-#ifdef USE_AS_STRPBRK
-	xorl %edx,%edx
-	orb %cl, %cl		/* was last character NUL? */
-	cmovzq %rdx, %rax	/* Yes:	return NULL */
-#else
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-#endif
-	ret
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
deleted file mode 100644
index 21888a5b9..000000000
--- a/sysdeps/x86_64/strpbrk.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define strcspn strpbrk
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 50d886713..4d7ba4ceb 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons as since we mask
+	   of zeros after first zero (instead of using the full
+	   comparison) we can't gurantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so much be new occurence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons as since we mask
+	   of zeros after first zero (instead of using the full
+	   comparison) we can't gurantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
deleted file mode 100644
index 61b76ee0a..000000000
--- a/sysdeps/x86_64/strspn.S
+++ /dev/null
@@ -1,112 +0,0 @@
-/* strspn (str, ss) -- Return the length of the initial segment of STR
-			which contains only characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-	.text
-ENTRY (strspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup stopset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 1(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 2(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 3(%rax), %cl	/* get byte from stopset */
-	addq $4, %rax		/* increment stopset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from stopset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the stopset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the stopset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(4)			/* no => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(5)			/* no => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* no => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jnz L(3)		/* yes => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove stopset */
-	cfi_adjust_cfa_offset(-256)
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-	ret
-END (strspn)
-libc_hidden_builtin_def (strspn)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index f14d50786..7f5defa4e 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -99,13 +99,31 @@ lose:									      \
    to avoid RTM abort triggered by VZEROUPPER inside transactionally.  */
 #define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
 	xtest;							\
-	jz	1f;						\
-	vzeroall;						\
+	jnz	1f;						\
+	vzeroupper;						\
 	ret;							\
 1:								\
-	vzeroupper;						\
+	vzeroall;						\
 	ret
 
+/* Can be used to replace vzeroupper that is not directly before a
+   return.  This is useful when hoisting a vzeroupper from multiple
+   return paths to decrease the total number of vzerouppers and code
+   size.  */
+#define COND_VZEROUPPER_XTEST							\
+    xtest;							\
+    jz 1f;							\
+    vzeroall;							\
+    jmp 2f;							\
+1:							\
+    vzeroupper;							\
+2:
+
+/* In RTM define this as COND_VZEROUPPER_XTEST.  */
+#ifndef COND_VZEROUPPER
+# define COND_VZEROUPPER vzeroupper
+#endif
+
 /* Zero upper vector registers and return.  */
 #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
 # define ZERO_UPPER_VEC_REGISTERS_RETURN \
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index c9165dbf0..d641141d7 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -40,82 +40,82 @@ ENTRY (__wcslen)
 	pxor	%xmm0, %xmm0
 
 	lea	32(%rdi), %rax
-	lea	16(%rdi), %rcx
+	addq	$16, %rdi
 	and	$-16, %rax
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	and	$-0x40, %rax
@@ -132,104 +132,100 @@ L(aligned_64_loop):
 	pminub	%xmm0, %xmm2
 	pcmpeqd	%xmm3, %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$64, %rax
 	test	%edx, %edx
-	lea	64(%rax), %rax
 	jz	L(aligned_64_loop)
 
 	pcmpeqd	-64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$48, %rdi
 	test	%edx, %edx
-	lea	48(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	%xmm1, %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	-32(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	%xmm6, %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
-	jnz	L(exit)
-
-	jmp	L(aligned_64_loop)
+	jz	L(aligned_64_loop)
 
 	.p2align 4
 L(exit):
-	sub	%rcx, %rax
+	sub	%rdi, %rax
 	shr	$2, %rax
 	test	%dl, %dl
 	jz	L(exit_high)
 
-	mov	%dl, %cl
-	and	$15, %cl
+	andl	$15, %edx
 	jz	L(exit_1)
 	ret
 
-	.p2align 4
+	/* No align here. Naturally aligned % 16 == 1.  */
 L(exit_high):
-	mov	%dh, %ch
-	and	$15, %ch
+	andl	$(15 << 8), %edx
 	jz	L(exit_3)
 	add	$2, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_1):
 	add	$1, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_3):
 	add	$3, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail0):
-	xor	%rax, %rax
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail1):
-	mov	$1, %rax
+	movl	$1, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail2):
-	mov	$2, %rax
+	movl	$2, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail3):
-	mov	$3, %rax
+	movl	$3, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail4):
-	mov	$4, %rax
+	movl	$4, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail5):
-	mov	$5, %rax
+	movl	$5, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail6):
-	mov	$6, %rax
+	movl	$6, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail7):
-	mov	$7, %rax
+	movl	$7, %eax
 	ret
 
 END (__wcslen)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 61552954d..2b80efc5e 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2011-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,266 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S
new file mode 100644
index 000000000..815b999e4
--- /dev/null
+++ b/sysdeps/x86_64/wmemcmp.S
@@ -0,0 +1,23 @@
+/* wmemcmp optimized with SSE2.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define MEMCMP	__wmemcmp
+#define USE_AS_WMEMCMP	1
+#include "memcmp.S"
+
+weak_alias (__wmemcmp, wmemcmp)
diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile
index df9a85f4a..9ec758233 100644
--- a/wcsmbs/Makefile
+++ b/wcsmbs/Makefile
@@ -22,8 +22,9 @@ subdir	:= wcsmbs
 
 include ../Makeconfig
 
-headers	:= wchar.h bits/wchar.h bits/wchar2.h bits/wchar-ldbl.h uchar.h \
-	   bits/types/__mbstate_t.h bits/types/mbstate_t.h bits/types/wint_t.h
+headers	:= wchar.h bits/wchar.h bits/wchar2.h bits/wchar2-decl.h \
+	   bits/wchar-ldbl.h uchar.h bits/types/__mbstate_t.h \
+	   bits/types/mbstate_t.h bits/types/wint_t.h
 
 routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
 	    wcsncmp wcsncpy wcspbrk wcsrchr wcsspn wcstok wcsstr wmemchr \
diff --git a/wcsmbs/bits/wchar2-decl.h b/wcsmbs/bits/wchar2-decl.h
new file mode 100644
index 000000000..8e1735c33
--- /dev/null
+++ b/wcsmbs/bits/wchar2-decl.h
@@ -0,0 +1,124 @@
+/* Checking macros for wchar functions.  Declarations only.
+   Copyright (C) 2004-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_WCHAR2_DECL_H
+#define _BITS_WCHAR2_DECL_H 1
+
+#ifndef _WCHAR_H
+# error "Never include <bits/wchar2-decl.h> directly; use <wchar.h> instead."
+#endif
+
+
+extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1,
+			       const wchar_t *__restrict __s2, size_t __n,
+			       size_t __ns1) __THROW;
+extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2,
+				size_t __n, size_t __ns1) __THROW;
+
+
+#ifdef __USE_GNU
+
+extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1,
+				const wchar_t *__restrict __s2, size_t __n,
+				size_t __ns1) __THROW;
+
+#endif
+
+
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+			       size_t __ns) __THROW;
+extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest,
+			      const wchar_t *__restrict __src,
+			      size_t __n) __THROW;
+extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest,
+			      const wchar_t *__restrict __src,
+			      size_t __destlen) __THROW;
+extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest,
+			       const wchar_t *__restrict __src, size_t __n,
+			       size_t __destlen) __THROW;
+extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest,
+			       const wchar_t *__restrict __src, size_t __n,
+			       size_t __destlen) __THROW;
+extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest,
+			      const wchar_t *__restrict __src,
+			      size_t __destlen) __THROW;
+extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest,
+			       const wchar_t *__restrict __src,
+			       size_t __n, size_t __destlen) __THROW;
+extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n,
+			   int __flag, size_t __s_len,
+			   const wchar_t *__restrict __format, ...)
+     __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */;
+extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n,
+			    int __flag, size_t __s_len,
+			    const wchar_t *__restrict __format,
+			    __gnuc_va_list __arg)
+     __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */;
+
+#if __USE_FORTIFY_LEVEL > 1
+
+extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag,
+			   const wchar_t *__restrict __format, ...);
+extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format,
+			  ...);
+extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag,
+			    const wchar_t *__restrict __format,
+			    __gnuc_va_list __ap);
+extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format,
+			   __gnuc_va_list __ap);
+
+#endif
+
+extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n,
+			      __FILE *__restrict __stream) __wur;
+
+#ifdef __USE_GNU
+
+extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size,
+				       int __n, __FILE *__restrict __stream)
+       __wur;
+
+#endif
+
+extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar,
+			     mbstate_t *__restrict __p,
+			     size_t __buflen) __THROW __wur;
+extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst,
+			       const char **__restrict __src,
+			       size_t __len, mbstate_t *__restrict __ps,
+			       size_t __dstlen) __THROW;
+extern size_t __wcsrtombs_chk (char *__restrict __dst,
+			       const wchar_t **__restrict __src,
+			       size_t __len, mbstate_t *__restrict __ps,
+			       size_t __dstlen) __THROW;
+
+#ifdef	__USE_XOPEN2K8
+
+extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst,
+				const char **__restrict __src, size_t __nmc,
+				size_t __len, mbstate_t *__restrict __ps,
+				size_t __dstlen) __THROW;
+extern size_t __wcsnrtombs_chk (char *__restrict __dst,
+				const wchar_t **__restrict __src,
+				size_t __nwc, size_t __len,
+				mbstate_t *__restrict __ps, size_t __dstlen)
+       __THROW;
+
+#endif
+
+#endif /* bits/wchar2-decl.h.  */
diff --git a/wcsmbs/bits/wchar2.h b/wcsmbs/bits/wchar2.h
index 0e017f458..3f110efe5 100644
--- a/wcsmbs/bits/wchar2.h
+++ b/wcsmbs/bits/wchar2.h
@@ -21,9 +21,6 @@
 #endif
 
 
-extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1,
-			       const wchar_t *__restrict __s2, size_t __n,
-			       size_t __ns1) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wmemcpy_alias,
 				(wchar_t *__restrict __s1,
 				 const wchar_t *__restrict __s2, size_t __n),
@@ -45,8 +42,6 @@ __NTH (wmemcpy (wchar_t *__restrict __s1, const wchar_t *__restrict __s2,
 }
 
 
-extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2,
-				size_t __n, size_t __ns1) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wmemmove_alias, (wchar_t *__s1,
 						   const wchar_t *__s2,
 						   size_t __n), wmemmove);
@@ -66,9 +61,6 @@ __NTH (wmemmove (wchar_t *__s1, const wchar_t *__s2, size_t __n))
 
 
 #ifdef __USE_GNU
-extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1,
-				const wchar_t *__restrict __s2, size_t __n,
-				size_t __ns1) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wmempcpy_alias,
 				(wchar_t *__restrict __s1,
 				 const wchar_t *__restrict __s2,
@@ -91,8 +83,6 @@ __NTH (wmempcpy (wchar_t *__restrict __s1, const wchar_t *__restrict __s2,
 #endif
 
 
-extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
-			       size_t __ns) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wmemset_alias, (wchar_t *__s, wchar_t __c,
 						  size_t __n), wmemset);
 extern wchar_t *__REDIRECT_NTH (__wmemset_chk_warn,
@@ -110,9 +100,6 @@ __NTH (wmemset (wchar_t *__s, wchar_t __c, size_t __n))
 }
 
 
-extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest,
-			      const wchar_t *__restrict __src,
-			      size_t __n) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcscpy_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src), wcscpy);
@@ -127,9 +114,6 @@ __NTH (wcscpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src))
 }
 
 
-extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest,
-			      const wchar_t *__restrict __src,
-			      size_t __destlen) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcpcpy_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src), wcpcpy);
@@ -144,9 +128,6 @@ __NTH (wcpcpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src))
 }
 
 
-extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest,
-			       const wchar_t *__restrict __src, size_t __n,
-			       size_t __destlen) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcsncpy_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src,
@@ -168,9 +149,6 @@ __NTH (wcsncpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src,
 }
 
 
-extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest,
-			       const wchar_t *__restrict __src, size_t __n,
-			       size_t __destlen) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcpncpy_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src,
@@ -192,9 +170,6 @@ __NTH (wcpncpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src,
 }
 
 
-extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest,
-			      const wchar_t *__restrict __src,
-			      size_t __destlen) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcscat_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src), wcscat);
@@ -209,9 +184,6 @@ __NTH (wcscat (wchar_t *__restrict __dest, const wchar_t *__restrict __src))
 }
 
 
-extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest,
-			       const wchar_t *__restrict __src,
-			       size_t __n, size_t __destlen) __THROW;
 extern wchar_t *__REDIRECT_NTH (__wcsncat_alias,
 				(wchar_t *__restrict __dest,
 				 const wchar_t *__restrict __src,
@@ -228,10 +200,6 @@ __NTH (wcsncat (wchar_t *__restrict __dest, const wchar_t *__restrict __src,
 }
 
 
-extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n,
-			   int __flag, size_t __s_len,
-			   const wchar_t *__restrict __format, ...)
-     __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */;
 
 extern int __REDIRECT_NTH_LDBL (__swprintf_alias,
 				(wchar_t *__restrict __s, size_t __n,
@@ -258,11 +226,6 @@ __NTH (swprintf (wchar_t *__restrict __s, size_t __n,
    : swprintf (s, n, __VA_ARGS__))
 #endif
 
-extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n,
-			    int __flag, size_t __s_len,
-			    const wchar_t *__restrict __format,
-			    __gnuc_va_list __arg)
-     __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */;
 
 extern int __REDIRECT_NTH_LDBL (__vswprintf_alias,
 				(wchar_t *__restrict __s, size_t __n,
@@ -283,16 +246,6 @@ __NTH (vswprintf (wchar_t *__restrict __s, size_t __n,
 
 #if __USE_FORTIFY_LEVEL > 1
 
-extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag,
-			   const wchar_t *__restrict __format, ...);
-extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format,
-			  ...);
-extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag,
-			    const wchar_t *__restrict __format,
-			    __gnuc_va_list __ap);
-extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format,
-			   __gnuc_va_list __ap);
-
 # ifdef __va_arg_pack
 __fortify_function int
 wprintf (const wchar_t *__restrict __fmt, ...)
@@ -328,8 +281,6 @@ vfwprintf (__FILE *__restrict __stream,
 
 #endif
 
-extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n,
-			      __FILE *__restrict __stream) __wur;
 extern wchar_t *__REDIRECT (__fgetws_alias,
 			    (wchar_t *__restrict __s, int __n,
 			     __FILE *__restrict __stream), fgetws) __wur;
@@ -351,9 +302,6 @@ fgetws (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream)
 }
 
 #ifdef __USE_GNU
-extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size,
-				       int __n, __FILE *__restrict __stream)
-  __wur;
 extern wchar_t *__REDIRECT (__fgetws_unlocked_alias,
 			    (wchar_t *__restrict __s, int __n,
 			     __FILE *__restrict __stream), fgetws_unlocked)
@@ -379,9 +327,6 @@ fgetws_unlocked (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream)
 #endif
 
 
-extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar,
-			     mbstate_t *__restrict __p,
-			     size_t __buflen) __THROW __wur;
 extern size_t __REDIRECT_NTH (__wcrtomb_alias,
 			      (char *__restrict __s, wchar_t __wchar,
 			       mbstate_t *__restrict __ps), wcrtomb) __wur;
@@ -404,10 +349,6 @@ __NTH (wcrtomb (char *__restrict __s, wchar_t __wchar,
 }
 
 
-extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst,
-			       const char **__restrict __src,
-			       size_t __len, mbstate_t *__restrict __ps,
-			       size_t __dstlen) __THROW;
 extern size_t __REDIRECT_NTH (__mbsrtowcs_alias,
 			      (wchar_t *__restrict __dst,
 			       const char **__restrict __src,
@@ -431,10 +372,6 @@ __NTH (mbsrtowcs (wchar_t *__restrict __dst, const char **__restrict __src,
 }
 
 
-extern size_t __wcsrtombs_chk (char *__restrict __dst,
-			       const wchar_t **__restrict __src,
-			       size_t __len, mbstate_t *__restrict __ps,
-			       size_t __dstlen) __THROW;
 extern size_t __REDIRECT_NTH (__wcsrtombs_alias,
 			      (char *__restrict __dst,
 			       const wchar_t **__restrict __src,
@@ -458,10 +395,6 @@ __NTH (wcsrtombs (char *__restrict __dst, const wchar_t **__restrict __src,
 
 
 #ifdef	__USE_XOPEN2K8
-extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst,
-				const char **__restrict __src, size_t __nmc,
-				size_t __len, mbstate_t *__restrict __ps,
-				size_t __dstlen) __THROW;
 extern size_t __REDIRECT_NTH (__mbsnrtowcs_alias,
 			      (wchar_t *__restrict __dst,
 			       const char **__restrict __src, size_t __nmc,
@@ -485,11 +418,6 @@ __NTH (mbsnrtowcs (wchar_t *__restrict __dst, const char **__restrict __src,
 }
 
 
-extern size_t __wcsnrtombs_chk (char *__restrict __dst,
-				const wchar_t **__restrict __src,
-				size_t __nwc, size_t __len,
-				mbstate_t *__restrict __ps, size_t __dstlen)
-     __THROW;
 extern size_t __REDIRECT_NTH (__wcsnrtombs_alias,
 			      (char *__restrict __dst,
 			       const wchar_t **__restrict __src,
diff --git a/wcsmbs/wchar.h b/wcsmbs/wchar.h
index 5d6a40853..c1321c751 100644
--- a/wcsmbs/wchar.h
+++ b/wcsmbs/wchar.h
@@ -864,14 +864,21 @@ extern size_t wcsftime_l (wchar_t *__restrict __s, size_t __maxsize,
 
 /* Define some macros helping to catch buffer overflows.  */
 #if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
-# include <bits/wchar2.h>
+/* Declare all functions from bits/wchar2-decl.h first.  */
+# include <bits/wchar2-decl.h>
 #endif
 
-#include <bits/floatn.h>
+/* The following headers provide asm redirections.  These redirections must
+   appear before the first usage of these functions, e.g. in bits/wchar.h.  */
 #if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
 # include <bits/wchar-ldbl.h>
 #endif
 
+#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Now include the function definitions and redirects too.  */
+# include <bits/wchar2.h>
+#endif
+
 __END_DECLS
 
 #endif /* wchar.h  */
-- 
2.30.2