From: GNU Libc Maintainers <debian-glibc@lists.debian.org>
Date: Sun, 10 Jul 2022 20:29:34 +0000 (+0100)
Subject: git-updates
X-Git-Tag: archive/raspbian/2.33-8+rpi1^2~115
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=e517cadf0d9acb8054a80ab29b9fc6429758b894;p=glibc.git

git-updates

GIT update of https://sourceware.org/git/glibc.git/release/2.33/master from glibc-2.33

GIT update of https://sourceware.org/git/glibc.git/release/2.33/master from glibc-2.33


Gbp-Pq: Name git-updates.diff
---

diff --git a/NEWS b/NEWS
index 71f5d2032..bd86d8971 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,66 @@ See the end for copying conditions.
 Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
 using `glibc' in the "product" field.
 
+Version 2.33.1
+
+Major new features:
+
+* The dynamic linker implements the --list-diagnostics option, printing
+  a dump of information related to IFUNC resolver operation and
+  glibc-hwcaps subdirectory selection.
+
+Security related changes:
+
+  CVE-2021-33574: The mq_notify function has a potential use-after-free
+  issue when using a notification type of SIGEV_THREAD and a thread
+  attribute with a non-default affinity mask.
+
+  CVE-2022-23219: Passing an overlong file name to the clnt_create
+  legacy function could result in a stack-based buffer overflow when
+  using the "unix" protocol.  Reported by Martin Sebor.
+
+  CVE-2022-23218: Passing an overlong file name to the svcunix_create
+  legacy function could result in a stack-based buffer overflow.
+
+  CVE-2021-3998: Passing a path longer than PATH_MAX to the realpath
+  function could result in a memory leak and potential access of
+  uninitialized memory.  Reported by Qualys.
+
+  CVE-2021-3999: Passing a buffer of size exactly 1 byte to the getcwd
+  function may result in an off-by-one buffer underflow and overflow
+  when the current working directory is longer than PATH_MAX and also
+  corresponds to the / directory through an unprivileged mount
+  namespace.  Reported by Qualys.
+
+The following bugs are resolved with this release:
+
+  [15271] dlfcn function failure after dlmopen terminates process
+  [18435] pthread_once hangs when init routine throws an exception
+  [22542] CVE-2022-23219: Buffer overflow in sunrpc clnt_create for "unix"
+  [23462] Static binary with dynamic string tokens ($LIB, $PLATFORM, $ORIGIN)
+    crashes
+  [27304] pthread_cond_destroy does not pass private flag to futex system calls
+  [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts
+  [27537] test-container: Always copy test-specific support files
+  [27577] elf/ld.so --help doesn't work
+  [27646] gethostbyname and NSS crashes after dlmopen
+  [27648] FAIL: misc/tst-select
+  [27651] Performance regression after updating to 2.33
+  [27706] select fails to update timeout on error
+  [27744] Support different libpthread/ld.so load orders for gdb -p
+  [27892] powerpc: scv ABI error handling fails to check IS_ERR_VALUE
+  [27974] Overflow bug in some implementation of wcsnlen, wmemchr, and wcsncat
+  [28353] Race condition in __opensock
+  [28607] Masked signals are delivered on thread exit
+  [28524] Conversion from ISO-2022-JP-3 with iconv may emit spurious NULs
+  [28532] powerpc64[le]: CFI for assembly templated syscalls is incorrect
+  [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex
+  [28768] CVE-2022-23218: Buffer overflow in sunrpc svcunix_create
+  [28769] CVE-2021-3999: Off-by-one buffer overflow/underflow in getcwd()
+  [28770] CVE-2021-3998: Unexpected return value from realpath() for too long results
+  [28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm
+    variants when avoiding overflow
+
 Version 2.33
 
 Major new features:
@@ -238,6 +298,7 @@ The following bugs are resolved with this release:
   [27237] malloc: deadlock in malloc/tst-malloc-stats-cancellation
   [27256] locale: Assertion failure in ISO-2022-JP-3 gconv module
     related to combining characters (CVE-2021-3326)
+  [28784] x86: crash in 32bit memset-sse2.s when the cache size can not be determined
 
 
 Version 2.32
diff --git a/config.h.in b/config.h.in
index 06ee8ae26..f21bf04e4 100644
--- a/config.h.in
+++ b/config.h.in
@@ -275,4 +275,10 @@
 /* Define if x86 ISA level should be included in shared libraries.  */
 #undef INCLUDE_X86_ISA_LEVEL
 
+/* Define if -msahf is enabled by default on x86.  */
+#undef HAVE_X86_LAHF_SAHF
+
+/* Define if -mmovbe is enabled by default on x86.  */
+#undef HAVE_X86_MOVBE
+
 #endif
diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c
index 48b4c25be..ff7c7b922 100644
--- a/dlfcn/dlerror.c
+++ b/dlfcn/dlerror.c
@@ -167,8 +167,17 @@ _dlerror_run (void (*operate) (void *), void *args)
       result->errstring = NULL;
     }
 
-  result->errcode = _dl_catch_error (&result->objname, &result->errstring,
-				     &result->malloced, operate, args);
+#ifdef SHARED
+  result->errcode = _dl_catch_error_ptr (&result->objname,
+					 &result->errstring,
+					 &result->malloced,
+					 operate, args);
+#else
+  result->errcode = _dl_catch_error (&result->objname,
+				     &result->errstring,
+				     &result->malloced,
+				     operate, args);
+#endif
 
   /* If no error we mark that no error string is available.  */
   result->returned = result->errstring == NULL;
diff --git a/elf/Makefile b/elf/Makefile
index 5d666b1b0..37944c189 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -66,7 +66,7 @@ elide-routines.os = $(all-dl-routines) dl-support enbl-secure dl-origin \
 # interpreter and operating independent of libc.
 rtld-routines	= rtld $(all-dl-routines) dl-sysdep dl-environ dl-minimal \
   dl-error-minimal dl-conflict dl-hwcaps dl-hwcaps_split dl-hwcaps-subdirs \
-  dl-usage
+  dl-usage dl-diagnostics dl-diagnostics-kernel dl-diagnostics-cpu
 all-rtld-routines = $(rtld-routines) $(sysdep-rtld-routines)
 
 CFLAGS-dl-runtime.c += -fexceptions -fasynchronous-unwind-tables
@@ -164,7 +164,8 @@ tests-static-normal := tst-leaks1-static tst-array1-static tst-array5-static \
 	       tst-dl-iter-static \
 	       tst-tlsalign-static tst-tlsalign-extern-static \
 	       tst-linkall-static tst-env-setuid tst-env-setuid-tunables \
-	       tst-single_threaded-static tst-single_threaded-pthread-static
+	       tst-single_threaded-static tst-single_threaded-pthread-static \
+	       tst-dst-static
 
 tests-static-internal := tst-tls1-static tst-tls2-static \
 	       tst-ptrguard1-static tst-stackguard1-static \
@@ -225,7 +226,8 @@ tests += restest1 preloadtest loadfail multiload origtest resolvfail \
 	 tst-audit14 tst-audit15 tst-audit16 \
 	 tst-single_threaded tst-single_threaded-pthread \
 	 tst-tls-ie tst-tls-ie-dlmopen argv0test \
-	 tst-glibc-hwcaps tst-glibc-hwcaps-prepend tst-glibc-hwcaps-mask
+	 tst-glibc-hwcaps tst-glibc-hwcaps-prepend tst-glibc-hwcaps-mask \
+	 tst-dlmopen-dlerror tst-dlmopen-gethostbyname
 #	 reldep9
 tests-internal += loadtest unload unload2 circleload1 \
 	 neededtest neededtest2 neededtest3 neededtest4 \
@@ -244,7 +246,7 @@ tests += $(tests-execstack-$(have-z-execstack))
 ifeq ($(run-built-tests),yes)
 tests-special += $(objpfx)tst-leaks1-mem.out \
 		 $(objpfx)tst-leaks1-static-mem.out $(objpfx)noload-mem.out \
-		 $(objpfx)tst-ldconfig-X.out
+		 $(objpfx)tst-ldconfig-X.out $(objpfx)tst-rtld-help.out
 endif
 tlsmod17a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
 tlsmod18a-suffixes = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
@@ -347,6 +349,9 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \
 		libmarkermod2-1 libmarkermod2-2 \
 		libmarkermod3-1 libmarkermod3-2 libmarkermod3-3 \
 		libmarkermod4-1 libmarkermod4-2 libmarkermod4-3 libmarkermod4-4 \
+		libmarkermod5-1 libmarkermod5-2 libmarkermod5-3 libmarkermod5-4 \
+		libmarkermod5-5 \
+		tst-dlmopen-dlerror-mod tst-dlmopen-gethostbyname-mod \
 
 # Most modules build with _ISOMAC defined, but those filtered out
 # depend on internal headers.
@@ -432,7 +437,8 @@ endif
 ifeq (yes,$(build-shared))
 ifeq ($(run-built-tests),yes)
 tests-special += $(objpfx)tst-pathopt.out $(objpfx)tst-rtld-load-self.out \
-		 $(objpfx)tst-rtld-preload.out $(objpfx)argv0test.out
+		 $(objpfx)tst-rtld-preload.out $(objpfx)argv0test.out \
+		 $(objpfx)tst-rtld-help.out
 endif
 tests-special += $(objpfx)check-textrel.out $(objpfx)check-execstack.out \
 		 $(objpfx)check-wx-segment.out \
@@ -678,6 +684,9 @@ CFLAGS-cache.c += $(SYSCONF-FLAGS)
 CFLAGS-rtld.c += $(SYSCONF-FLAGS)
 CFLAGS-dl-usage.c += $(SYSCONF-FLAGS) \
   -D'RTLD="$(rtlddir)/$(rtld-installed-name)"'
+CFLAGS-dl-diagnostics.c += $(SYSCONF-FLAGS) \
+  -D'PREFIX="$(prefix)"' \
+  -D'RTLD="$(rtlddir)/$(rtld-installed-name)"'
 
 cpp-srcs-left := $(all-rtld-routines:=.os)
 lib := rtld
@@ -1578,6 +1587,10 @@ $(objpfx)tst-sonamemove-dlopen.out: \
   $(objpfx)tst-sonamemove-runmod1.so \
   $(objpfx)tst-sonamemove-runmod2.so
 
+$(objpfx)tst-dlmopen-dlerror: $(libdl)
+$(objpfx)tst-dlmopen-dlerror-mod.so: $(libdl) $(libsupport)
+$(objpfx)tst-dlmopen-dlerror.out: $(objpfx)tst-dlmopen-dlerror-mod.so
+
 # Override -z defs, so that we can reference an undefined symbol.
 # Force lazy binding for the same reason.
 LDFLAGS-tst-latepthreadmod.so = \
@@ -1653,8 +1666,6 @@ $(objpfx)tst-nodelete-dlclose.out: $(objpfx)tst-nodelete-dlclose-dso.so \
 
 tst-env-setuid-ENV = MALLOC_CHECK_=2 MALLOC_MMAP_THRESHOLD_=4096 \
 		     LD_HWCAP_MASK=0x1
-tst-env-setuid-tunables-ENV = \
-	GLIBC_TUNABLES=glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096
 
 $(objpfx)tst-debug1: $(libdl)
 $(objpfx)tst-debug1.out: $(objpfx)tst-debug1mod1.so
@@ -1840,6 +1851,7 @@ LDFLAGS-libmarkermod1-1.so += -Wl,-soname,libmarkermod1.so
 LDFLAGS-libmarkermod2-1.so += -Wl,-soname,libmarkermod2.so
 LDFLAGS-libmarkermod3-1.so += -Wl,-soname,libmarkermod3.so
 LDFLAGS-libmarkermod4-1.so += -Wl,-soname,libmarkermod4.so
+LDFLAGS-libmarkermod5-1.so += -Wl,-soname,libmarkermod5.so
 $(objpfx)libmarkermod%.os : markermodMARKER-VALUE.c
 	$(compile-command.c) \
 	  -DMARKER=marker$(firstword $(subst -, ,$*)) \
@@ -1852,6 +1864,8 @@ $(objpfx)libmarkermod3.so: $(objpfx)libmarkermod3-1.so
 	cp $< $@
 $(objpfx)libmarkermod4.so: $(objpfx)libmarkermod4-1.so
 	cp $< $@
+$(objpfx)libmarkermod5.so: $(objpfx)libmarkermod5-1.so
+	cp $< $@
 
 # tst-glibc-hwcaps-prepend checks that --glibc-hwcaps-prepend is
 # preferred over auto-detected subdirectories.
@@ -1902,3 +1916,21 @@ $(objpfx)list-tunables.out: tst-rtld-list-tunables.sh $(objpfx)ld.so
 	cmp tst-rtld-list-tunables.exp \
 	    $(objpfx)/tst-rtld-list-tunables.out > $@; \
 	$(evaluate-test)
+
+tst-dst-static-ENV = LD_LIBRARY_PATH='$$ORIGIN'
+
+$(objpfx)tst-rtld-help.out: $(objpfx)ld.so
+	$(test-wrapper) $(rtld-prefix) --help > $@; \
+	status=$$?; \
+	echo "info: ld.so exit status: $$status" >> $@; \
+	if ! grep -q 'Legacy HWCAP subdirectories under library search path directories' $@; then \
+	  echo "error: missing subdirectory pattern" >> $@; \
+	  if test $$status -eq 0; then \
+	    status=1; \
+	  fi; \
+	fi; \
+	(exit $$status); \
+	$(evaluate-test)
+
+$(objpfx)tst-dlmopen-gethostbyname: $(libdl)
+$(objpfx)tst-dlmopen-gethostbyname.out: $(objpfx)tst-dlmopen-gethostbyname-mod.so
diff --git a/elf/Versions b/elf/Versions
index be88c48e6..cdfd7b4d2 100644
--- a/elf/Versions
+++ b/elf/Versions
@@ -72,7 +72,7 @@ ld {
 
     # Internal error handling support.  Interposed by libc.so.
     _dl_signal_exception; _dl_catch_exception;
-    _dl_signal_error; _dl_catch_error;
+    _dl_signal_error; _dl_catch_error; _dl_catch_error_ptr;
 
     # Set value of a tunable.
     __tunable_get_val;
diff --git a/elf/dl-cache.c b/elf/dl-cache.c
index 32f3bef5e..2b8da8650 100644
--- a/elf/dl-cache.c
+++ b/elf/dl-cache.c
@@ -269,81 +269,77 @@ search_cache (const char *string_table, uint32_t string_table_size,
 	      if (_dl_cache_check_flags (flags)
 		  && _dl_cache_verify_ptr (lib->value, string_table_size))
 		{
-		  if (best == NULL || flags == GLRO (dl_correct_cache_id))
-		    {
-		      /* Named/extension hwcaps get slightly different
-			 treatment: We keep searching for a better
-			 match.  */
-		      bool named_hwcap = false;
+		  /* Named/extension hwcaps get slightly different
+		     treatment: We keep searching for a better
+		     match.  */
+		  bool named_hwcap = false;
 
-		      if (entry_size >= sizeof (struct file_entry_new))
-			{
-			  /* The entry is large enough to include
-			     HWCAP data.  Check it.  */
-			  struct file_entry_new *libnew
-			    = (struct file_entry_new *) lib;
+		  if (entry_size >= sizeof (struct file_entry_new))
+		    {
+		      /* The entry is large enough to include
+			 HWCAP data.  Check it.  */
+		      struct file_entry_new *libnew
+			= (struct file_entry_new *) lib;
 
 #ifdef SHARED
-			  named_hwcap = dl_cache_hwcap_extension (libnew);
-			  if (named_hwcap
-			      && !dl_cache_hwcap_isa_level_compatible (libnew))
-			    continue;
+		      named_hwcap = dl_cache_hwcap_extension (libnew);
+		      if (named_hwcap
+			  && !dl_cache_hwcap_isa_level_compatible (libnew))
+			continue;
 #endif
 
-			  /* The entries with named/extension hwcaps
-			     have been exhausted.  Return the best
-			     match encountered so far if there is
-			     one.  */
-			  if (!named_hwcap && best != NULL)
-			    break;
+		      /* The entries with named/extension hwcaps have
+			 been exhausted (they are listed before all
+			 other entries).  Return the best match
+			 encountered so far if there is one.  */
+		      if (!named_hwcap && best != NULL)
+			break;
 
-			  if ((libnew->hwcap & hwcap_exclude) && !named_hwcap)
-			    continue;
-			  if (GLRO (dl_osversion)
-			      && libnew->osversion > GLRO (dl_osversion))
-			    continue;
-			  if (_DL_PLATFORMS_COUNT
-			      && (libnew->hwcap & _DL_HWCAP_PLATFORM) != 0
-			      && ((libnew->hwcap & _DL_HWCAP_PLATFORM)
-				  != platform))
-			    continue;
+		      if ((libnew->hwcap & hwcap_exclude) && !named_hwcap)
+			continue;
+		      if (GLRO (dl_osversion)
+			  && libnew->osversion > GLRO (dl_osversion))
+			continue;
+		      if (_DL_PLATFORMS_COUNT
+			  && (libnew->hwcap & _DL_HWCAP_PLATFORM) != 0
+			  && ((libnew->hwcap & _DL_HWCAP_PLATFORM)
+			      != platform))
+			continue;
 
 #ifdef SHARED
-			  /* For named hwcaps, determine the priority
-			     and see if beats what has been found so
-			     far.  */
-			  if (named_hwcap)
-			    {
-			      uint32_t entry_priority
-				= glibc_hwcaps_priority (libnew->hwcap);
-			      if (entry_priority == 0)
-				/* Not usable at all.  Skip.  */
-				continue;
-			      else if (best == NULL
-				       || entry_priority < best_priority)
-				/* This entry is of higher priority
-				   than the previous one, or it is the
-				   first entry.  */
-				best_priority = entry_priority;
-			      else
-				/* An entry has already been found,
-				   but it is a better match.  */
-				continue;
-			    }
-#endif /* SHARED */
+		      /* For named hwcaps, determine the priority and
+			 see if beats what has been found so far.  */
+		      if (named_hwcap)
+			{
+			  uint32_t entry_priority
+			    = glibc_hwcaps_priority (libnew->hwcap);
+			  if (entry_priority == 0)
+			    /* Not usable at all.  Skip.  */
+			    continue;
+			  else if (best == NULL
+				   || entry_priority < best_priority)
+			    /* This entry is of higher priority
+			       than the previous one, or it is the
+			       first entry.  */
+			    best_priority = entry_priority;
+			  else
+			    /* An entry has already been found,
+			       but it is a better match.  */
+			    continue;
 			}
+#endif /* SHARED */
+		    }
 
-		      best = string_table + lib->value;
+		  best = string_table + lib->value;
 
-		      if (flags == GLRO (dl_correct_cache_id)
-			  && !named_hwcap)
-			/* We've found an exact match for the shared
-			   object and no general `ELF' release.  Stop
-			   searching, but not if a named (extension)
-			   hwcap is used.  In this case, an entry with
-			   a higher priority may come up later.  */
-			break;
-		    }
+		  if (!named_hwcap && flags == _DL_CACHE_DEFAULT_ID)
+		    /* With named hwcaps, we need to keep searching to
+		       see if we find a better match.  A better match
+		       is also possible if the flags of the current
+		       entry do not match the expected cache flags.
+		       But if the flags match, no better entry will be
+		       found.  */
+		    break;
 		}
 	    }
 	  while (++middle <= right);
diff --git a/elf/dl-diagnostics-cpu.c b/elf/dl-diagnostics-cpu.c
new file mode 100644
index 000000000..f7d149764
--- /dev/null
+++ b/elf/dl-diagnostics-cpu.c
@@ -0,0 +1,24 @@
+/* Print CPU diagnostics data in ld.so.  Stub version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-diagnostics.h>
+
+void
+_dl_diagnostics_cpu (void)
+{
+}
diff --git a/elf/dl-diagnostics-kernel.c b/elf/dl-diagnostics-kernel.c
new file mode 100644
index 000000000..831c358f1
--- /dev/null
+++ b/elf/dl-diagnostics-kernel.c
@@ -0,0 +1,24 @@
+/* Print kernel diagnostics data in ld.so.  Stub version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-diagnostics.h>
+
+void
+_dl_diagnostics_kernel (void)
+{
+}
diff --git a/elf/dl-diagnostics.c b/elf/dl-diagnostics.c
new file mode 100644
index 000000000..bef224b36
--- /dev/null
+++ b/elf/dl-diagnostics.c
@@ -0,0 +1,265 @@
+/* Print diagnostics data in ld.so.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <gnu/lib-names.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <unistd.h>
+
+#include <dl-diagnostics.h>
+#include <dl-hwcaps.h>
+#include <dl-main.h>
+#include <dl-procinfo.h>
+#include <dl-sysdep.h>
+#include <ldsodefs.h>
+#include "trusted-dirs.h"
+#include "version.h"
+
+/* Write CH to standard output.  */
+static void
+_dl_putc (char ch)
+{
+  _dl_write (STDOUT_FILENO, &ch, 1);
+}
+
+/* Print CH to standard output, quoting it if necessary.  */
+static void
+print_quoted_char (char ch)
+{
+  if (ch < ' ' || ch > '~')
+    {
+      char buf[4];
+      buf[0] = '\\';
+      buf[1] = '0' + ((ch >> 6) & 7);
+      buf[2] = '0' + ((ch >> 6) & 7);
+      buf[3] = '0' + (ch & 7);
+      _dl_write (STDOUT_FILENO, buf, 4);
+    }
+  else
+    {
+      if (ch == '\\' || ch == '"')
+        _dl_putc ('\\');
+      _dl_putc (ch);
+    }
+}
+
+/* Print S of LEN bytes to standard output, quoting characters as
+   needed.  */
+static void
+print_string_length (const char *s, size_t len)
+{
+  _dl_putc ('"');
+  for (size_t i = 0; i < len; ++i)
+    print_quoted_char (s[i]);
+  _dl_putc ('"');
+}
+
+void
+_dl_diagnostics_print_string (const char *s)
+{
+  if (s == NULL)
+    {
+      _dl_printf ("0x0");
+      return;
+    }
+
+  _dl_putc ('"');
+  while (*s != '\0')
+    {
+      print_quoted_char (*s);
+      ++s;
+    }
+  _dl_putc ('"');
+}
+
+void
+_dl_diagnostics_print_labeled_string (const char *label, const char *s)
+{
+  _dl_printf ("%s=", label);
+  _dl_diagnostics_print_string (s);
+  _dl_putc ('\n');
+}
+
+void
+_dl_diagnostics_print_labeled_value (const char *label, uint64_t value)
+{
+  if (sizeof (value) == sizeof (unsigned long int))
+    /* _dl_printf can print 64-bit values directly.  */
+    _dl_printf ("%s=0x%lx\n", label, (unsigned long int) value);
+  else
+    {
+      uint32_t high = value >> 32;
+      uint32_t low = value;
+      if (high == 0)
+        _dl_printf ("%s=0x%x\n", label, low);
+      else
+        _dl_printf ("%s=0x%x%08x\n", label, high, low);
+    }
+}
+
+/* Return true if ENV is an unfiltered environment variable.  */
+static bool
+unfiltered_envvar (const char *env, size_t *name_length)
+{
+  char *env_equal = strchr (env, '=');
+  if (env_equal == NULL)
+    {
+      /* Always dump malformed entries.  */
+      *name_length = strlen (env);
+      return true;
+    }
+  size_t envname_length = env_equal - env;
+  *name_length = envname_length;
+
+  /* LC_ and LD_ variables.  */
+  if (env[0] == 'L' && (env[1] == 'C' || env[1] == 'D')
+      && env[2] == '_')
+    return true;
+
+  /* MALLOC_ variables.  */
+  if (strncmp (env, "MALLOC_", strlen ("MALLOC_")) == 0)
+    return true;
+
+  static const char unfiltered[] =
+    "DATEMSK\0"
+    "GCONV_PATH\0"
+    "GETCONF_DIR\0"
+    "GETCONF_DIR\0"
+    "GLIBC_TUNABLES\0"
+    "GMON_OUTPUT_PREFIX\0"
+    "HESIOD_CONFIG\0"
+    "HES_DOMAIN\0"
+    "HOSTALIASES\0"
+    "I18NPATH\0"
+    "IFS\0"
+    "LANG\0"
+    "LOCALDOMAIN\0"
+    "LOCPATH\0"
+    "MSGVERB\0"
+    "NIS_DEFAULTS\0"
+    "NIS_GROUP\0"
+    "NIS_PATH\0"
+    "NLSPATH\0"
+    "PATH\0"
+    "POSIXLY_CORRECT\0"
+    "RESOLV_HOST_CONF\0"
+    "RES_OPTIONS\0"
+    "SEV_LEVEL\0"
+    "TMPDIR\0"
+    "TZ\0"
+    "TZDIR\0"
+    /* Two null bytes at the end to mark the end of the list via an
+       empty substring.  */
+    ;
+  for (const char *candidate = unfiltered; *candidate != '\0'; )
+    {
+      size_t candidate_length = strlen (candidate);
+      if (candidate_length == envname_length
+          && memcmp (candidate, env, candidate_length) == 0)
+        return true;
+      candidate += candidate_length + 1;
+    }
+
+  return false;
+}
+
+/* Dump the process environment.  */
+static void
+print_environ (char **environ)
+{
+  unsigned int index = 0;
+  for (char **envp = environ; *envp != NULL; ++envp)
+    {
+      char *env = *envp;
+      size_t name_length;
+      bool unfiltered = unfiltered_envvar (env, &name_length);
+      _dl_printf ("env%s[0x%x]=",
+                  unfiltered ? "" : "_filtered", index);
+      if (unfiltered)
+        _dl_diagnostics_print_string (env);
+      else
+        print_string_length (env, name_length);
+      _dl_putc ('\n');
+      ++index;
+    }
+}
+
+/* Print configured paths and the built-in search path.  */
+static void
+print_paths (void)
+{
+  _dl_diagnostics_print_labeled_string ("path.prefix", PREFIX);
+  _dl_diagnostics_print_labeled_string ("path.rtld", RTLD);
+  _dl_diagnostics_print_labeled_string ("path.sysconfdir", SYSCONFDIR);
+
+  unsigned int index = 0;
+  static const char *system_dirs = SYSTEM_DIRS "\0";
+  for (const char *e = system_dirs; *e != '\0'; )
+    {
+      size_t len = strlen (e);
+      _dl_printf ("path.system_dirs[0x%x]=", index);
+      print_string_length (e, len);
+      _dl_putc ('\n');
+      ++index;
+      e += len + 1;
+    }
+}
+
+/* Print information about the glibc version.  */
+static void
+print_version (void)
+{
+  _dl_diagnostics_print_labeled_string ("version.release", RELEASE);
+  _dl_diagnostics_print_labeled_string ("version.version", VERSION);
+}
+
+void
+_dl_print_diagnostics (char **environ)
+{
+#ifdef HAVE_DL_DISCOVER_OSVERSION
+  _dl_diagnostics_print_labeled_value
+    ("dl_discover_osversion", _dl_discover_osversion ());
+#endif
+  _dl_diagnostics_print_labeled_string ("dl_dst_lib", DL_DST_LIB);
+  _dl_diagnostics_print_labeled_value ("dl_hwcap", GLRO (dl_hwcap));
+  _dl_diagnostics_print_labeled_value ("dl_hwcap_important", HWCAP_IMPORTANT);
+  _dl_diagnostics_print_labeled_value ("dl_hwcap2", GLRO (dl_hwcap2));
+  _dl_diagnostics_print_labeled_string
+    ("dl_hwcaps_subdirs", _dl_hwcaps_subdirs);
+  _dl_diagnostics_print_labeled_value
+    ("dl_hwcaps_subdirs_active", _dl_hwcaps_subdirs_active ());
+  _dl_diagnostics_print_labeled_value ("dl_osversion", GLRO (dl_osversion));
+  _dl_diagnostics_print_labeled_value ("dl_pagesize", GLRO (dl_pagesize));
+  _dl_diagnostics_print_labeled_string ("dl_platform", GLRO (dl_platform));
+  _dl_diagnostics_print_labeled_string
+    ("dl_profile_output", GLRO (dl_profile_output));
+  _dl_diagnostics_print_labeled_value
+    ("dl_string_platform", _dl_string_platform ( GLRO (dl_platform)));
+
+  _dl_diagnostics_print_labeled_string ("dso.ld", LD_SO);
+  _dl_diagnostics_print_labeled_string ("dso.libc", LIBC_SO);
+
+  print_environ (environ);
+  print_paths ();
+  print_version ();
+
+  _dl_diagnostics_kernel ();
+  _dl_diagnostics_cpu ();
+
+  _exit (EXIT_SUCCESS);
+}
diff --git a/elf/dl-diagnostics.h b/elf/dl-diagnostics.h
new file mode 100644
index 000000000..27dcb12bc
--- /dev/null
+++ b/elf/dl-diagnostics.h
@@ -0,0 +1,46 @@
+/* Interfaces for printing diagnostics in ld.so.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_DIAGNOSTICS_H
+#define _DL_DIAGNOSTICS_H
+
+#include <stdint.h>
+
+/* Write the null-terminated string to standard output, surrounded in
+   quotation marks.  */
+void _dl_diagnostics_print_string (const char *s) attribute_hidden;
+
+/* Like _dl_diagnostics_print_string, but add a LABEL= prefix, and a
+   newline character as a suffix.  */
+void _dl_diagnostics_print_labeled_string (const char *label, const char *s)
+  attribute_hidden;
+
+/* Print LABEL=VALUE to standard output, followed by a newline
+   character.  */
+void _dl_diagnostics_print_labeled_value (const char *label, uint64_t value)
+  attribute_hidden;
+
+/* Print diagnostics data for the kernel.  Called from
+   _dl_print_diagnostics.  */
+void _dl_diagnostics_kernel (void) attribute_hidden;
+
+/* Print diagnostics data for the CPU(s).  Called from
+   _dl_print_diagnostics.  */
+void _dl_diagnostics_cpu (void) attribute_hidden;
+
+#endif /* _DL_DIAGNOSTICS_H */
diff --git a/elf/dl-error-skeleton.c b/elf/dl-error-skeleton.c
index 2fd62777c..0de505f25 100644
--- a/elf/dl-error-skeleton.c
+++ b/elf/dl-error-skeleton.c
@@ -248,4 +248,19 @@ _dl_receive_error (receiver_fct fct, void (*operate) (void *), void *args)
   catch_hook = old_catch;
   receiver = old_receiver;
 }
+
+/* Forwarder used for initializing _dl_catch_error_ptr.  */
+int
+_rtld_catch_error (const char **objname, const char **errstring,
+		   bool *mallocedp, void (*operate) (void *),
+		   void *args)
+{
+  /* The reference to _dl_catch_error will eventually be relocated to
+     point to the implementation in libc.so.  */
+  return _dl_catch_error (objname, errstring, mallocedp, operate, args);
+}
+
+__typeof (_dl_catch_error) *_dl_catch_error_ptr = _rtld_catch_error;
+rtld_hidden_data_def (_dl_catch_error_ptr);
+
 #endif /* DL_ERROR_BOOTSTRAP */
diff --git a/elf/dl-libc.c b/elf/dl-libc.c
index ed551f6e5..e2236d87d 100644
--- a/elf/dl-libc.c
+++ b/elf/dl-libc.c
@@ -43,9 +43,15 @@ dlerror_run (void (*operate) (void *), void *args)
   const char *last_errstring = NULL;
   bool malloced;
 
+#ifdef SHARED
+  int result = (_dl_catch_error_ptr (&objname, &last_errstring, &malloced,
+				     operate, args)
+		?: last_errstring != NULL);
+#else
   int result = (_dl_catch_error (&objname, &last_errstring, &malloced,
-				operate, args)
+				 operate, args)
 		?: last_errstring != NULL);
+#endif
 
   if (result && malloced)
     free ((char *) last_errstring);
diff --git a/elf/dl-load.c b/elf/dl-load.c
index 9e2089cfa..639d78083 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -758,50 +758,51 @@ _dl_init_paths (const char *llp, const char *source,
   max_dirnamelen = SYSTEM_DIRS_MAX_LEN;
   *aelem = NULL;
 
-#ifdef SHARED
-  /* This points to the map of the main object.  */
+  /* This points to the map of the main object.  If there is no main
+     object (e.g., under --help, use the dynamic loader itself as a
+     stand-in.  */
   l = GL(dl_ns)[LM_ID_BASE]._ns_loaded;
-  if (l != NULL)
+#ifdef SHARED
+  if (l == NULL)
+    l = &GL (dl_rtld_map);
+#endif
+  assert (l->l_type != lt_loaded);
+
+  if (l->l_info[DT_RUNPATH])
+    {
+      /* Allocate room for the search path and fill in information
+	 from RUNPATH.  */
+      decompose_rpath (&l->l_runpath_dirs,
+		       (const void *) (D_PTR (l, l_info[DT_STRTAB])
+				       + l->l_info[DT_RUNPATH]->d_un.d_val),
+		       l, "RUNPATH");
+      /* During rtld init the memory is allocated by the stub malloc,
+	 prevent any attempt to free it by the normal malloc.  */
+      l->l_runpath_dirs.malloced = 0;
+
+      /* The RPATH is ignored.  */
+      l->l_rpath_dirs.dirs = (void *) -1;
+    }
+  else
     {
-      assert (l->l_type != lt_loaded);
+      l->l_runpath_dirs.dirs = (void *) -1;
 
-      if (l->l_info[DT_RUNPATH])
+      if (l->l_info[DT_RPATH])
 	{
 	  /* Allocate room for the search path and fill in information
-	     from RUNPATH.  */
-	  decompose_rpath (&l->l_runpath_dirs,
+	     from RPATH.  */
+	  decompose_rpath (&l->l_rpath_dirs,
 			   (const void *) (D_PTR (l, l_info[DT_STRTAB])
-					   + l->l_info[DT_RUNPATH]->d_un.d_val),
-			   l, "RUNPATH");
-	  /* During rtld init the memory is allocated by the stub malloc,
-	     prevent any attempt to free it by the normal malloc.  */
-	  l->l_runpath_dirs.malloced = 0;
-
-	  /* The RPATH is ignored.  */
-	  l->l_rpath_dirs.dirs = (void *) -1;
+					   + l->l_info[DT_RPATH]->d_un.d_val),
+			   l, "RPATH");
+	  /* During rtld init the memory is allocated by the stub
+	     malloc, prevent any attempt to free it by the normal
+	     malloc.  */
+	  l->l_rpath_dirs.malloced = 0;
 	}
       else
-	{
-	  l->l_runpath_dirs.dirs = (void *) -1;
-
-	  if (l->l_info[DT_RPATH])
-	    {
-	      /* Allocate room for the search path and fill in information
-		 from RPATH.  */
-	      decompose_rpath (&l->l_rpath_dirs,
-			       (const void *) (D_PTR (l, l_info[DT_STRTAB])
-					       + l->l_info[DT_RPATH]->d_un.d_val),
-			       l, "RPATH");
-	      /* During rtld init the memory is allocated by the stub
-		 malloc, prevent any attempt to free it by the normal
-		 malloc.  */
-	      l->l_rpath_dirs.malloced = 0;
-	    }
-	  else
-	    l->l_rpath_dirs.dirs = (void *) -1;
-	}
+	l->l_rpath_dirs.dirs = (void *) -1;
     }
-#endif	/* SHARED */
 
   if (llp != NULL && *llp != '\0')
     {
@@ -1113,6 +1114,7 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
     struct loadcmd loadcmds[l->l_phnum];
     size_t nloadcmds = 0;
     bool has_holes = false;
+    bool empty_dynamic = false;
 
     /* The struct is initialized to zero so this is not necessary:
     l->l_ld = 0;
@@ -1125,7 +1127,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
 	     segments are mapped in.  We record the addresses it says
 	     verbatim, and later correct for the run-time load address.  */
 	case PT_DYNAMIC:
-	  if (ph->p_filesz)
+	  if (ph->p_filesz == 0)
+	    empty_dynamic = true; /* Usually separate debuginfo.  */
+	  else
 	    {
 	      /* Debuginfo only files from "objcopy --only-keep-debug"
 		 contain a PT_DYNAMIC segment with p_filesz == 0.  Skip
@@ -1247,6 +1251,13 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
 	goto lose;
       }
 
+    /* This check recognizes most separate debuginfo files.  */
+    if (__glibc_unlikely ((l->l_ld == 0 && type == ET_DYN) || empty_dynamic))
+      {
+	errstring = N_("object file has no dynamic section");
+	goto lose;
+      }
+
     /* Length of the sections to be loaded.  */
     maplength = loadcmds[nloadcmds - 1].allocend - loadcmds[0].mapstart;
 
@@ -1264,15 +1275,7 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
       }
   }
 
-  if (l->l_ld == 0)
-    {
-      if (__glibc_unlikely (type == ET_DYN))
-	{
-	  errstring = N_("object file has no dynamic section");
-	  goto lose;
-	}
-    }
-  else
+  if (l->l_ld != 0)
     l->l_ld = (ElfW(Dyn) *) ((ElfW(Addr)) l->l_ld + l->l_addr);
 
   elf_get_dynamic_info (l, NULL);
diff --git a/elf/dl-main.h b/elf/dl-main.h
index 3a5e13c73..d3820e006 100644
--- a/elf/dl-main.h
+++ b/elf/dl-main.h
@@ -63,7 +63,7 @@ struct audit_list
 enum rtld_mode
   {
     rtld_mode_normal, rtld_mode_list, rtld_mode_verify, rtld_mode_trace,
-    rtld_mode_list_tunables, rtld_mode_help,
+    rtld_mode_list_tunables, rtld_mode_list_diagnostics, rtld_mode_help,
   };
 
 /* Aggregated state information extracted from environment variables
@@ -121,4 +121,7 @@ _Noreturn void _dl_version (void) attribute_hidden;
 _Noreturn void _dl_help (const char *argv0, struct dl_main_state *state)
   attribute_hidden;
 
+/* Print a diagnostics dump.  */
+_Noreturn void _dl_print_diagnostics (char **environ) attribute_hidden;
+
 #endif /* _DL_MAIN */
diff --git a/elf/dl-open.c b/elf/dl-open.c
index ab7aaa345..1b965457c 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -881,7 +881,7 @@ no more namespaces available for dlmopen()"));
       /* Avoid keeping around a dangling reference to the libc.so link
 	 map in case it has been cached in libc_map.  */
       if (!args.libc_already_loaded)
-	GL(dl_ns)[nsid].libc_map = NULL;
+	GL(dl_ns)[args.nsid].libc_map = NULL;
 
       /* Remove the object from memory.  It may be in an inconsistent
 	 state if relocation failed, for example.  */
diff --git a/elf/dl-tunable-types.h b/elf/dl-tunable-types.h
index 3fcc0806f..39bf738d9 100644
--- a/elf/dl-tunable-types.h
+++ b/elf/dl-tunable-types.h
@@ -38,8 +38,8 @@ typedef enum
 typedef struct
 {
   tunable_type_code_t type_code;
-  int64_t min;
-  int64_t max;
+  tunable_num_t min;
+  tunable_num_t max;
 } tunable_type_t;
 
 /* Security level for tunables.  This decides what to do with individual
@@ -81,4 +81,21 @@ struct _tunable
 
 typedef struct _tunable tunable_t;
 
+static __always_inline bool
+unsigned_tunable_type (tunable_type_code_t t)
+{
+  switch (t)
+    {
+    case TUNABLE_TYPE_INT_32:
+      return false;
+    case TUNABLE_TYPE_UINT_64:
+    case TUNABLE_TYPE_SIZE_T:
+      return true;
+    case TUNABLE_TYPE_STRING:
+    default:
+      break;
+    }
+  __builtin_unreachable ();
+}
+
 #endif
diff --git a/elf/dl-tunables.c b/elf/dl-tunables.c
index b1a50b846..8009e54ee 100644
--- a/elf/dl-tunables.c
+++ b/elf/dl-tunables.c
@@ -93,88 +93,49 @@ get_next_env (char **envp, char **name, size_t *namelen, char **val,
   return NULL;
 }
 
-#define TUNABLE_SET_VAL_IF_VALID_RANGE(__cur, __val, __type)		      \
-({									      \
-  __type min = (__cur)->type.min;					      \
-  __type max = (__cur)->type.max;					      \
-									      \
-  if ((__type) (__val) >= min && (__type) (__val) <= max)		      \
-    {									      \
-      (__cur)->val.numval = (__val);					      \
-      (__cur)->initialized = true;					      \
-    }									      \
-})
-
-#define TUNABLE_SET_BOUNDS_IF_VALID(__cur, __minp, __maxp, __type)	      \
-({									      \
-  if (__minp != NULL)							      \
-    {									      \
-      /* MIN is specified.  */						      \
-      __type min = *((__type *) __minp);				      \
-      if (__maxp != NULL)						      \
-	{								      \
-	   /* Both MIN and MAX are specified.  */			      \
-	    __type max = *((__type *) __maxp);				      \
-	  if (max >= min						      \
-	      && max <= (__cur)->type.max				      \
-	      && min >= (__cur)->type.min)				      \
-	    {								      \
-	      (__cur)->type.min = min;					      \
-	      (__cur)->type.max = max;					      \
-	    }								      \
-	}								      \
-      else if (min > (__cur)->type.min && min <= (__cur)->type.max)	      \
-	{								      \
-	  /* Only MIN is specified.  */					      \
-	  (__cur)->type.min = min;					      \
-	}								      \
-    }									      \
-  else if (__maxp != NULL)						      \
-    {									      \
-      /* Only MAX is specified.  */					      \
-      __type max = *((__type *) __maxp);				      \
-      if (max < (__cur)->type.max && max >= (__cur)->type.min)		      \
-	(__cur)->type.max = max;					      \
-    }									      \
-})
-
 static void
-do_tunable_update_val (tunable_t *cur, const void *valp,
-		       const void *minp, const void *maxp)
+do_tunable_update_val (tunable_t *cur, const tunable_val_t *valp,
+		       const tunable_num_t *minp,
+		       const tunable_num_t *maxp)
 {
-  uint64_t val;
+  tunable_num_t val, min, max;
 
-  if (cur->type.type_code != TUNABLE_TYPE_STRING)
-    val = *((int64_t *) valp);
+  if (cur->type.type_code == TUNABLE_TYPE_STRING)
+    {
+      cur->val.strval = valp->strval;
+      cur->initialized = true;
+      return;
+    }
 
-  switch (cur->type.type_code)
+  bool unsigned_cmp = unsigned_tunable_type (cur->type.type_code);
+
+  val = valp->numval;
+  min = minp != NULL ? *minp : cur->type.min;
+  max = maxp != NULL ? *maxp : cur->type.max;
+
+  /* We allow only increasingly restrictive bounds.  */
+  if (tunable_val_lt (min, cur->type.min, unsigned_cmp))
+    min = cur->type.min;
+
+  if (tunable_val_gt (max, cur->type.max, unsigned_cmp))
+    max = cur->type.max;
+
+  /* Skip both bounds if they're inconsistent.  */
+  if (tunable_val_gt (min, max, unsigned_cmp))
     {
-    case TUNABLE_TYPE_INT_32:
-	{
-	  TUNABLE_SET_BOUNDS_IF_VALID (cur, minp, maxp, int64_t);
-	  TUNABLE_SET_VAL_IF_VALID_RANGE (cur, val, int64_t);
-	  break;
-	}
-    case TUNABLE_TYPE_UINT_64:
-	{
-	  TUNABLE_SET_BOUNDS_IF_VALID (cur, minp, maxp, uint64_t);
-	  TUNABLE_SET_VAL_IF_VALID_RANGE (cur, val, uint64_t);
-	  break;
-	}
-    case TUNABLE_TYPE_SIZE_T:
-	{
-	  TUNABLE_SET_BOUNDS_IF_VALID (cur, minp, maxp, uint64_t);
-	  TUNABLE_SET_VAL_IF_VALID_RANGE (cur, val, uint64_t);
-	  break;
-	}
-    case TUNABLE_TYPE_STRING:
-	{
-	  cur->val.strval = valp;
-	  break;
-	}
-    default:
-      __builtin_unreachable ();
+      min = cur->type.min;
+      max = cur->type.max;
     }
+
+  /* Bail out if the bounds are not valid.  */
+  if (tunable_val_lt (val, min, unsigned_cmp)
+      || tunable_val_lt (max, val, unsigned_cmp))
+    return;
+
+  cur->val.numval = val;
+  cur->type.min = min;
+  cur->type.max = max;
+  cur->initialized = true;
 }
 
 /* Validate range of the input value and initialize the tunable CUR if it looks
@@ -182,24 +143,18 @@ do_tunable_update_val (tunable_t *cur, const void *valp,
 static void
 tunable_initialize (tunable_t *cur, const char *strval)
 {
-  uint64_t val;
-  const void *valp;
+  tunable_val_t val;
 
   if (cur->type.type_code != TUNABLE_TYPE_STRING)
-    {
-      val = _dl_strtoul (strval, NULL);
-      valp = &val;
-    }
+    val.numval = (tunable_num_t) _dl_strtoul (strval, NULL);
   else
-    {
-      cur->initialized = true;
-      valp = strval;
-    }
-  do_tunable_update_val (cur, valp, NULL, NULL);
+    val.strval = strval;
+  do_tunable_update_val (cur, &val, NULL, NULL);
 }
 
 void
-__tunable_set_val (tunable_id_t id, void *valp, void *minp, void *maxp)
+__tunable_set_val (tunable_id_t id, tunable_val_t *valp, tunable_num_t *minp,
+		   tunable_num_t *maxp)
 {
   tunable_t *cur = &tunable_list[id];
 
@@ -219,6 +174,7 @@ parse_tunables (char *tunestr, char *valstring)
     return;
 
   char *p = tunestr;
+  size_t off = 0;
 
   while (true)
     {
@@ -232,7 +188,11 @@ parse_tunables (char *tunestr, char *valstring)
       /* If we reach the end of the string before getting a valid name-value
 	 pair, bail out.  */
       if (p[len] == '\0')
-	return;
+	{
+	  if (__libc_enable_secure)
+	    tunestr[off] = '\0';
+	  return;
+	}
 
       /* We did not find a valid name-value pair before encountering the
 	 colon.  */
@@ -258,35 +218,28 @@ parse_tunables (char *tunestr, char *valstring)
 
 	  if (tunable_is_name (cur->name, name))
 	    {
-	      /* If we are in a secure context (AT_SECURE) then ignore the tunable
-		 unless it is explicitly marked as secure.  Tunable values take
-		 precedence over their envvar aliases.  */
+	      /* If we are in a secure context (AT_SECURE) then ignore the
+		 tunable unless it is explicitly marked as secure.  Tunable
+		 values take precedence over their envvar aliases.  We write
+		 the tunables that are not SXID_ERASE back to TUNESTR, thus
+		 dropping all SXID_ERASE tunables and any invalid or
+		 unrecognized tunables.  */
 	      if (__libc_enable_secure)
 		{
-		  if (cur->security_level == TUNABLE_SECLEVEL_SXID_ERASE)
+		  if (cur->security_level != TUNABLE_SECLEVEL_SXID_ERASE)
 		    {
-		      if (p[len] == '\0')
-			{
-			  /* Last tunable in the valstring.  Null-terminate and
-			     return.  */
-			  *name = '\0';
-			  return;
-			}
-		      else
-			{
-			  /* Remove the current tunable from the string.  We do
-			     this by overwriting the string starting from NAME
-			     (which is where the current tunable begins) with
-			     the remainder of the string.  We then have P point
-			     to NAME so that we continue in the correct
-			     position in the valstring.  */
-			  char *q = &p[len + 1];
-			  p = name;
-			  while (*q != '\0')
-			    *name++ = *q++;
-			  name[0] = '\0';
-			  len = 0;
-			}
+		      if (off > 0)
+			tunestr[off++] = ':';
+
+		      const char *n = cur->name;
+
+		      while (*n != '\0')
+			tunestr[off++] = *n++;
+
+		      tunestr[off++] = '=';
+
+		      for (size_t j = 0; j < len; j++)
+			tunestr[off++] = value[j];
 		    }
 
 		  if (cur->security_level != TUNABLE_SECLEVEL_NONE)
@@ -299,9 +252,7 @@ parse_tunables (char *tunestr, char *valstring)
 	    }
 	}
 
-      if (p[len] == '\0')
-	return;
-      else
+      if (p[len] != '\0')
 	p += len + 1;
     }
 }
diff --git a/elf/dl-tunables.h b/elf/dl-tunables.h
index 971376ba8..3880e4aab 100644
--- a/elf/dl-tunables.h
+++ b/elf/dl-tunables.h
@@ -33,9 +33,11 @@ __tunables_init (char **unused __attribute__ ((unused)))
 # include <stddef.h>
 # include <stdint.h>
 
+typedef intmax_t tunable_num_t;
+
 typedef union
 {
-  int64_t numval;
+  tunable_num_t numval;
   const char *strval;
 } tunable_val_t;
 
@@ -52,7 +54,8 @@ typedef void (*tunable_callback_t) (tunable_val_t *);
 extern void __tunables_init (char **);
 extern void __tunables_print (void);
 extern void __tunable_get_val (tunable_id_t, void *, tunable_callback_t);
-extern void __tunable_set_val (tunable_id_t, void *, void *, void *);
+extern void __tunable_set_val (tunable_id_t, tunable_val_t *, tunable_num_t *,
+			       tunable_num_t *);
 rtld_hidden_proto (__tunables_init)
 rtld_hidden_proto (__tunables_print)
 rtld_hidden_proto (__tunable_get_val)
@@ -64,20 +67,18 @@ rtld_hidden_proto (__tunable_set_val)
 #if defined TOP_NAMESPACE && defined TUNABLE_NAMESPACE
 # define TUNABLE_GET(__id, __type, __cb) \
   TUNABLE_GET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __cb)
-# define TUNABLE_SET(__id, __type, __val) \
-  TUNABLE_SET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __type, __val)
-# define TUNABLE_SET_WITH_BOUNDS(__id, __type, __val, __min, __max) \
+# define TUNABLE_SET(__id, __val) \
+  TUNABLE_SET_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, __val)
+# define TUNABLE_SET_WITH_BOUNDS(__id, __val, __min, __max) \
   TUNABLE_SET_WITH_BOUNDS_FULL (TOP_NAMESPACE, TUNABLE_NAMESPACE, __id, \
-				__type, __val, __min, __max)
+				__val, __min, __max)
 #else
 # define TUNABLE_GET(__top, __ns, __id, __type, __cb) \
   TUNABLE_GET_FULL (__top, __ns, __id, __type, __cb)
-# define TUNABLE_SET(__top, __ns, __id, __type, __val) \
-  TUNABLE_SET_FULL (__top, __ns, __id, __type, __val)
-# define TUNABLE_SET_WITH_BOUNDS(__top, __ns, __id, __type, __val, \
-				 __min, __max) \
-  TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __type, __val, \
-				__min, __max)
+# define TUNABLE_SET(__top, __ns, __id, __val) \
+  TUNABLE_SET_FULL (__top, __ns, __id, __val)
+# define TUNABLE_SET_WITH_BOUNDS(__top, __ns, __id, __val, __min, __max) \
+  TUNABLE_SET_WITH_BOUNDS_FULL (__top, __ns, __id, __val, __min, __max)
 #endif
 
 /* Get and return a tunable value.  If the tunable was set externally and __CB
@@ -91,19 +92,19 @@ rtld_hidden_proto (__tunable_set_val)
 })
 
 /* Set a tunable value.  */
-# define TUNABLE_SET_FULL(__top, __ns, __id, __type, __val) \
+# define TUNABLE_SET_FULL(__top, __ns, __id, __val) \
 ({									      \
   __tunable_set_val (TUNABLE_ENUM_NAME (__top, __ns, __id),		      \
-		     & (__type) {__val}, NULL, NULL);			      \
+		     & (tunable_val_t) {.numval = __val}, NULL, NULL);	      \
 })
 
 /* Set a tunable value together with min/max values.  */
-# define TUNABLE_SET_WITH_BOUNDS_FULL(__top, __ns, __id, __type, __val,	      \
-				      __min, __max)			      \
+# define TUNABLE_SET_WITH_BOUNDS_FULL(__top, __ns, __id,__val, __min, __max)  \
 ({									      \
   __tunable_set_val (TUNABLE_ENUM_NAME (__top, __ns, __id),		      \
-		     & (__type) {__val},  & (__type) {__min},		      \
-		     & (__type) {__max});				      \
+		     & (tunable_val_t) {.numval = __val},		      \
+		     & (tunable_num_t) {__min},				      \
+		     & (tunable_num_t) {__max});			      \
 })
 
 /* Namespace sanity for callback functions.  Use this macro to keep the
@@ -114,6 +115,24 @@ rtld_hidden_proto (__tunable_set_val)
 /* The default value for TUNABLES_FRONTEND.  */
 # define TUNABLES_FRONTEND_yes TUNABLES_FRONTEND_valstring
 
+static __always_inline bool
+tunable_val_lt (tunable_num_t lhs, tunable_num_t rhs, bool unsigned_cmp)
+{
+  if (unsigned_cmp)
+    return (uintmax_t) lhs < (uintmax_t) rhs;
+  else
+    return lhs < rhs;
+}
+
+static __always_inline bool
+tunable_val_gt (tunable_num_t lhs, tunable_num_t rhs, bool unsigned_cmp)
+{
+  if (unsigned_cmp)
+    return (uintmax_t) lhs > (uintmax_t) rhs;
+  else
+    return lhs > rhs;
+}
+
 /* Compare two name strings, bounded by the name hardcoded in glibc.  */
 static __always_inline bool
 tunable_is_name (const char *orig, const char *envname)
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index 3cf0ad83e..8ddd4a231 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -64,6 +64,7 @@ glibc {
       type: INT_32
       env_alias: MALLOC_MMAP_MAX_
       security_level: SXID_IGNORE
+      minval: 0
     }
     arena_max {
       type: SIZE_T
@@ -109,22 +110,27 @@ glibc {
     skip_lock_busy {
       type: INT_32
       default: 3
+      minval: 0
     }
     skip_lock_internal_abort {
       type: INT_32
       default: 3
+      minval: 0
     }
     skip_lock_after_retries {
       type: INT_32
       default: 3
+      minval: 0
     }
     tries {
       type: INT_32
       default: 3
+      minval: 0
     }
     skip_trylock_internal_abort {
       type: INT_32
       default: 3
+      minval: 0
     }
   }
 
diff --git a/elf/dl-usage.c b/elf/dl-usage.c
index 6e26818bd..5ad3a7255 100644
--- a/elf/dl-usage.c
+++ b/elf/dl-usage.c
@@ -261,6 +261,7 @@ setting environment variables (which would be inherited by subprocesses).\n\
   --list-tunables       list all tunables with minimum and maximum values\n"
 #endif
 "\
+  --list-diagnostics    list diagnostics information\n\
   --help                display this help and exit\n\
   --version             output version information and exit\n\
 \n\
diff --git a/elf/rtld.c b/elf/rtld.c
index 596b6ac3d..489e58c55 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -141,6 +141,7 @@ static void dl_main_state_init (struct dl_main_state *state);
 /* Process all environments variables the dynamic linker must recognize.
    Since all of them start with `LD_' we are a bit smarter while finding
    all the entries.  */
+extern char **_environ attribute_hidden;
 static void process_envvars (struct dl_main_state *state);
 
 #ifdef DL_ARGV_NOT_RELRO
@@ -379,7 +380,6 @@ struct rtld_global_ro _rtld_global_ro attribute_relro =
 extern struct rtld_global_ro _rtld_local_ro
     __attribute__ ((alias ("_rtld_global_ro"), visibility ("hidden")));
 
-
 static void dl_main (const ElfW(Phdr) *phdr, ElfW(Word) phnum,
 		     ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv);
 
@@ -1287,6 +1287,14 @@ dl_main (const ElfW(Phdr) *phdr,
 	    ++_dl_argv;
 	  }
 #endif
+	else if (! strcmp (_dl_argv[1], "--list-diagnostics"))
+	  {
+	    state.mode = rtld_mode_list_diagnostics;
+
+	    ++_dl_skip_args;
+	    --_dl_argc;
+	    ++_dl_argv;
+	  }
 	else if (strcmp (_dl_argv[1], "--help") == 0)
 	  {
 	    state.mode = rtld_mode_help;
@@ -1315,6 +1323,9 @@ dl_main (const ElfW(Phdr) *phdr,
 	}
 #endif
 
+      if (state.mode == rtld_mode_list_diagnostics)
+	_dl_print_diagnostics (_environ);
+
       /* If we have no further argument the program was called incorrectly.
 	 Grant the user some education.  */
       if (_dl_argc < 2)
@@ -2649,12 +2660,6 @@ a filename can be specified using the LD_DEBUG_OUTPUT environment variable.\n");
     }
 }
 
-/* Process all environments variables the dynamic linker must recognize.
-   Since all of them start with `LD_' we are a bit smarter while finding
-   all the entries.  */
-extern char **_environ attribute_hidden;
-
-
 static void
 process_envvars (struct dl_main_state *state)
 {
diff --git a/elf/tst-dlmopen-dlerror-mod.c b/elf/tst-dlmopen-dlerror-mod.c
new file mode 100644
index 000000000..7e95dcdea
--- /dev/null
+++ b/elf/tst-dlmopen-dlerror-mod.c
@@ -0,0 +1,41 @@
+/* Check that dlfcn errors are reported properly after dlmopen.  Test module.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <stddef.h>
+#include <support/check.h>
+
+/* Note: This object is not linked into the main program, so we cannot
+   use delayed test failure reporting via TEST_VERIFY etc., and have
+   to use FAIL_EXIT1 (or something else that calls exit).  */
+
+void
+call_dlsym (void)
+{
+  void *ptr = dlsym (NULL, "does not exist");
+  if (ptr != NULL)
+    FAIL_EXIT1 ("dlsym did not fail as expected");
+}
+
+void
+call_dlopen (void)
+{
+  void *handle = dlopen ("tst-dlmopen-dlerror does not exist", RTLD_NOW);
+  if (handle != NULL)
+    FAIL_EXIT1 ("dlopen did not fail as expected");
+}
diff --git a/elf/tst-dlmopen-dlerror.c b/elf/tst-dlmopen-dlerror.c
new file mode 100644
index 000000000..e864d2fe4
--- /dev/null
+++ b/elf/tst-dlmopen-dlerror.c
@@ -0,0 +1,37 @@
+/* Check that dlfcn errors are reported properly after dlmopen.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+  void *handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-dlerror-mod.so",
+                           RTLD_NOW);
+  void (*call_dlsym) (void) = xdlsym (handle, "call_dlsym");
+  void (*call_dlopen) (void) = xdlsym (handle, "call_dlopen");
+
+  call_dlsym ();
+  call_dlopen ();
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-dlmopen-gethostbyname-mod.c b/elf/tst-dlmopen-gethostbyname-mod.c
new file mode 100644
index 000000000..9a68ea505
--- /dev/null
+++ b/elf/tst-dlmopen-gethostbyname-mod.c
@@ -0,0 +1,29 @@
+/* Exercise dlerror_run in elf/dl-libc.c after dlmopen, via NSS.  Helper module.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <netdb.h>
+#include <nss.h>
+
+void
+call_gethostbyname (void)
+{
+  __nss_configure_lookup ("hosts", "files");
+  /* This should not terminate the process due to a missing
+     _nss_files_getcanonname_r symbol.  */
+  gethostbyname ("localhost");
+}
diff --git a/elf/tst-dlmopen-gethostbyname.c b/elf/tst-dlmopen-gethostbyname.c
new file mode 100644
index 000000000..12deb2990
--- /dev/null
+++ b/elf/tst-dlmopen-gethostbyname.c
@@ -0,0 +1,31 @@
+/* Exercise dlerror_run in elf/dl-libc.c after dlmopen, via NSS (bug 27646).
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+  void *handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-gethostbyname-mod.so",
+                           RTLD_NOW);
+  void (*call_gethostbyname) (void) = xdlsym (handle, "call_gethostbyname");
+  call_gethostbyname ();
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-dst-static.c b/elf/tst-dst-static.c
new file mode 100644
index 000000000..56eb371c9
--- /dev/null
+++ b/elf/tst-dst-static.c
@@ -0,0 +1,32 @@
+/* Test DST expansion for static binaries doesn't carsh.  Bug 23462.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* The purpose of this test is to exercise the code in elf/dl-loac.c
+   (_dl_init_paths) or thereabout and ensure that static binaries
+   don't crash when expanding DSTs.
+
+   If the dynamic loader code linked into the static binary cannot
+   handle expanding the DSTs e.g. null-deref on an incomplete link
+   map, then it will crash before reaching main, so the test harness
+   is unnecessary.  */
+
+int
+main (void)
+{
+  return 0;
+}
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
index 50bef8683..05619c9ad 100644
--- a/elf/tst-env-setuid-tunables.c
+++ b/elf/tst-env-setuid-tunables.c
@@ -25,35 +25,76 @@
 #include "config.h"
 #undef _LIBC
 
-#define test_parent test_parent_tunables
-#define test_child test_child_tunables
-
-static int test_child_tunables (void);
-static int test_parent_tunables (void);
-
-#include "tst-env-setuid.c"
-
-#define CHILD_VALSTRING_VALUE "glibc.malloc.mmap_threshold=4096"
-#define PARENT_VALSTRING_VALUE \
-  "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096"
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <intprops.h>
+#include <array_length.h>
+
+#include <support/check.h>
+#include <support/support.h>
+#include <support/test-driver.h>
+#include <support/capture_subprocess.h>
+
+const char *teststrings[] =
+{
+  "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.check=2:glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.check=2:glibc.malloc.mmap_threshold=4096:glibc.malloc.check=2",
+  "glibc.malloc.perturb=0x800",
+  "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.perturb=0x800:not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+  "glibc.not_valid.check=2:glibc.malloc.mmap_threshold=4096",
+  "not_valid.malloc.check=2:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096:glibc.malloc.check=2",
+  "glibc.malloc.check=4:glibc.malloc.garbage=2:glibc.maoc.mmap_threshold=4096",
+  ":glibc.malloc.garbage=2:glibc.malloc.check=1",
+  "glibc.malloc.check=1:glibc.malloc.check=2",
+  "not_valid.malloc.check=2",
+  "glibc.not_valid.check=2",
+};
+
+const char *resultstrings[] =
+{
+  "glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.perturb=0x800",
+  "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.perturb=0x800:glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.mmap_threshold=4096",
+  "glibc.malloc.mmap_threshold=4096",
+  "",
+  "",
+  "",
+  "",
+  "",
+  "",
+};
 
 static int
-test_child_tunables (void)
+test_child (int off)
 {
   const char *val = getenv ("GLIBC_TUNABLES");
 
 #if HAVE_TUNABLES
-  if (val != NULL && strcmp (val, CHILD_VALSTRING_VALUE) == 0)
+  if (val != NULL && strcmp (val, resultstrings[off]) == 0)
     return 0;
 
   if (val != NULL)
-    printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val);
+    printf ("[%d] Unexpected GLIBC_TUNABLES VALUE %s\n", off, val);
 
   return 1;
 #else
   if (val != NULL)
     {
-      printf ("GLIBC_TUNABLES not cleared\n");
+      printf ("[%d] GLIBC_TUNABLES not cleared\n", off);
       return 1;
     }
   return 0;
@@ -61,15 +102,48 @@ test_child_tunables (void)
 }
 
 static int
-test_parent_tunables (void)
+do_test (int argc, char **argv)
 {
-  const char *val = getenv ("GLIBC_TUNABLES");
+  /* Setgid child process.  */
+  if (argc == 2)
+    {
+      if (getgid () == getegid ())
+	/* This can happen if the file system is mounted nosuid.  */
+	FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+			  (intmax_t) getgid ());
 
-  if (val != NULL && strcmp (val, PARENT_VALSTRING_VALUE) == 0)
-    return 0;
+      int ret = test_child (atoi (argv[1]));
 
-  if (val != NULL)
-    printf ("Unexpected GLIBC_TUNABLES VALUE %s\n", val);
+      if (ret != 0)
+	exit (1);
 
-  return 1;
+      exit (EXIT_SUCCESS);
+    }
+  else
+    {
+      int ret = 0;
+
+      /* Spawn tests.  */
+      for (int i = 0; i < array_length (teststrings); i++)
+	{
+	  char buf[INT_BUFSIZE_BOUND (int)];
+
+	  printf ("Spawned test for %s (%d)\n", teststrings[i], i);
+	  snprintf (buf, sizeof (buf), "%d\n", i);
+	  if (setenv ("GLIBC_TUNABLES", teststrings[i], 1) != 0)
+	    exit (1);
+
+	  int status = support_capture_subprogram_self_sgid (buf);
+
+	  /* Bail out early if unsupported.  */
+	  if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+	    return EXIT_UNSUPPORTED;
+
+	  ret |= status;
+	}
+      return ret;
+    }
 }
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
index 60ae0ca38..49b5e319e 100644
--- a/elf/tst-env-setuid.c
+++ b/elf/tst-env-setuid.c
@@ -29,173 +29,12 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#include <support/check.h>
 #include <support/support.h>
 #include <support/test-driver.h>
+#include <support/capture_subprocess.h>
 
 static char SETGID_CHILD[] = "setgid-child";
-#define CHILD_STATUS 42
-
-/* Return a GID which is not our current GID, but is present in the
-   supplementary group list.  */
-static gid_t
-choose_gid (void)
-{
-  const int count = 64;
-  gid_t groups[count];
-  int ret = getgroups (count, groups);
-  if (ret < 0)
-    {
-      printf ("getgroups: %m\n");
-      exit (1);
-    }
-  gid_t current = getgid ();
-  for (int i = 0; i < ret; ++i)
-    {
-      if (groups[i] != current)
-	return groups[i];
-    }
-  return 0;
-}
-
-/* Spawn and execute a program and verify that it returns the CHILD_STATUS.  */
-static pid_t
-do_execve (char **args)
-{
-  pid_t kid = vfork ();
-
-  if (kid < 0)
-    {
-      printf ("vfork: %m\n");
-      return -1;
-    }
-
-  if (kid == 0)
-    {
-      /* Child process.  */
-      execve (args[0], args, environ);
-      _exit (-errno);
-    }
-
-  if (kid < 0)
-    return 1;
-
-  int status;
-
-  if (waitpid (kid, &status, 0) < 0)
-    {
-      printf ("waitpid: %m\n");
-      return 1;
-    }
-
-  if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
-    return EXIT_UNSUPPORTED;
-
-  if (!WIFEXITED (status) || WEXITSTATUS (status) != CHILD_STATUS)
-    {
-      printf ("Unexpected exit status %d from child process\n",
-	      WEXITSTATUS (status));
-      return 1;
-    }
-  return 0;
-}
-
-/* Copies the executable into a restricted directory, so that we can
-   safely make it SGID with the TARGET group ID.  Then runs the
-   executable.  */
-static int
-run_executable_sgid (gid_t target)
-{
-  char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
-			     test_dir, (intmax_t) getpid ());
-  char *execname = xasprintf ("%s/bin", dirname);
-  int infd = -1;
-  int outfd = -1;
-  int ret = 0;
-  if (mkdir (dirname, 0700) < 0)
-    {
-      printf ("mkdir: %m\n");
-      goto err;
-    }
-  infd = open ("/proc/self/exe", O_RDONLY);
-  if (infd < 0)
-    {
-      printf ("open (/proc/self/exe): %m\n");
-      goto err;
-    }
-  outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
-  if (outfd < 0)
-    {
-      printf ("open (%s): %m\n", execname);
-      goto err;
-    }
-  char buf[4096];
-  for (;;)
-    {
-      ssize_t rdcount = read (infd, buf, sizeof (buf));
-      if (rdcount < 0)
-	{
-	  printf ("read: %m\n");
-	  goto err;
-	}
-      if (rdcount == 0)
-	break;
-      char *p = buf;
-      char *end = buf + rdcount;
-      while (p != end)
-	{
-	  ssize_t wrcount = write (outfd, buf, end - p);
-	  if (wrcount == 0)
-	    errno = ENOSPC;
-	  if (wrcount <= 0)
-	    {
-	      printf ("write: %m\n");
-	      goto err;
-	    }
-	  p += wrcount;
-	}
-    }
-  if (fchown (outfd, getuid (), target) < 0)
-    {
-      printf ("fchown (%s): %m\n", execname);
-      goto err;
-    }
-  if (fchmod (outfd, 02750) < 0)
-    {
-      printf ("fchmod (%s): %m\n", execname);
-      goto err;
-    }
-  if (close (outfd) < 0)
-    {
-      printf ("close (outfd): %m\n");
-      goto err;
-    }
-  if (close (infd) < 0)
-    {
-      printf ("close (infd): %m\n");
-      goto err;
-    }
-
-  char *args[] = {execname, SETGID_CHILD, NULL};
-
-  ret = do_execve (args);
-
-err:
-  if (outfd >= 0)
-    close (outfd);
-  if (infd >= 0)
-    close (infd);
-  if (execname)
-    {
-      unlink (execname);
-      free (execname);
-    }
-  if (dirname)
-    {
-      rmdir (dirname);
-      free (dirname);
-    }
-  return ret;
-}
 
 #ifndef test_child
 static int
@@ -256,40 +95,32 @@ do_test (int argc, char **argv)
   if (argc == 2 && strcmp (argv[1], SETGID_CHILD) == 0)
     {
       if (getgid () == getegid ())
-	{
-	  /* This can happen if the file system is mounted nosuid.  */
-	  fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n",
-		   (intmax_t) getgid ());
-	  exit (EXIT_UNSUPPORTED);
-	}
+	/* This can happen if the file system is mounted nosuid.  */
+	FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+			  (intmax_t) getgid ());
 
       int ret = test_child ();
 
       if (ret != 0)
 	exit (1);
 
-      exit (CHILD_STATUS);
+      exit (EXIT_SUCCESS);
     }
   else
     {
       if (test_parent () != 0)
 	exit (1);
 
-      /* Try running a setgid program.  */
-      gid_t target = choose_gid ();
-      if (target == 0)
-	{
-	  fprintf (stderr,
-		   "Could not find a suitable GID for user %jd, skipping test\n",
-		   (intmax_t) getuid ());
-	  exit (0);
-	}
+      int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
 
-      return run_executable_sgid (target);
-    }
+      if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+	return EXIT_UNSUPPORTED;
+
+      if (!WIFEXITED (status))
+	FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
 
-  /* Something went wrong and our argv was corrupted.  */
-  _exit (1);
+      return 0;
+    }
 }
 
 #define TEST_FUNCTION_ARGV do_test
diff --git a/elf/tst-glibc-hwcaps-cache.script b/elf/tst-glibc-hwcaps-cache.script
index c3271f61f..d58fc8c5d 100644
--- a/elf/tst-glibc-hwcaps-cache.script
+++ b/elf/tst-glibc-hwcaps-cache.script
@@ -4,6 +4,7 @@
 cp $B/elf/libmarkermod2-1.so $L/libmarkermod2.so
 cp $B/elf/libmarkermod3-1.so $L/libmarkermod3.so
 cp $B/elf/libmarkermod4-1.so $L/libmarkermod4.so
+cp $B/elf/libmarkermod5-1.so $L/libmarkermod5.so
 
 mkdirp 0770 $L/glibc-hwcaps/power9
 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/power9/libmarkermod2.so
@@ -20,6 +21,11 @@ mkdirp 0770 $L/glibc-hwcaps/z15
 cp $B/elf/libmarkermod4-2.so $L/glibc-hwcaps/z13/libmarkermod4.so
 cp $B/elf/libmarkermod4-3.so $L/glibc-hwcaps/z14/libmarkermod4.so
 cp $B/elf/libmarkermod4-4.so $L/glibc-hwcaps/z15/libmarkermod4.so
+mkdirp 0770 $L/glibc-hwcaps/z16
+cp $B/elf/libmarkermod5-2.so $L/glibc-hwcaps/z13/libmarkermod5.so
+cp $B/elf/libmarkermod5-3.so $L/glibc-hwcaps/z14/libmarkermod5.so
+cp $B/elf/libmarkermod5-4.so $L/glibc-hwcaps/z15/libmarkermod5.so
+cp $B/elf/libmarkermod5-5.so $L/glibc-hwcaps/z16/libmarkermod5.so
 
 mkdirp 0770 $L/glibc-hwcaps/x86-64-v2
 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/x86-64-v2/libmarkermod2.so
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
index 4f3f7ee4e..9f66c5288 100644
--- a/elf/tst-rtld-list-tunables.exp
+++ b/elf/tst-rtld-list-tunables.exp
@@ -1,7 +1,7 @@
 glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+)
 glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+)
 glibc.malloc.check: 0 (min: 0, max: 3)
-glibc.malloc.mmap_max: 0 (min: -2147483648, max: 2147483647)
+glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647)
 glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+)
 glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+)
 glibc.malloc.perturb: 0 (min: 0, max: 255)
diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index 55c527a5f..2612d2557 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -1,4 +1,5 @@
 # Copyright (C) 1997-2021 Free Software Foundation, Inc.
+# Copyright (C) The GNU Toolchain Authors.
 # This file is part of the GNU C Library.
 
 # The GNU C Library is free software; you can redistribute it and/or
@@ -74,7 +75,7 @@ ifeq (yes,$(build-shared))
 tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
 	tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
 	bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
-	bug-iconv13 bug-iconv14
+	bug-iconv13 bug-iconv14 bug-iconv15
 ifeq ($(have-thread-library),yes)
 tests += bug-iconv3
 endif
@@ -324,6 +325,8 @@ $(objpfx)bug-iconv12.out: $(objpfx)gconv-modules \
 			  $(addprefix $(objpfx),$(modules.so))
 $(objpfx)bug-iconv14.out: $(objpfx)gconv-modules \
 			  $(addprefix $(objpfx),$(modules.so))
+$(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
+			  $(addprefix $(objpfx),$(modules.so))
 
 $(objpfx)iconv-test.out: run-iconv-test.sh $(objpfx)gconv-modules \
 			 $(addprefix $(objpfx),$(modules.so)) \
diff --git a/iconvdata/bug-iconv15.c b/iconvdata/bug-iconv15.c
new file mode 100644
index 000000000..cc04bd031
--- /dev/null
+++ b/iconvdata/bug-iconv15.c
@@ -0,0 +1,60 @@
+/* Bug 28524: Conversion from ISO-2022-JP-3 with iconv
+   may emit spurious NUL character on state reset.
+   Copyright (C) The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+#include <iconv.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  char in[] = "\x1b(I";
+  char *inbuf = in;
+  size_t inleft = sizeof (in) - 1;
+  char out[1];
+  char *outbuf = out;
+  size_t outleft = sizeof (out);
+  iconv_t cd;
+
+  cd = iconv_open ("UTF8", "ISO-2022-JP-3");
+  TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+  /* First call to iconv should alter internal state.
+     Now, JISX0201_Kana_set is selected and
+     state value != ASCII_set.  */
+  TEST_VERIFY (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1);
+
+  /* No bytes should have been added to
+     the output buffer at this point.  */
+  TEST_VERIFY (outbuf == out);
+  TEST_VERIFY (outleft == sizeof (out));
+
+  /* Second call shall emit spurious NUL character in unpatched glibc.  */
+  TEST_VERIFY (iconv (cd, NULL, NULL, &outbuf, &outleft) != (size_t) -1);
+
+  /* No characters are expected to be produced.  */
+  TEST_VERIFY (outbuf == out);
+  TEST_VERIFY (outleft == sizeof (out));
+
+  TEST_VERIFY_EXIT (iconv_close (cd) != -1);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/iconvdata/iso-2022-jp-3.c b/iconvdata/iso-2022-jp-3.c
index c8ba88cdc..5fc0c0f73 100644
--- a/iconvdata/iso-2022-jp-3.c
+++ b/iconvdata/iso-2022-jp-3.c
@@ -1,5 +1,6 @@
 /* Conversion module for ISO-2022-JP-3.
    Copyright (C) 1998-2021 Free Software Foundation, Inc.
+   Copyright (C) The GNU Toolchain Authors.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998,
    and Bruno Haible <bruno@clisp.org>, 2002.
@@ -81,20 +82,31 @@ enum
    the output state to the initial state.  This has to be done during the
    flushing.  */
 #define EMIT_SHIFT_TO_INIT \
-  if (data->__statep->__count != ASCII_set)			      \
+  if ((data->__statep->__count & ~7) != ASCII_set)			      \
     {									      \
       if (FROM_DIRECTION)						      \
 	{								      \
-	  if (__glibc_likely (outbuf + 4 <= outend))			      \
+	  uint32_t ch = data->__statep->__count >> 6;			      \
+									      \
+	  if (__glibc_unlikely (ch != 0))				      \
 	    {								      \
-	      /* Write out the last character.  */			      \
-	      *((uint32_t *) outbuf) = data->__statep->__count >> 6;	      \
-	      outbuf += sizeof (uint32_t);				      \
-	      data->__statep->__count = ASCII_set;			\
+	      if (__glibc_likely (outbuf + 4 <= outend))		      \
+		{							      \
+		  /* Write out the last character.  */			      \
+		  put32u (outbuf, ch);					      \
+		  outbuf += 4;						      \
+		  data->__statep->__count &= 7;				      \
+		  data->__statep->__count |= ASCII_set;			      \
+		}							      \
+	      else							      \
+		/* We don't have enough room in the output buffer.  */	      \
+		status = __GCONV_FULL_OUTPUT;				      \
 	    }								      \
 	  else								      \
-	    /* We don't have enough room in the output buffer.  */	      \
-	    status = __GCONV_FULL_OUTPUT;				      \
+	    {								      \
+	      data->__statep->__count &= 7;				      \
+	      data->__statep->__count |= ASCII_set;			      \
+	    }								      \
 	}								      \
       else								      \
 	{								      \
diff --git a/include/libc-symbols.h b/include/libc-symbols.h
index ea126ae70..c83e550b0 100644
--- a/include/libc-symbols.h
+++ b/include/libc-symbols.h
@@ -59,6 +59,19 @@
 # define IN_MODULE (-1)
 #endif
 
+/* Use symbol_version_reference to specify the version a symbol
+   reference should link to.  Use symbol_version or
+   default_symbol_version for the definition of a versioned symbol.
+   The difference is that the latter is a no-op in non-shared
+   builds.  */
+#ifdef __ASSEMBLER__
+# define symbol_version_reference(real, name, version) \
+     .symver real, name##@##version
+#else  /* !__ASSEMBLER__ */
+# define symbol_version_reference(real, name, version) \
+  __asm__ (".symver " #real "," #name "@" #version)
+#endif
+
 #ifndef _ISOMAC
 
 /* This is defined for the compilation of all C library code.  features.h
@@ -396,19 +409,6 @@ for linking")
    past the last element in SET.  */
 #define symbol_set_end_p(set, ptr) ((ptr) >= (void *const *) &__stop_##set)
 
-/* Use symbol_version_reference to specify the version a symbol
-   reference should link to.  Use symbol_version or
-   default_symbol_version for the definition of a versioned symbol.
-   The difference is that the latter is a no-op in non-shared
-   builds.  */
-#ifdef __ASSEMBLER__
-# define symbol_version_reference(real, name, version) \
-     .symver real, name##@##version
-#else  /* !__ASSEMBLER__ */
-# define symbol_version_reference(real, name, version) \
-  __asm__ (".symver " #real "," #name "@" #version)
-#endif
-
 #ifdef SHARED
 # define symbol_version(real, name, version) \
   symbol_version_reference(real, name, version)
diff --git a/include/sys/un.h b/include/sys/un.h
index bdbee9998..152afd9fc 100644
--- a/include/sys/un.h
+++ b/include/sys/un.h
@@ -1 +1,13 @@
 #include <socket/sys/un.h>
+
+#ifndef _ISOMAC
+
+/* Set ADDR->sun_family to AF_UNIX and ADDR->sun_path to PATHNAME.
+   Return 0 on success or -1 on failure (due to overlong PATHNAME).
+   The caller should always use sizeof (struct sockaddr_un) as the
+   socket address length, disregaring the length of PATHNAME.
+   Only concrete (non-abstract) pathnames are supported.  */
+int __sockaddr_un_set (struct sockaddr_un *addr, const char *pathname)
+  attribute_hidden;
+
+#endif /* _ISOMAC */
diff --git a/include/time.h b/include/time.h
index caf2af5e7..e0636132a 100644
--- a/include/time.h
+++ b/include/time.h
@@ -502,6 +502,11 @@ time_now (void)
   __clock_gettime (TIME_CLOCK_GETTIME_CLOCKID, &ts);
   return ts.tv_sec;
 }
+
+#define NSEC_PER_SEC    1000000000L  /* Nanoseconds per second.  */
+#define USEC_PER_SEC    1000000L     /* Microseconds per second.  */
+#define NSEC_PER_USEC   1000L        /* Nanoseconds per microsecond.  */
+
 #endif
 
 #endif
diff --git a/io/Makefile b/io/Makefile
index b7bebe923..d145d88f4 100644
--- a/io/Makefile
+++ b/io/Makefile
@@ -68,7 +68,7 @@ tests		:= test-utime test-stat test-stat2 test-lfs tst-getcwd \
 		   tst-fts tst-fts-lfs tst-open-tmpfile \
 		   tst-copy_file_range tst-getcwd-abspath tst-lockf \
 		   tst-ftw-lnk tst-file_change_detection tst-lchmod \
-		   tst-ftw-bz26353
+		   tst-ftw-bz26353 tst-stat tst-stat-lfs
 
 # Likewise for statx, but we do not need static linking here.
 tests-internal += tst-statx
diff --git a/io/fstat.c b/io/fstat.c
index dc117361f..17f31bf3b 100644
--- a/io/fstat.c
+++ b/io/fstat.c
@@ -16,10 +16,16 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sys/stat.h>
+#include <errno.h>
 
 int
 __fstat (int fd, struct stat *buf)
 {
+  if (fd < 0)
+    {
+      __set_errno (EBADF);
+      return -1;
+    }
   return __fstatat (fd, "", buf, AT_EMPTY_PATH);
 }
 
diff --git a/io/fstat64.c b/io/fstat64.c
index addf37977..618170695 100644
--- a/io/fstat64.c
+++ b/io/fstat64.c
@@ -16,10 +16,16 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sys/stat.h>
+#include <errno.h>
 
 int
 __fstat64 (int fd, struct stat64 *buf)
 {
+  if (fd < 0)
+    {
+      __set_errno (EBADF);
+      return -1;
+    }
   return __fstatat64 (fd, "", buf, AT_EMPTY_PATH);
 }
 hidden_def (__fstat64)
diff --git a/io/tst-stat-lfs.c b/io/tst-stat-lfs.c
new file mode 100644
index 000000000..b53f460ad
--- /dev/null
+++ b/io/tst-stat-lfs.c
@@ -0,0 +1,2 @@
+#define _FILE_OFFSET_BITS 64
+#include "tst-stat.c"
diff --git a/io/tst-stat.c b/io/tst-stat.c
new file mode 100644
index 000000000..445ac4176
--- /dev/null
+++ b/io/tst-stat.c
@@ -0,0 +1,102 @@
+/* Basic tests for stat, lstat, fstat, and fstatat.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <array_length.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <support/xunistd.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+static void
+stat_check (int fd, const char *path, struct stat *st)
+{
+  TEST_COMPARE (stat (path, st), 0);
+}
+
+static void
+lstat_check (int fd, const char *path, struct stat *st)
+{
+  TEST_COMPARE (lstat (path, st), 0);
+}
+
+static void
+fstat_check (int fd, const char *path, struct stat *st)
+{
+  /* Test for invalid fstat input (BZ #27559).  */
+  TEST_COMPARE (fstat (AT_FDCWD, st), -1);
+  TEST_COMPARE (errno, EBADF);
+
+  TEST_COMPARE (fstat (fd, st), 0);
+}
+
+static void
+fstatat_check (int fd, const char *path, struct stat *st)
+{
+  TEST_COMPARE (fstatat (fd, "", st, 0), -1);
+  TEST_COMPARE (errno, ENOENT);
+
+  TEST_COMPARE (fstatat (fd, path, st, 0), 0);
+}
+
+typedef void (*test_t)(int, const char *path, struct stat *);
+
+static int
+do_test (void)
+{
+  char *path;
+  int fd = create_temp_file ("tst-fstat.", &path);
+  TEST_VERIFY_EXIT (fd >= 0);
+  support_write_file_string (path, "abc");
+
+  struct statx stx;
+  TEST_COMPARE (statx (fd, path, 0, STATX_BASIC_STATS, &stx), 0);
+
+  test_t tests[] = { stat_check, lstat_check, fstat_check, fstatat_check };
+
+  for (int i = 0; i < array_length (tests); i++)
+    {
+      struct stat st;
+      tests[i](fd, path, &st);
+
+      TEST_COMPARE (stx.stx_dev_major, major (st.st_dev));
+      TEST_COMPARE (stx.stx_dev_minor, minor (st.st_dev));
+      TEST_COMPARE (stx.stx_ino, st.st_ino);
+      TEST_COMPARE (stx.stx_mode, st.st_mode);
+      TEST_COMPARE (stx.stx_nlink, st.st_nlink);
+      TEST_COMPARE (stx.stx_uid, st.st_uid);
+      TEST_COMPARE (stx.stx_gid, st.st_gid);
+      TEST_COMPARE (stx.stx_rdev_major, major (st.st_rdev));
+      TEST_COMPARE (stx.stx_rdev_minor, minor (st.st_rdev));
+      TEST_COMPARE (stx.stx_blksize, st.st_blksize);
+      TEST_COMPARE (stx.stx_blocks, st.st_blocks);
+
+      TEST_COMPARE (stx.stx_ctime.tv_sec, st.st_ctim.tv_sec);
+      TEST_COMPARE (stx.stx_ctime.tv_nsec, st.st_ctim.tv_nsec);
+      TEST_COMPARE (stx.stx_mtime.tv_sec, st.st_mtim.tv_sec);
+      TEST_COMPARE (stx.stx_mtime.tv_nsec, st.st_mtim.tv_nsec);
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 1f4bbd8ed..8f8f12c27 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3446,7 +3446,9 @@ __libc_realloc (void *oldmem, size_t bytes)
       newp = __libc_malloc (bytes);
       if (newp != NULL)
         {
-          memcpy (newp, oldmem, oldsize - SIZE_SZ);
+	  size_t sz = CHUNK_AVAILABLE_SIZE (oldp) - CHUNK_HDR_SZ;
+	  memcpy (newp, oldmem, sz);
+	  (void) TAG_REGION (chunk2rawmem (oldp), sz);
           _int_free (ar_ptr, oldp, 0);
         }
     }
diff --git a/misc/Makefile b/misc/Makefile
index b08d7c68a..05ad034ba 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -88,7 +88,7 @@ tests := tst-dirname tst-tsearch tst-fdset tst-mntent tst-hsearch \
 	 tst-preadvwritev tst-preadvwritev64 tst-makedev tst-empty \
 	 tst-preadvwritev2 tst-preadvwritev64v2 tst-warn-wide \
 	 tst-ldbl-warn tst-ldbl-error tst-dbl-efgcvt tst-ldbl-efgcvt \
-	 tst-mntent-autofs tst-syscalls tst-mntent-escape
+	 tst-mntent-autofs tst-syscalls tst-mntent-escape tst-select
 
 # Tests which need libdl.
 ifeq (yes,$(build-shared))
diff --git a/misc/tst-select.c b/misc/tst-select.c
new file mode 100644
index 000000000..52aa26651
--- /dev/null
+++ b/misc/tst-select.c
@@ -0,0 +1,143 @@
+/* Test for select timeout.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <support/capture_subprocess.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/timespec.h>
+#include <support/xunistd.h>
+#include <support/xtime.h>
+#include <support/xsignal.h>
+
+struct child_args
+{
+  int fds[2][2];
+  struct timeval tmo;
+};
+
+static void
+alarm_handler (int signum)
+{
+  /* Do nothing.  */
+}
+
+static void
+do_test_child (void *clousure)
+{
+  struct child_args *args = (struct child_args *) clousure;
+
+  close (args->fds[0][1]);
+  close (args->fds[1][0]);
+
+  fd_set rfds;
+  FD_ZERO (&rfds);
+  FD_SET (args->fds[0][0], &rfds);
+
+  struct timespec ts = xclock_now (CLOCK_REALTIME);
+  ts = timespec_add (ts, (struct timespec) { args->tmo.tv_sec, 0 });
+
+  int r = select (args->fds[0][0] + 1, &rfds, NULL, NULL, &args->tmo);
+  TEST_COMPARE (r, 0);
+
+  if (support_select_modifies_timeout ())
+    {
+      TEST_COMPARE (args->tmo.tv_sec, 0);
+      TEST_COMPARE (args->tmo.tv_usec, 0);
+    }
+
+  TEST_TIMESPEC_NOW_OR_AFTER (CLOCK_REALTIME, ts);
+
+  xwrite (args->fds[1][1], "foo", 3);
+}
+
+static void
+do_test_child_alarm (void *clousure)
+{
+  struct sigaction act = { .sa_handler = alarm_handler };
+  xsigaction (SIGALRM, &act, NULL);
+  alarm (1);
+
+  struct timeval tv = { .tv_sec = 10, .tv_usec = 0 };
+  int r = select (0, NULL, NULL, NULL, &tv);
+  TEST_COMPARE (r, -1);
+  TEST_COMPARE (errno, EINTR);
+
+  if (support_select_modifies_timeout ())
+    TEST_VERIFY (tv.tv_sec < 10);
+}
+
+static int
+do_test (void)
+{
+  struct child_args args;
+
+  xpipe (args.fds[0]);
+  xpipe (args.fds[1]);
+
+  /* The child select should timeout and write on its pipe end.  */
+  args.tmo = (struct timeval) { .tv_sec = 0, .tv_usec = 250000 };
+  {
+    struct support_capture_subprocess result;
+    result = support_capture_subprocess (do_test_child, &args);
+    support_capture_subprocess_check (&result, "tst-select-child", 0,
+				      sc_allow_none);
+  }
+
+  if (support_select_normalizes_timeout ())
+    {
+      /* This is handled as 1 second instead of failing with EINVAL.  */
+      args.tmo = (struct timeval) { .tv_sec = 0, .tv_usec = 1000000 };
+      struct support_capture_subprocess result;
+      result = support_capture_subprocess (do_test_child, &args);
+      support_capture_subprocess_check (&result, "tst-select-child", 0,
+					sc_allow_none);
+    }
+
+  /* Same as before, but simulating polling.  */
+  args.tmo = (struct timeval) { .tv_sec = 0, .tv_usec = 0 };
+  {
+    struct support_capture_subprocess result;
+    result = support_capture_subprocess (do_test_child, &args);
+    support_capture_subprocess_check (&result, "tst-select-child", 0,
+				      sc_allow_none);
+  }
+
+  xclose (args.fds[0][0]);
+  xclose (args.fds[1][1]);
+
+  {
+    struct support_capture_subprocess result;
+    result = support_capture_subprocess (do_test_child_alarm, NULL);
+    support_capture_subprocess_check (&result, "tst-select-child", 0,
+				      sc_allow_none);
+  }
+
+  {
+    fd_set rfds;
+    FD_ZERO (&rfds);
+    FD_SET (args.fds[1][0], &rfds);
+
+    int r = select (args.fds[1][0] + 1, &rfds, NULL, NULL, &args.tmo);
+    TEST_COMPARE (r, 1);
+  }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/nptl/Makefile b/nptl/Makefile
index 0282e0739..a1a8ef254 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -294,7 +294,8 @@ tests = tst-attr2 tst-attr3 tst-default-attr \
 	tst-thread-affinity-sched \
 	tst-pthread-defaultattr-free \
 	tst-pthread-attr-sigmask \
-	tst-pthread-timedlock-lockloop
+	tst-pthread-timedlock-lockloop \
+	tst-pthread-gdb-attach tst-pthread-gdb-attach-static
 
 tests-container =  tst-pthread-getattr
 
@@ -314,10 +315,6 @@ xtests += tst-eintr1
 
 test-srcs = tst-oddstacklimit
 
-# Test expected to fail on most targets (except x86_64) due to bug
-# 18435 - pthread_once hangs when init routine throws an exception.
-test-xfail-tst-once5 = yes
-
 gen-as-const-headers = unwindbuf.sym \
 		       pthread-pi-defines.sym
 
@@ -344,6 +341,22 @@ CPPFLAGS-test-cond-printers.c := $(CFLAGS-printers-tests)
 CPPFLAGS-test-rwlockattr-printers.c := $(CFLAGS-printers-tests)
 CPPFLAGS-test-rwlock-printers.c := $(CFLAGS-printers-tests)
 
+# Reuse the CFLAGS setting for the GDB attaching test.  It needs
+# debugging information.
+CFLAGS-tst-pthread-gdb-attach.c := $(CFLAGS-printers-tests)
+CPPFLAGS-tst-pthread-gdb-attach.c := $(CFLAGS-printers-tests)
+ifeq ($(build-shared)$(build-hardcoded-path-in-tests),yesno)
+CPPFLAGS-tst-pthread-gdb-attach.c += -DDO_ADD_SYMBOL_FILE=1
+else
+CPPFLAGS-tst-pthread-gdb-attach.c += -DDO_ADD_SYMBOL_FILE=0
+endif
+CFLAGS-tst-pthread-gdb-attach-static.c := $(CFLAGS-printers-tests)
+CPPFLAGS-tst-pthread-gdb-attach-static.c := \
+  $(CFLAGS-printers-tests) -DDO_ADD_SYMBOL_FILE=0
+# As of version 9.2, GDB cannot attach properly to PIE programs that
+# were launched with an explicit ld.so invocation.
+tst-pthread-gdb-attach-no-pie = yes
+
 ifeq ($(build-shared),yes)
 tests-printers-libs := $(shared-thread-library)
 else
@@ -415,7 +428,8 @@ link-libc-static := $(common-objpfx)libc.a $(static-gnulib) \
 tests-static += tst-stackguard1-static \
 		tst-cancel24-static \
 		tst-mutex8-static tst-mutexpi8-static tst-sem11-static \
-		tst-sem12-static tst-cond11-static
+		tst-sem12-static tst-cond11-static \
+		tst-pthread-gdb-attach-static
 
 tests += tst-cancel24-static
 
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index e5efa2e62..79be1bc70 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -602,6 +602,67 @@ extern void __pthread_cleanup_pop (struct _pthread_cleanup_buffer *buffer,
 # undef pthread_cleanup_pop
 # define pthread_cleanup_pop(execute)                   \
   __pthread_cleanup_pop (&_buffer, (execute)); }
+
+# if defined __EXCEPTIONS && !defined __cplusplus
+/* Structure to hold the cleanup handler information.  */
+struct __pthread_cleanup_combined_frame
+{
+  void (*__cancel_routine) (void *);
+  void *__cancel_arg;
+  int __do_it;
+  struct _pthread_cleanup_buffer __buffer;
+};
+
+/* Special cleanup macros which register cleanup both using
+   __pthread_cleanup_{push,pop} and using cleanup attribute.  This is needed
+   for pthread_once, so that it supports both throwing exceptions from the
+   pthread_once callback (only cleanup attribute works there) and cancellation
+   of the thread running the callback if the callback or some routines it
+   calls don't have unwind information.  */
+
+static __always_inline void
+__pthread_cleanup_combined_routine (struct __pthread_cleanup_combined_frame
+				    *__frame)
+{
+  if (__frame->__do_it)
+    {
+      __frame->__cancel_routine (__frame->__cancel_arg);
+      __frame->__do_it = 0;
+      __pthread_cleanup_pop (&__frame->__buffer, 0);
+    }
+}
+
+static inline void
+__pthread_cleanup_combined_routine_voidptr (void *__arg)
+{
+  struct __pthread_cleanup_combined_frame *__frame
+    = (struct __pthread_cleanup_combined_frame *) __arg;
+  if (__frame->__do_it)
+    {
+      __frame->__cancel_routine (__frame->__cancel_arg);
+      __frame->__do_it = 0;
+    }
+}
+
+#  define pthread_cleanup_combined_push(routine, arg) \
+  do {									      \
+    void (*__cancel_routine) (void *) = (routine);			      \
+    struct __pthread_cleanup_combined_frame __clframe			      \
+      __attribute__ ((__cleanup__ (__pthread_cleanup_combined_routine)))      \
+      = { .__cancel_routine = __cancel_routine, .__cancel_arg = (arg),	      \
+	  .__do_it = 1 };						      \
+    __pthread_cleanup_push (&__clframe.__buffer,			      \
+			    __pthread_cleanup_combined_routine_voidptr,	      \
+			    &__clframe);
+
+#  define pthread_cleanup_combined_pop(execute) \
+    __pthread_cleanup_pop (&__clframe.__buffer, 0);			      \
+    __clframe.__do_it = 0;						      \
+    if (execute)							      \
+      __cancel_routine (__clframe.__cancel_arg);			      \
+  } while (0)
+
+# endif
 #endif
 
 extern void __pthread_cleanup_push_defer (struct _pthread_cleanup_buffer *buffer,
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6c645aff4..337a7dcba 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -51,6 +51,14 @@ static td_thr_events_t __nptl_threads_events __attribute_used__;
 /* Pointer to descriptor with the last event.  */
 static struct pthread *__nptl_last_event __attribute_used__;
 
+#ifdef SHARED
+/* This variable is used to access _rtld_global from libthread_db.  If
+   GDB loads libpthread before ld.so, it is not possible to resolve
+   _rtld_global directly during libpthread initialization.  */
+static struct rtld_global *__nptl_rtld_global __attribute_used__
+  = &_rtld_global;
+#endif
+
 /* Number of threads running.  */
 unsigned int __nptl_nthreads = 1;
 
@@ -426,8 +434,6 @@ START_THREAD_DEFN
   unwind_buf.priv.data.prev = NULL;
   unwind_buf.priv.data.cleanup = NULL;
 
-  __libc_signal_restore_set (&pd->sigmask);
-
   /* Allow setxid from now onwards.  */
   if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2))
     futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);
@@ -437,6 +443,8 @@ START_THREAD_DEFN
       /* Store the new cleanup handler info.  */
       THREAD_SETMEM (pd, cleanup_jmp_buf, &unwind_buf);
 
+      __libc_signal_restore_set (&pd->sigmask);
+
       /* We are either in (a) or (b), and in either case we either own
          PD already (2) or are about to own PD (1), and so our only
 	 restriction would be that we can't free PD until we know we
diff --git a/nptl/pthread_once.c b/nptl/pthread_once.c
index 28d97097c..7645da222 100644
--- a/nptl/pthread_once.c
+++ b/nptl/pthread_once.c
@@ -111,11 +111,11 @@ __pthread_once_slow (pthread_once_t *once_control, void (*init_routine) (void))
       /* This thread is the first here.  Do the initialization.
 	 Register a cleanup handler so that in case the thread gets
 	 interrupted the initialization can be restarted.  */
-      pthread_cleanup_push (clear_once_control, once_control);
+      pthread_cleanup_combined_push (clear_once_control, once_control);
 
       init_routine ();
 
-      pthread_cleanup_pop (0);
+      pthread_cleanup_combined_pop (0);
 
 
       /* Mark *once_control as having finished the initialization.  We need
diff --git a/nptl/tst-once5.cc b/nptl/tst-once5.cc
index b797ab356..60fe1ef82 100644
--- a/nptl/tst-once5.cc
+++ b/nptl/tst-once5.cc
@@ -59,7 +59,7 @@ do_test (void)
                " throwing an exception", stderr);
     }
     catch (OnceException) {
-      if (1 < niter)
+      if (niter > 1)
         fputs ("pthread_once unexpectedly threw", stderr);
       result = 0;
     }
@@ -75,7 +75,5 @@ do_test (void)
   return result;
 }
 
-// The test currently hangs and is XFAILed.  Reduce the timeout.
-#define TIMEOUT 1
 #define TEST_FUNCTION do_test ()
 #include "../test-skeleton.c"
diff --git a/nptl/tst-pthread-gdb-attach-static.c b/nptl/tst-pthread-gdb-attach-static.c
new file mode 100644
index 000000000..e159632ca
--- /dev/null
+++ b/nptl/tst-pthread-gdb-attach-static.c
@@ -0,0 +1 @@
+#include "tst-pthread-gdb-attach.c"
diff --git a/nptl/tst-pthread-gdb-attach.c b/nptl/tst-pthread-gdb-attach.c
new file mode 100644
index 000000000..901a12003
--- /dev/null
+++ b/nptl/tst-pthread-gdb-attach.c
@@ -0,0 +1,217 @@
+/* Smoke testing GDB process attach with thread-local variable access.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* This test runs GDB against a forked copy of itself, to check
+   whether libthread_db can be loaded, and that access to thread-local
+   variables works.  */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <support/test-driver.h>
+#include <support/xstdio.h>
+#include <support/xthread.h>
+#include <support/xunistd.h>
+#include <unistd.h>
+
+/* Starts out as zero, changed to 1 or 2 by the debugger, depending on
+   the thread.  */
+__thread volatile int altered_by_debugger;
+
+/* Common prefix between 32-bit and 64-bit ELF.  */
+struct elf_prefix
+{
+  unsigned char e_ident[EI_NIDENT];
+  uint16_t e_type;
+  uint16_t e_machine;
+  uint32_t e_version;
+};
+_Static_assert (sizeof (struct elf_prefix) == EI_NIDENT + 8,
+                "padding in struct elf_prefix");
+
+/* Reads the ELF header from PATH.  Returns true if the header can be
+   read, false if the file is too short.  */
+static bool
+read_elf_header (const char *path, struct elf_prefix *elf)
+{
+  int fd = xopen (path, O_RDONLY, 0);
+  bool result = read (fd, elf, sizeof (*elf)) == sizeof (*elf);
+  xclose (fd);
+  return result;
+}
+
+/* Searches for "gdb" alongside the path variable.  See execvpe.  */
+static char *
+find_gdb (void)
+{
+  const char *path = getenv ("PATH");
+  if (path == NULL)
+    return NULL;
+  while (true)
+    {
+      const char *colon = strchrnul (path, ':');
+      char *candidate = xasprintf ("%.*s/gdb", (int) (colon - path), path);
+      if (access (candidate, X_OK) == 0)
+        return candidate;
+      free (candidate);
+      if (*colon == '\0')
+        break;
+      path = colon + 1;
+    }
+  return NULL;
+}
+
+/* Writes the GDB script to run the test to PATH.  */
+static void
+write_gdbscript (const char *path, int tested_pid)
+{
+  FILE *fp = xfopen (path, "w");
+  fprintf (fp,
+           "set trace-commands on\n"
+           "set debug libthread-db 1\n"
+#if DO_ADD_SYMBOL_FILE
+           /* Do not do this unconditionally to work around a GDB
+              assertion failure: ../../gdb/symtab.c:6404:
+              internal-error: CORE_ADDR get_msymbol_address(objfile*,
+              const minimal_symbol*): Assertion `(objf->flags &
+              OBJF_MAINLINE) == 0' failed.  */
+           "add-symbol-file %1$s/nptl/tst-pthread-gdb-attach\n"
+#endif
+           "set auto-load safe-path %1$s/nptl_db\n"
+           "set libthread-db-search-path %1$s/nptl_db\n"
+           "attach %2$d\n",
+           support_objdir_root, tested_pid);
+  fputs ("break debugger_inspection_point\n"
+         "continue\n"
+         "thread 1\n"
+         "print altered_by_debugger\n"
+         "print altered_by_debugger = 1\n"
+         "thread 2\n"
+         "print altered_by_debugger\n"
+         "print altered_by_debugger = 2\n"
+         "continue\n",
+         fp);
+  xfclose (fp);
+}
+
+/* The test sets a breakpoint on this function and alters the
+   altered_by_debugger thread-local variable.  */
+void __attribute__ ((weak))
+debugger_inspection_point (void)
+{
+}
+
+/* Thread function for the test thread in the subprocess.  */
+static void *
+subprocess_thread (void *closure)
+{
+  /* Wait until altered_by_debugger changes the value away from 0.  */
+  while (altered_by_debugger == 0)
+    {
+      usleep (100 * 1000);
+      debugger_inspection_point ();
+    }
+
+  TEST_COMPARE (altered_by_debugger, 2);
+  return NULL;
+}
+
+/* This function implements the subprocess under test.  It creates a
+   second thread, waiting for its value to change to 2, and checks
+   that the main thread also changed its value to 1.  */
+static void
+in_subprocess (void)
+{
+  pthread_t thr = xpthread_create (NULL, subprocess_thread, NULL);
+  TEST_VERIFY (xpthread_join (thr) == NULL);
+  TEST_COMPARE (altered_by_debugger, 1);
+  _exit (0);
+}
+
+static int
+do_test (void)
+{
+  char *gdb_path = find_gdb ();
+  if (gdb_path == NULL)
+    FAIL_UNSUPPORTED ("gdb command not found in PATH: %s", getenv ("PATH"));
+
+  /* Check that libthread_db is compatible with the gdb architecture
+     because gdb loads it via dlopen.  */
+  {
+    char *threaddb_path = xasprintf ("%s/nptl_db/libthread_db.so",
+                                     support_objdir_root);
+    struct elf_prefix elf_threaddb;
+    TEST_VERIFY_EXIT (read_elf_header (threaddb_path, &elf_threaddb));
+    struct elf_prefix elf_gdb;
+    /* If the ELF header cannot be read or "gdb" is not an ELF file,
+       assume this is a wrapper script that can run.  */
+    if (read_elf_header (gdb_path, &elf_gdb)
+        && memcmp (&elf_gdb, ELFMAG, SELFMAG) == 0)
+      {
+        if (elf_gdb.e_ident[EI_CLASS] != elf_threaddb.e_ident[EI_CLASS])
+          FAIL_UNSUPPORTED ("GDB at %s has wrong class", gdb_path);
+        if (elf_gdb.e_ident[EI_DATA] != elf_threaddb.e_ident[EI_DATA])
+          FAIL_UNSUPPORTED ("GDB at %s has wrong data", gdb_path);
+        if (elf_gdb.e_machine != elf_threaddb.e_machine)
+          FAIL_UNSUPPORTED ("GDB at %s has wrong machine", gdb_path);
+      }
+    free (threaddb_path);
+  }
+
+  pid_t tested_pid = xfork ();
+  if (tested_pid == 0)
+    in_subprocess ();
+  char *tested_pid_string = xasprintf ("%d", tested_pid);
+
+  char *gdbscript;
+  xclose (create_temp_file ("tst-pthread-gdb-attach-", &gdbscript));
+  write_gdbscript (gdbscript, tested_pid);
+
+  pid_t gdb_pid = xfork ();
+  if (gdb_pid == 0)
+    {
+      xdup2 (STDOUT_FILENO, STDERR_FILENO);
+      execl (gdb_path, "gdb", "-nx", "-batch", "-x", gdbscript, NULL);
+      if (errno == ENOENT)
+        _exit (EXIT_UNSUPPORTED);
+      else
+        _exit (1);
+    }
+
+  int status;
+  TEST_COMPARE (xwaitpid (gdb_pid, &status, 0), gdb_pid);
+  if (WIFEXITED (status) && WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+    /* gdb is not installed.  */
+    return EXIT_UNSUPPORTED;
+  TEST_COMPARE (status, 0);
+  TEST_COMPARE (xwaitpid (tested_pid, &status, 0), tested_pid);
+  TEST_COMPARE (status, 0);
+
+  free (tested_pid_string);
+  free (gdbscript);
+  free (gdb_path);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/nptl/unwind.c b/nptl/unwind.c
index 9c7ed7d7c..0a2ec9b8b 100644
--- a/nptl/unwind.c
+++ b/nptl/unwind.c
@@ -26,7 +26,7 @@
 #include <libc-diag.h>
 #include <jmpbuf-unwind.h>
 
-#ifdef _STACK_GROWS_DOWN
+#if _STACK_GROWS_DOWN
 # define FRAME_LEFT(frame, other, adj) \
   ((uintptr_t) frame - adj >= (uintptr_t) other - adj)
 #elif _STACK_GROWS_UP
diff --git a/nptl_db/structs.def b/nptl_db/structs.def
index 999a9fc35..8a613dd2f 100644
--- a/nptl_db/structs.def
+++ b/nptl_db/structs.def
@@ -100,8 +100,7 @@ DB_STRUCT_FIELD (pthread, dtvp)
 #endif
 
 #if !(IS_IN (libpthread) && !defined SHARED)
-DB_STRUCT (rtld_global)
-DB_RTLD_VARIABLE (_rtld_global)
+DB_VARIABLE (__nptl_rtld_global)
 #endif
 DB_RTLD_GLOBAL_FIELD (dl_tls_dtv_slotinfo_list)
 DB_RTLD_GLOBAL_FIELD (dl_stack_user)
diff --git a/nptl_db/td_init.c b/nptl_db/td_init.c
index 1d1568122..06b5adc5c 100644
--- a/nptl_db/td_init.c
+++ b/nptl_db/td_init.c
@@ -33,13 +33,14 @@ td_init (void)
 bool
 __td_ta_rtld_global (td_thragent_t *ta)
 {
-  if (ta->ta_addr__rtld_global == 0
-      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
-                        &ta->ta_addr__rtld_global) != PS_OK)
+  if (ta->ta_addr__rtld_global == 0)
     {
-      ta->ta_addr__rtld_global = (void*)-1;
-      return false;
+      psaddr_t rtldglobalp;
+      if (DB_GET_VALUE (rtldglobalp, ta, __nptl_rtld_global, 0) == TD_OK)
+        ta->ta_addr__rtld_global = rtldglobalp;
+      else
+        ta->ta_addr__rtld_global = (void *) -1;
     }
-  else
-    return ta->ta_addr__rtld_global != (void*)-1;
+
+  return ta->ta_addr__rtld_global != (void *)-1;
 }
diff --git a/nptl_db/thread_dbP.h b/nptl_db/thread_dbP.h
index 580a70c47..712fa3aeb 100644
--- a/nptl_db/thread_dbP.h
+++ b/nptl_db/thread_dbP.h
@@ -108,6 +108,8 @@ struct td_thragent
 # undef DB_SYMBOL
 # undef DB_VARIABLE
 
+  psaddr_t ta_addr__rtld_global;
+
   /* The method of locating a thread's th_unique value.  */
   enum
     {
diff --git a/nscd/netgroupcache.c b/nscd/netgroupcache.c
index dba6ceec1..ad2daddaf 100644
--- a/nscd/netgroupcache.c
+++ b/nscd/netgroupcache.c
@@ -248,7 +248,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req,
 					     : NULL);
 				    ndomain = (ndomain ? newbuf + ndomaindiff
 					       : NULL);
-				    buffer = newbuf;
+				    *tofreep = buffer = newbuf;
 				  }
 
 				nhost = memcpy (buffer + bufused,
@@ -319,7 +319,7 @@ addgetnetgrentX (struct database_dyn *db, int fd, request_header *req,
 		    else if (status == NSS_STATUS_TRYAGAIN && e == ERANGE)
 		      {
 			buflen *= 2;
-			buffer = xrealloc (buffer, buflen);
+			*tofreep = buffer = xrealloc (buffer, buflen);
 		      }
 		    else if (status == NSS_STATUS_RETURN
 			     || status == NSS_STATUS_NOTFOUND
diff --git a/nss/nss_database.c b/nss/nss_database.c
index cf0306adc..fb72d0cc0 100644
--- a/nss/nss_database.c
+++ b/nss/nss_database.c
@@ -398,10 +398,10 @@ nss_database_check_reload_and_get (struct nss_database_state *local,
 	  && (str.st_ino != local->root_ino
 	      ||  str.st_dev != local->root_dev)))
     {
-      /* Change detected; disable reloading.  */
+      /* Change detected; disable reloading and return current state.  */
       atomic_store_release (&local->data.reload_disabled, 1);
+      *result = local->data.services[database_index];
       __libc_lock_unlock (local->lock);
-      __nss_module_disable_loading ();
       return true;
     }
   local->root_ino = str.st_ino;
diff --git a/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf b/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf
new file mode 100644
index 000000000..5b0c6a419
--- /dev/null
+++ b/nss/tst-nss-files-hosts-long.root/etc/nsswitch.conf
@@ -0,0 +1 @@
+hosts: files
diff --git a/nss/tst-reload2.c b/nss/tst-reload2.c
index 5dae16b4f..5ecb032e9 100644
--- a/nss/tst-reload2.c
+++ b/nss/tst-reload2.c
@@ -26,6 +26,7 @@
 #include <pwd.h>
 #include <grp.h>
 #include <unistd.h>
+#include <netdb.h>
 
 #include <support/support.h>
 #include <support/check.h>
@@ -48,7 +49,7 @@ static const char *group_4[] = {
   "alpha", "beta", "gamma", "fred", NULL
 };
 
-static struct group group_table_data[] =
+static struct group group_table_data1[] =
   {
    GRP (4),
    GRP_LAST ()
@@ -58,7 +59,7 @@ void
 _nss_test1_init_hook (test_tables *t)
 {
   t->pwd_table = pwd_table1;
-  t->grp_table = group_table_data;
+  t->grp_table = group_table_data1;
 }
 
 static struct passwd pwd_table2[] =
@@ -68,10 +69,21 @@ static struct passwd pwd_table2[] =
    PWD_LAST ()
   };
 
+static const char *group_5[] = {
+  "fred", NULL
+};
+
+static struct group group_table_data2[] =
+  {
+   GRP (5),
+   GRP_LAST ()
+  };
+
 void
 _nss_test2_init_hook (test_tables *t)
 {
   t->pwd_table = pwd_table2;
+  t->grp_table = group_table_data2;
 }
 
 static int
@@ -79,6 +91,7 @@ do_test (void)
 {
   struct passwd *pw;
   struct group *gr;
+  struct hostent *he;
   char buf1[PATH_MAX];
   char buf2[PATH_MAX];
 
@@ -99,7 +112,9 @@ do_test (void)
     TEST_COMPARE (pw->pw_uid, 1234);
 
   /* This just loads the test2 DSO.  */
-  gr = getgrnam ("name4");
+  gr = getgrgid (5);
+  TEST_VERIFY (gr != NULL);
+
 
   /* Change the root dir.  */
 
@@ -114,15 +129,21 @@ do_test (void)
   if (pw)
     TEST_VERIFY (pw->pw_uid != 2468);
 
-  /* The "files" DSO should not be loaded.  */
-  gr = getgrnam ("test3");
-  TEST_VERIFY (gr == NULL);
-
   /* We should still be using the old configuration.  */
   pw = getpwnam ("test1");
   TEST_VERIFY (pw != NULL);
   if (pw)
     TEST_COMPARE (pw->pw_uid, 1234);
+  gr = getgrgid (5);
+  TEST_VERIFY (gr != NULL);
+  gr = getgrnam ("name4");
+  TEST_VERIFY (gr == NULL);
+
+  /* hosts in the outer nsswitch is files; the inner one is test1.
+     Verify that we're still using the outer nsswitch *and* that we
+     can load the files DSO. */
+  he = gethostbyname ("test2");
+  TEST_VERIFY (he != NULL);
 
   return 0;
 }
diff --git a/nss/tst-reload2.root/etc/hosts b/nss/tst-reload2.root/etc/hosts
new file mode 100644
index 000000000..bbd9e494e
--- /dev/null
+++ b/nss/tst-reload2.root/etc/hosts
@@ -0,0 +1 @@
+1.2.3.4 test1
diff --git a/nss/tst-reload2.root/etc/nsswitch.conf b/nss/tst-reload2.root/etc/nsswitch.conf
index 570795ae2..688a58951 100644
--- a/nss/tst-reload2.root/etc/nsswitch.conf
+++ b/nss/tst-reload2.root/etc/nsswitch.conf
@@ -1,2 +1,3 @@
 passwd: test1
 group: test2
+hosts: files
diff --git a/nss/tst-reload2.root/subdir/etc/hosts b/nss/tst-reload2.root/subdir/etc/hosts
new file mode 100644
index 000000000..0a2cbd433
--- /dev/null
+++ b/nss/tst-reload2.root/subdir/etc/hosts
@@ -0,0 +1 @@
+1.2.3.4 test2
diff --git a/nss/tst-reload2.root/subdir/etc/nsswitch.conf b/nss/tst-reload2.root/subdir/etc/nsswitch.conf
index f1d73f876..fea271869 100644
--- a/nss/tst-reload2.root/subdir/etc/nsswitch.conf
+++ b/nss/tst-reload2.root/subdir/etc/nsswitch.conf
@@ -1,2 +1,3 @@
 passwd: test2
 group: files
+hosts: test1
diff --git a/posix/bits/unistd.h b/posix/bits/unistd.h
index f0831386c..622adeb2b 100644
--- a/posix/bits/unistd.h
+++ b/posix/bits/unistd.h
@@ -199,10 +199,9 @@ __NTH (readlinkat (int __fd, const char *__restrict __path,
 #endif
 
 extern char *__getcwd_chk (char *__buf, size_t __size, size_t __buflen)
-     __THROW __wur __attr_access ((__write_only__, 1, 2));
+     __THROW __wur;
 extern char *__REDIRECT_NTH (__getcwd_alias,
-			     (char *__buf, size_t __size), getcwd)
-  __wur __attr_access ((__write_only__, 1, 2));
+			     (char *__buf, size_t __size), getcwd) __wur;
 extern char *__REDIRECT_NTH (__getcwd_chk_warn,
 			     (char *__buf, size_t __size, size_t __buflen),
 			     __getcwd_chk)
diff --git a/posix/unistd.h b/posix/unistd.h
index 3f2276337..bede49c1f 100644
--- a/posix/unistd.h
+++ b/posix/unistd.h
@@ -517,8 +517,7 @@ extern int fchdir (int __fd) __THROW __wur;
    an array is allocated with `malloc'; the array is SIZE
    bytes long, unless SIZE == 0, in which case it is as
    big as necessary.  */
-extern char *getcwd (char *__buf, size_t __size) __THROW __wur
-    __attr_access ((__write_only__, 1, 2));
+extern char *getcwd (char *__buf, size_t __size) __THROW __wur;
 
 #ifdef	__USE_GNU
 /* Return a malloc'd string containing the current directory name.
diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
index f93a546d7..9df02dbbb 100644
--- a/posix/wordexp-test.c
+++ b/posix/wordexp-test.c
@@ -183,6 +183,7 @@ struct test_case_struct
     { 0, NULL, "$var", 0, 0, { NULL, }, IFS },
     { 0, NULL, "\"\\n\"", 0, 1, { "\\n", }, IFS },
     { 0, NULL, "", 0, 0, { NULL, }, IFS },
+    { 0, NULL, "${1234567890123456789012}", 0, 0, { NULL, }, IFS },
 
     /* Flags not already covered (testit() has special handling for these) */
     { 0, NULL, "one two", WRDE_DOOFFS, 2, { "one", "two", }, IFS },
diff --git a/posix/wordexp.c b/posix/wordexp.c
index bcbe96e48..1f3b09f72 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -1399,7 +1399,7 @@ envsubst:
   /* Is it a numeric parameter? */
   else if (isdigit (env[0]))
     {
-      int n = atoi (env);
+      unsigned long n = strtoul (env, NULL, 10);
 
       if (n >= __libc_argc)
 	/* Substitute NULL. */
diff --git a/rt/Makefile b/rt/Makefile
index 7b374f207..c87d95793 100644
--- a/rt/Makefile
+++ b/rt/Makefile
@@ -44,6 +44,7 @@ tests := tst-shm tst-timer tst-timer2 \
 	 tst-aio7 tst-aio8 tst-aio9 tst-aio10 \
 	 tst-mqueue1 tst-mqueue2 tst-mqueue3 tst-mqueue4 \
 	 tst-mqueue5 tst-mqueue6 tst-mqueue7 tst-mqueue8 tst-mqueue9 \
+	 tst-bz28213 \
 	 tst-timer3 tst-timer4 tst-timer5 \
 	 tst-cpuclock2 tst-cputimer1 tst-cputimer2 tst-cputimer3 \
 	 tst-shm-cancel
diff --git a/rt/tst-bz28213.c b/rt/tst-bz28213.c
new file mode 100644
index 000000000..0c096b5a0
--- /dev/null
+++ b/rt/tst-bz28213.c
@@ -0,0 +1,101 @@
+/* Bug 28213: test for NULL pointer dereference in mq_notify.
+   Copyright (C) The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <mqueue.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+
+static mqd_t m = -1;
+static const char msg[] = "hello";
+
+static void
+check_bz28213_cb (union sigval sv)
+{
+  char buf[sizeof (msg)];
+
+  (void) sv;
+
+  TEST_VERIFY_EXIT ((size_t) mq_receive (m, buf, sizeof (buf), NULL)
+		    == sizeof (buf));
+  TEST_VERIFY_EXIT (memcmp (buf, msg, sizeof (buf)) == 0);
+
+  exit (0);
+}
+
+static void
+check_bz28213 (void)
+{
+  struct sigevent sev;
+
+  memset (&sev, '\0', sizeof (sev));
+  sev.sigev_notify = SIGEV_THREAD;
+  sev.sigev_notify_function = check_bz28213_cb;
+
+  /* Step 1: Register & unregister notifier.
+     Helper thread should receive NOTIFY_REMOVED notification.
+     In a vulnerable version of glibc, NULL pointer dereference follows. */
+  TEST_VERIFY_EXIT (mq_notify (m, &sev) == 0);
+  TEST_VERIFY_EXIT (mq_notify (m, NULL) == 0);
+
+  /* Step 2: Once again, register notification.
+     Try to send one message.
+     Test is considered successful, if the callback does exit (0). */
+  TEST_VERIFY_EXIT (mq_notify (m, &sev) == 0);
+  TEST_VERIFY_EXIT (mq_send (m, msg, sizeof (msg), 1) == 0);
+
+  /* Wait... */
+  pause ();
+}
+
+static int
+do_test (void)
+{
+  static const char m_name[] = "/bz28213_queue";
+  struct mq_attr m_attr;
+
+  memset (&m_attr, '\0', sizeof (m_attr));
+  m_attr.mq_maxmsg = 1;
+  m_attr.mq_msgsize = sizeof (msg);
+
+  m = mq_open (m_name,
+               O_RDWR | O_CREAT | O_EXCL,
+               0600,
+               &m_attr);
+
+  if (m < 0)
+    {
+      if (errno == ENOSYS)
+        FAIL_UNSUPPORTED ("POSIX message queues are not implemented\n");
+      FAIL_EXIT1 ("Failed to create POSIX message queue: %m\n");
+    }
+
+  TEST_VERIFY_EXIT (mq_unlink (m_name) == 0);
+
+  check_bz28213 ();
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/socket/Makefile b/socket/Makefile
index 600891d5d..4ab67aed6 100644
--- a/socket/Makefile
+++ b/socket/Makefile
@@ -29,10 +29,14 @@ headers	:= sys/socket.h sys/un.h bits/sockaddr.h bits/socket.h \
 routines := accept bind connect getpeername getsockname getsockopt	\
 	    listen recv recvfrom recvmsg send sendmsg sendto		\
 	    setsockopt shutdown socket socketpair isfdtype opensock	\
-	    sockatmark accept4 recvmmsg sendmmsg
+	    sockatmark accept4 recvmmsg sendmmsg sockaddr_un_set
 
 tests := tst-accept4
 
+tests-internal := \
+  tst-sockaddr_un_set \
+  # tests-internal
+
 aux	 := sa_len
 
 include ../Rules
diff --git a/socket/opensock.c b/socket/opensock.c
index 37148d474..3e35821f9 100644
--- a/socket/opensock.c
+++ b/socket/opensock.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
+/* Create socket with an unspecified address family for use with ioctl.
+   Copyright (C) 1999-2021 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -15,56 +16,27 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <stdio.h>
+#include <errno.h>
 #include <sys/socket.h>
-#include <libc-lock.h>
 
 /* Return a socket of any type.  The socket can be used in subsequent
    ioctl calls to talk to the kernel.  */
 int
 __opensock (void)
 {
-  /* Cache the last AF that worked, to avoid many redundant calls to
-     socket().  */
-  static int sock_af = -1;
-  int fd = -1;
-  __libc_lock_define_initialized (static, lock);
-
-  if (sock_af != -1)
-    {
-      fd = __socket (sock_af, SOCK_DGRAM, 0);
-      if (fd != -1)
-        return fd;
-    }
-
-  __libc_lock_lock (lock);
-
-  if (sock_af != -1)
-    fd = __socket (sock_af, SOCK_DGRAM, 0);
-
-  if (fd == -1)
-    {
-#ifdef AF_INET
-      fd = __socket (sock_af = AF_INET, SOCK_DGRAM, 0);
-#endif
-#ifdef AF_INET6
-      if (fd < 0)
-	fd = __socket (sock_af = AF_INET6, SOCK_DGRAM, 0);
-#endif
-#ifdef AF_IPX
-      if (fd < 0)
-	fd = __socket (sock_af = AF_IPX, SOCK_DGRAM, 0);
-#endif
-#ifdef AF_AX25
-      if (fd < 0)
-	fd = __socket (sock_af = AF_AX25, SOCK_DGRAM, 0);
-#endif
-#ifdef AF_APPLETALK
-      if (fd < 0)
-	fd = __socket (sock_af = AF_APPLETALK, SOCK_DGRAM, 0);
-#endif
-    }
-
-  __libc_lock_unlock (lock);
+  /* SOCK_DGRAM is supported by all address families.  */
+  int type = SOCK_DGRAM | SOCK_CLOEXEC;
+  int fd;
+
+  fd = __socket (AF_UNIX, type, 0);
+  if (fd >= 0)
+    return fd;
+  fd = __socket (AF_INET, type, 0);
+  if (fd >= 0)
+    return fd;
+  fd = __socket (AF_INET6, type, 0);
+  if (fd >= 0)
+    return fd;
+  __set_errno (ENOENT);
   return fd;
 }
diff --git a/socket/sockaddr_un_set.c b/socket/sockaddr_un_set.c
new file mode 100644
index 000000000..0bd40dc34
--- /dev/null
+++ b/socket/sockaddr_un_set.c
@@ -0,0 +1,41 @@
+/* Set the sun_path member of struct sockaddr_un.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+int
+__sockaddr_un_set (struct sockaddr_un *addr, const char *pathname)
+{
+  size_t name_length = strlen (pathname);
+
+  /* The kernel supports names of exactly sizeof (addr->sun_path)
+     bytes, without a null terminator, but userspace does not; see the
+     SUN_LEN macro.  */
+  if (name_length >= sizeof (addr->sun_path))
+    {
+      __set_errno (EINVAL);     /* Error code used by the kernel.  */
+      return -1;
+    }
+
+  addr->sun_family = AF_UNIX;
+  memcpy (addr->sun_path, pathname, name_length + 1);
+  return 0;
+}
diff --git a/socket/tst-sockaddr_un_set.c b/socket/tst-sockaddr_un_set.c
new file mode 100644
index 000000000..29c2a81af
--- /dev/null
+++ b/socket/tst-sockaddr_un_set.c
@@ -0,0 +1,62 @@
+/* Test the __sockaddr_un_set function.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Re-compile the function because the version in libc is not
+   exported.  */
+#include "sockaddr_un_set.c"
+
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  struct sockaddr_un sun;
+
+  memset (&sun, 0xcc, sizeof (sun));
+  __sockaddr_un_set (&sun, "");
+  TEST_COMPARE (sun.sun_family, AF_UNIX);
+  TEST_COMPARE (__sockaddr_un_set (&sun, ""), 0);
+
+  memset (&sun, 0xcc, sizeof (sun));
+  TEST_COMPARE (__sockaddr_un_set (&sun, "/example"), 0);
+  TEST_COMPARE_STRING (sun.sun_path, "/example");
+
+  {
+    char pathname[108];         /* Length of sun_path (ABI constant).  */
+    memset (pathname, 'x', sizeof (pathname));
+    pathname[sizeof (pathname) - 1] = '\0';
+    memset (&sun, 0xcc, sizeof (sun));
+    TEST_COMPARE (__sockaddr_un_set (&sun, pathname), 0);
+    TEST_COMPARE (sun.sun_family, AF_UNIX);
+    TEST_COMPARE_STRING (sun.sun_path, pathname);
+  }
+
+  {
+    char pathname[109];
+    memset (pathname, 'x', sizeof (pathname));
+    pathname[sizeof (pathname) - 1] = '\0';
+    memset (&sun, 0xcc, sizeof (sun));
+    errno = 0;
+    TEST_COMPARE (__sockaddr_un_set (&sun, pathname), -1);
+    TEST_COMPARE (errno, EINVAL);
+  }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/stdlib/Makefile b/stdlib/Makefile
index b3b30ab73..b1eebd568 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -86,7 +86,8 @@ tests		:= tst-strtol tst-strtod testmb testrand testsort testdiv   \
 		   tst-makecontext-align test-bz22786 tst-strtod-nan-sign \
 		   tst-swapcontext1 tst-setcontext4 tst-setcontext5 \
 		   tst-setcontext6 tst-setcontext7 tst-setcontext8 \
-		   tst-setcontext9 tst-bz20544 tst-canon-bz26341
+		   tst-setcontext9 tst-bz20544 tst-canon-bz26341 \
+		   tst-realpath-toolong
 
 tests-internal	:= tst-strtod1i tst-strtod3 tst-strtod4 tst-strtod5i \
 		   tst-tls-atexit tst-tls-atexit-nodelete
diff --git a/stdlib/canonicalize.c b/stdlib/canonicalize.c
index 698f9ede2..e2d4244fc 100644
--- a/stdlib/canonicalize.c
+++ b/stdlib/canonicalize.c
@@ -400,8 +400,16 @@ realpath_stk (const char *name, char *resolved,
 
 error:
   *dest++ = '\0';
-  if (resolved != NULL && dest - rname <= get_path_max ())
-    rname = strcpy (resolved, rname);
+  if (resolved != NULL)
+    {
+      if (dest - rname <= get_path_max ())
+	rname = strcpy (resolved, rname);
+      else if (!failed)
+	{
+	  failed = true;
+	  __set_errno (ENAMETOOLONG);
+	}
+    }
 
 error_nomem:
   scratch_buffer_free (&extra_buffer);
diff --git a/stdlib/tst-realpath-toolong.c b/stdlib/tst-realpath-toolong.c
new file mode 100644
index 000000000..438889029
--- /dev/null
+++ b/stdlib/tst-realpath-toolong.c
@@ -0,0 +1,53 @@
+/* Verify that realpath returns NULL with ENAMETOOLONG if the result exceeds
+   NAME_MAX.
+   Copyright The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <support/check.h>
+#include <support/temp_file.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#define BASENAME "tst-realpath-toolong."
+
+#ifndef PATH_MAX
+# define PATH_MAX 1024
+#endif
+
+int
+do_test (void)
+{
+  char *base = support_create_and_chdir_toolong_temp_directory (BASENAME);
+
+  char buf[PATH_MAX + 1];
+  const char *res = realpath (".", buf);
+
+  /* canonicalize.c states that if the real path is >= PATH_MAX, then
+     realpath returns NULL and sets ENAMETOOLONG.  */
+  TEST_VERIFY (res == NULL);
+  TEST_VERIFY (errno == ENAMETOOLONG);
+
+  free (base);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
index c9ec03866..5567c9ae2 100644
--- a/stdlib/tst-secure-getenv.c
+++ b/stdlib/tst-secure-getenv.c
@@ -30,167 +30,12 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#include <support/check.h>
 #include <support/support.h>
+#include <support/capture_subprocess.h>
 #include <support/test-driver.h>
 
 static char MAGIC_ARGUMENT[] = "run-actual-test";
-#define MAGIC_STATUS 19
-
-/* Return a GID which is not our current GID, but is present in the
-   supplementary group list.  */
-static gid_t
-choose_gid (void)
-{
-  int count = getgroups (0, NULL);
-  if (count < 0)
-    {
-      printf ("getgroups: %m\n");
-      exit (1);
-    }
-  gid_t *groups;
-  groups = xcalloc (count, sizeof (*groups));
-  int ret = getgroups (count, groups);
-  if (ret < 0)
-    {
-      printf ("getgroups: %m\n");
-      exit (1);
-    }
-  gid_t current = getgid ();
-  gid_t not_current = 0;
-  for (int i = 0; i < ret; ++i)
-    {
-      if (groups[i] != current)
-        {
-          not_current = groups[i];
-          break;
-        }
-    }
-  free (groups);
-  return not_current;
-}
-
-
-/* Copies the executable into a restricted directory, so that we can
-   safely make it SGID with the TARGET group ID.  Then runs the
-   executable.  */
-static int
-run_executable_sgid (gid_t target)
-{
-  char *dirname = xasprintf ("%s/secure-getenv.%jd",
-			     test_dir, (intmax_t) getpid ());
-  char *execname = xasprintf ("%s/bin", dirname);
-  int infd = -1;
-  int outfd = -1;
-  int ret = -1;
-  if (mkdir (dirname, 0700) < 0)
-    {
-      printf ("mkdir: %m\n");
-      goto err;
-    }
-  infd = open ("/proc/self/exe", O_RDONLY);
-  if (infd < 0)
-    {
-      printf ("open (/proc/self/exe): %m\n");
-      goto err;
-    }
-  outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
-  if (outfd < 0)
-    {
-      printf ("open (%s): %m\n", execname);
-      goto err;
-    }
-  char buf[4096];
-  for (;;)
-    {
-      ssize_t rdcount = read (infd, buf, sizeof (buf));
-      if (rdcount < 0)
-	{
-	  printf ("read: %m\n");
-	  goto err;
-	}
-      if (rdcount == 0)
-	break;
-      char *p = buf;
-      char *end = buf + rdcount;
-      while (p != end)
-	{
-	  ssize_t wrcount = write (outfd, buf, end - p);
-	  if (wrcount == 0)
-	    errno = ENOSPC;
-	  if (wrcount <= 0)
-	    {
-	      printf ("write: %m\n");
-	      goto err;
-	    }
-	  p += wrcount;
-	}
-    }
-  if (fchown (outfd, getuid (), target) < 0)
-    {
-      printf ("fchown (%s): %m\n", execname);
-      goto err;
-    }
-  if (fchmod (outfd, 02750) < 0)
-    {
-      printf ("fchmod (%s): %m\n", execname);
-      goto err;
-    }
-  if (close (outfd) < 0)
-    {
-      printf ("close (outfd): %m\n");
-      goto err;
-    }
-  if (close (infd) < 0)
-    {
-      printf ("close (infd): %m\n");
-      goto err;
-    }
-
-  int kid = fork ();
-  if (kid < 0)
-    {
-      printf ("fork: %m\n");
-      goto err;
-    }
-  if (kid == 0)
-    {
-      /* Child process.  */
-      char *args[] = { execname, MAGIC_ARGUMENT, NULL };
-      execve (execname, args, environ);
-      printf ("execve (%s): %m\n", execname);
-      _exit (1);
-    }
-  int status;
-  if (waitpid (kid, &status, 0) < 0)
-    {
-      printf ("waitpid: %m\n");
-      goto err;
-    }
-  if (!WIFEXITED (status) || WEXITSTATUS (status) != MAGIC_STATUS)
-    {
-      printf ("Unexpected exit status %d from child process\n",
-	      status);
-      goto err;
-    }
-  ret = 0;
-
-err:
-  if (outfd >= 0)
-    close (outfd);
-  if (infd >= 0)
-    close (infd);
-  if (execname)
-    {
-      unlink (execname);
-      free (execname);
-    }
-  if (dirname)
-    {
-      rmdir (dirname);
-      free (dirname);
-    }
-  return ret;
-}
 
 static int
 do_test (void)
@@ -212,15 +57,15 @@ do_test (void)
       exit (1);
     }
 
-  gid_t target = choose_gid ();
-  if (target == 0)
-    {
-      fprintf (stderr,
-	       "Could not find a suitable GID for user %jd, skipping test\n",
-	       (intmax_t) getuid ());
-      exit (0);
-    }
-  return run_executable_sgid (target);
+  int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
+
+  if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
+    return EXIT_UNSUPPORTED;
+
+  if (!WIFEXITED (status))
+    FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+
+  return 0;
 }
 
 static void
@@ -229,23 +74,15 @@ alternative_main (int argc, char **argv)
   if (argc == 2 && strcmp (argv[1], MAGIC_ARGUMENT) == 0)
     {
       if (getgid () == getegid ())
-	{
-	  /* This can happen if the file system is mounted nosuid.  */
-	  fprintf (stderr, "SGID failed: GID and EGID match (%jd)\n",
-		  (intmax_t) getgid ());
-	  exit (MAGIC_STATUS);
-	}
+	/* This can happen if the file system is mounted nosuid.  */
+	FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+		   (intmax_t) getgid ());
       if (getenv ("PATH") == NULL)
-	{
-	  printf ("PATH variable not present\n");
-	  exit (3);
-	}
+	FAIL_EXIT (3, "PATH variable not present\n");
       if (secure_getenv ("PATH") != NULL)
-	{
-	  printf ("PATH variable not filtered out\n");
-	  exit (4);
-	}
-      exit (MAGIC_STATUS);
+	FAIL_EXIT (4, "PATH variable not filtered out\n");
+
+      exit (EXIT_SUCCESS);
     }
 }
 
diff --git a/string/rawmemchr.c b/string/rawmemchr.c
index 59bbeeaa4..b8523118e 100644
--- a/string/rawmemchr.c
+++ b/string/rawmemchr.c
@@ -22,24 +22,28 @@
 # define RAWMEMCHR __rawmemchr
 #endif
 
-/* Find the first occurrence of C in S.  */
-void *
-RAWMEMCHR (const void *s, int c)
-{
-  DIAG_PUSH_NEEDS_COMMENT;
+/* The pragmata should be nested inside RAWMEMCHR below, but that
+   triggers GCC PR 98512.  */
+DIAG_PUSH_NEEDS_COMMENT;
 #if __GNUC_PREREQ (7, 0)
-  /* GCC 8 warns about the size passed to memchr being larger than
-     PTRDIFF_MAX; the use of SIZE_MAX is deliberate here.  */
-  DIAG_IGNORE_NEEDS_COMMENT (8, "-Wstringop-overflow=");
+/* GCC 8 warns about the size passed to memchr being larger than
+   PTRDIFF_MAX; the use of SIZE_MAX is deliberate here.  */
+DIAG_IGNORE_NEEDS_COMMENT (8, "-Wstringop-overflow=");
 #endif
 #if __GNUC_PREREQ (11, 0)
-  /* Likewise GCC 11, with a different warning option.  */
-  DIAG_IGNORE_NEEDS_COMMENT (11, "-Wstringop-overread");
+/* Likewise GCC 11, with a different warning option.  */
+DIAG_IGNORE_NEEDS_COMMENT (11, "-Wstringop-overread");
 #endif
+
+/* Find the first occurrence of C in S.  */
+void *
+RAWMEMCHR (const void *s, int c)
+{
   if (c != '\0')
     return memchr (s, c, (size_t)-1);
-  DIAG_POP_NEEDS_COMMENT;
   return (char *)s + strlen (s);
 }
 libc_hidden_def (__rawmemchr)
 weak_alias (__rawmemchr, rawmemchr)
+
+DIAG_POP_NEEDS_COMMENT;
diff --git a/string/test-memchr.c b/string/test-memchr.c
index 665edc32a..ce964284a 100644
--- a/string/test-memchr.c
+++ b/string/test-memchr.c
@@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res)
   CHAR *res = CALL (impl, s, c, n);
   if (res != exp_res)
     {
-      error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     res, exp_res);
+      error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p",
+             impl->name, s, c, n, res, exp_res);
       ret = 1;
       return;
     }
@@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     }
   buf[align + len] = 0;
 
-  if (pos < len)
+  if (pos < MIN(n, len))
     {
       buf[align + pos] = seek_char;
       buf[align + len] = -seek_char;
@@ -107,6 +107,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, n, result);
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+        do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR);
+        do_test (0, i, 751, i - buf_addr, BIG_CHAR);
+        do_test (0, i, 751, -buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR);
+        do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, 751, len - i, BIG_CHAR);
+          do_test (0, i, 751, len + i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, 751, ~len - i, BIG_CHAR);
+          do_test (0, i, 751, ~len + i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -221,6 +253,7 @@ test_main (void)
     do_test (page_size / 2 - i, i, i, 1, 0x9B);
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 7feababf4..a0255b962 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -25,6 +25,7 @@
 # define TEST_NAME "strcmp"
 #endif
 #include "test-string.h"
+#include <support/test-driver.h>
 
 #ifdef WIDE
 # include <wchar.h>
@@ -392,6 +393,32 @@ check2 (void)
 	}
 }
 
+static void
+check3 (void)
+{
+  size_t size = 0xd000 + 0x4000;
+  CHAR *s1, *s2;
+  CHAR *buffer1 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+  CHAR *buffer2 = mmap (NULL, size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+  if (buffer1 == MAP_FAILED || buffer1 == MAP_FAILED)
+    error (EXIT_UNSUPPORTED, errno, "mmap failed");
+
+  s1 = (CHAR *) (buffer1 + 0x8f8 / sizeof (CHAR));
+  s2 = (CHAR *) (buffer2 + 0xcff3 / sizeof (CHAR));
+
+  STRCPY(s1, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/util/PathDocFileFactory.java"));
+  STRCPY(s2, L("/export/redhat/rpms/BUILD/java-1.8.0-openjdk-1.8.0.312.b07-2.fc35.x86_64/openjdk/langtools/src/share/classes/com/sun/tools/doclets/internal/toolkit/taglets/ThrowsTaglet.java"));
+
+  int exp_result = SIMPLE_STRCMP (s1, s2);
+  FOR_EACH_IMPL (impl, 0)
+    check_result (impl, s1, s2, exp_result);
+
+  munmap ((void *) buffer1, size);
+  munmap ((void *) buffer2, size);
+}
+
 int
 test_main (void)
 {
@@ -400,6 +427,7 @@ test_main (void)
   test_init ();
   check();
   check2 ();
+  check3 ();
 
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
diff --git a/string/test-strncat.c b/string/test-strncat.c
index 2ef917b82..37ea26ea0 100644
--- a/string/test-strncat.c
+++ b/string/test-strncat.c
@@ -134,6 +134,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2,
     }
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  CHAR *s1, *s2;
+  uintptr_t s1_addr;
+  s1 = (CHAR *) buf1;
+  s2 = (CHAR *) buf2;
+  s1_addr = (uintptr_t)s1;
+ for (j = 0; j < 200; ++j)
+      s2[j] = 32 + 23 * j % (BIG_CHAR - 32);
+ s2[200] = 0;
+  for (i = 0; i < 750; ++i) {
+    for (j = 0; j < i; ++j)
+      s1[j] = 32 + 23 * j % (BIG_CHAR - 32);
+    s1[i] = '\0';
+
+       FOR_EACH_IMPL (impl, 0)
+    {
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, i - s1_addr);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, -s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i);
+      s2[200] = '\0';
+      do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i);
+    }
+
+    len = 0;
+    for (j = 8 * sizeof(size_t) - 1; j ; --j)
+      {
+        len |= one << j;
+        FOR_EACH_IMPL (impl, 0)
+          {
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, len - s1_addr + i);
+
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len + i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr - i);
+            s2[200] = '\0';
+            do_one_test (impl, s2, s1, ~len - s1_addr + i);
+          }
+      }
+  }
+}
+
 static void
 do_random_tests (void)
 {
@@ -316,6 +376,7 @@ test_main (void)
     }
 
   do_random_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 10b34de8d..56e23670a 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -435,6 +435,28 @@ check3 (void)
 	}
 }
 
+static void
+check4 (void)
+{
+  /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
+     the end of the page. 2) For there to be no mismatch/null byte before the
+     first page cross. 3) For length (`n`) to be large enough for one string to
+     cross the page. And 4) for there to be either mismatch/null bytes before
+     the start of the strings.  */
+
+  size_t size = 10;
+  size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
+  CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
+  CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
+  int exp_result;
+
+  STRCPY (s1, L ("tst-tlsmod%"));
+  STRCPY (s2, L ("tst-tls-manydynamic73mod"));
+  exp_result = SIMPLE_STRNCMP (s1, s2, size);
+  FOR_EACH_IMPL (impl, 0)
+  check_result (impl, s1, s2, size, exp_result);
+}
+
 int
 test_main (void)
 {
@@ -445,6 +467,7 @@ test_main (void)
   check1 ();
   check2 ();
   check3 ();
+  check4 ();
 
   printf ("%23s", "");
   FOR_EACH_IMPL (impl, 0)
diff --git a/string/test-strnlen.c b/string/test-strnlen.c
index 61eb521dc..0d9479d15 100644
--- a/string/test-strnlen.c
+++ b/string/test-strnlen.c
@@ -27,6 +27,7 @@
 
 #ifndef WIDE
 # define STRNLEN strnlen
+# define MEMSET memset
 # define CHAR char
 # define BIG_CHAR CHAR_MAX
 # define MIDDLE_CHAR 127
@@ -34,6 +35,7 @@
 #else
 # include <wchar.h>
 # define STRNLEN wcsnlen
+# define MEMSET wmemset
 # define CHAR wchar_t
 # define BIG_CHAR WCHAR_MAX
 # define MIDDLE_CHAR 1121
@@ -87,6 +89,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char)
     do_one_test (impl, (CHAR *) (buf + align), maxlen, MIN (len, maxlen));
 }
 
+static void
+do_overflow_tests (void)
+{
+  size_t i, j, len;
+  const size_t one = 1;
+  uintptr_t buf_addr = (uintptr_t) buf1;
+
+  for (i = 0; i < 750; ++i)
+    {
+      do_test (0, i, SIZE_MAX - i, BIG_CHAR);
+      do_test (0, i, i - buf_addr, BIG_CHAR);
+      do_test (0, i, -buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR);
+      do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR);
+
+      len = 0;
+      for (j = 8 * sizeof(size_t) - 1; j ; --j)
+        {
+          len |= one << j;
+          do_test (0, i, len - i, BIG_CHAR);
+          do_test (0, i, len + i, BIG_CHAR);
+          do_test (0, i, len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, len - buf_addr + i, BIG_CHAR);
+
+          do_test (0, i, ~len - i, BIG_CHAR);
+          do_test (0, i, ~len + i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr - i, BIG_CHAR);
+          do_test (0, i, ~len - buf_addr + i, BIG_CHAR);
+        }
+    }
+}
+
 static void
 do_random_tests (void)
 {
@@ -153,7 +187,7 @@ do_page_tests (void)
   size_t last_offset = (page_size / sizeof (CHAR)) - 1;
 
   CHAR *s = (CHAR *) buf2;
-  memset (s, 65, (last_offset - 1));
+  MEMSET (s, 65, (last_offset - 1));
   s[last_offset] = 0;
 
   /* Place short strings ending at page boundary.  */
@@ -196,6 +230,35 @@ do_page_tests (void)
     }
 }
 
+/* Tests meant to unveil fail on implementations that access bytes
+   beyond the maxium length.  */
+
+static void
+do_page_2_tests (void)
+{
+  size_t i, exp_len, offset;
+  size_t last_offset = page_size / sizeof (CHAR);
+
+  CHAR *s = (CHAR *) buf2;
+  MEMSET (s, 65, last_offset);
+
+  /* Place short strings ending at page boundary without the null
+     byte.  */
+  offset = last_offset;
+  for (i = 0; i < 128; i++)
+    {
+      /* Decrease offset to stress several sizes and alignments.  */
+      offset--;
+      exp_len = last_offset - offset;
+      FOR_EACH_IMPL (impl, 0)
+	{
+	  /* If an implementation goes beyond EXP_LEN, it will trigger
+	     the segfault.  */
+	  do_one_test (impl, (CHAR *) (s + offset), exp_len, exp_len);
+	}
+    }
+}
+
 int
 test_main (void)
 {
@@ -242,6 +305,8 @@ test_main (void)
 
   do_random_tests ();
   do_page_tests ();
+  do_page_2_tests ();
+  do_overflow_tests ();
   return ret;
 }
 
diff --git a/sunrpc/Makefile b/sunrpc/Makefile
index 976158540..a7cd3eafa 100644
--- a/sunrpc/Makefile
+++ b/sunrpc/Makefile
@@ -65,7 +65,8 @@ shared-only-routines = $(routines)
 endif
 
 tests = tst-xdrmem tst-xdrmem2 test-rpcent tst-udp-error tst-udp-timeout \
-  tst-udp-nonblocking
+  tst-udp-nonblocking tst-bug22542 tst-bug28768
+
 xtests := tst-getmyaddr
 
 ifeq ($(have-thread-library),yes)
@@ -111,6 +112,8 @@ $(objpfx)tst-udp-nonblocking: $(common-objpfx)linkobj/libc.so
 $(objpfx)tst-udp-garbage: \
   $(common-objpfx)linkobj/libc.so $(shared-thread-library)
 
+$(objpfx)tst-bug22542: $(common-objpfx)linkobj/libc.so
+
 else # !have-GLIBC_2.31
 
 routines = $(routines-for-nss)
diff --git a/sunrpc/clnt_gen.c b/sunrpc/clnt_gen.c
index 13ced8994..b44357cd8 100644
--- a/sunrpc/clnt_gen.c
+++ b/sunrpc/clnt_gen.c
@@ -57,9 +57,13 @@ clnt_create (const char *hostname, u_long prog, u_long vers,
 
   if (strcmp (proto, "unix") == 0)
     {
-      memset ((char *)&sun, 0, sizeof (sun));
-      sun.sun_family = AF_UNIX;
-      strcpy (sun.sun_path, hostname);
+      if (__sockaddr_un_set (&sun, hostname) < 0)
+	{
+	  struct rpc_createerr *ce = &get_rpc_createerr ();
+	  ce->cf_stat = RPC_SYSTEMERROR;
+	  ce->cf_error.re_errno = errno;
+	  return NULL;
+	}
       sock = RPC_ANYSOCK;
       client = clntunix_create (&sun, prog, vers, &sock, 0, 0);
       if (client == NULL)
diff --git a/sunrpc/svc_unix.c b/sunrpc/svc_unix.c
index 679fbe9cb..46f8d16fe 100644
--- a/sunrpc/svc_unix.c
+++ b/sunrpc/svc_unix.c
@@ -154,7 +154,10 @@ svcunix_create (int sock, u_int sendsize, u_int recvsize, char *path)
   SVCXPRT *xprt;
   struct unix_rendezvous *r;
   struct sockaddr_un addr;
-  socklen_t len = sizeof (struct sockaddr_in);
+  socklen_t len = sizeof (addr);
+
+  if (__sockaddr_un_set (&addr, path) < 0)
+    return NULL;
 
   if (sock == RPC_ANYSOCK)
     {
@@ -165,12 +168,6 @@ svcunix_create (int sock, u_int sendsize, u_int recvsize, char *path)
 	}
       madesock = TRUE;
     }
-  memset (&addr, '\0', sizeof (addr));
-  addr.sun_family = AF_UNIX;
-  len = strlen (path) + 1;
-  memcpy (addr.sun_path, path, len);
-  len += sizeof (addr.sun_family);
-
   __bind (sock, (struct sockaddr *) &addr, len);
 
   if (__getsockname (sock, (struct sockaddr *) &addr, &len) != 0
diff --git a/sunrpc/svcauth_des.c b/sunrpc/svcauth_des.c
index 7607abc81..25a85c909 100644
--- a/sunrpc/svcauth_des.c
+++ b/sunrpc/svcauth_des.c
@@ -58,7 +58,6 @@
 
 #define debug(msg)		/*printf("svcauth_des: %s\n", msg) */
 
-#define USEC_PER_SEC ((uint32_t) 1000000L)
 #define BEFORE(t1, t2) timercmp(t1, t2, <)
 
 /*
diff --git a/sunrpc/tst-bug22542.c b/sunrpc/tst-bug22542.c
new file mode 100644
index 000000000..d6cd79787
--- /dev/null
+++ b/sunrpc/tst-bug22542.c
@@ -0,0 +1,44 @@
+/* Test to verify that overlong hostname is rejected by clnt_create
+   and doesn't cause a buffer overflow (bug  22542).
+
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <rpc/clnt.h>
+#include <string.h>
+#include <support/check.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+static int
+do_test (void)
+{
+  /* Create an arbitrary hostname that's longer than fits in sun_path.  */
+  char name [sizeof ((struct sockaddr_un*)0)->sun_path * 2];
+  memset (name, 'x', sizeof name - 1);
+  name [sizeof name - 1] = '\0';
+
+  errno = 0;
+  CLIENT *clnt = clnt_create (name, 0, 0, "unix");
+
+  TEST_VERIFY (clnt == NULL);
+  TEST_COMPARE (errno, EINVAL);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sunrpc/tst-bug28768.c b/sunrpc/tst-bug28768.c
new file mode 100644
index 000000000..35a4b7b0b
--- /dev/null
+++ b/sunrpc/tst-bug28768.c
@@ -0,0 +1,42 @@
+/* Test to verify that long path is rejected by svcunix_create (bug 28768).
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <rpc/svc.h>
+#include <shlib-compat.h>
+#include <string.h>
+#include <support/check.h>
+
+/* svcunix_create does not have a default version in linkobj/libc.so.  */
+compat_symbol_reference (libc, svcunix_create, svcunix_create, GLIBC_2_1);
+
+static int
+do_test (void)
+{
+  char pathname[109];
+  memset (pathname, 'x', sizeof (pathname));
+  pathname[sizeof (pathname) - 1] = '\0';
+
+  errno = 0;
+  TEST_VERIFY (svcunix_create (RPC_ANYSOCK, 4096, 4096, pathname) == NULL);
+  TEST_COMPARE (errno, EINVAL);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/support/Makefile b/support/Makefile
index bb9889efb..f631846c0 100644
--- a/support/Makefile
+++ b/support/Makefile
@@ -67,6 +67,8 @@ libsupport-routines = \
   support_quote_string \
   support_record_failure \
   support_run_diff \
+  support_select_modifies_timeout \
+  support_select_normalizes_timeout \
   support_set_small_thread_stack_size \
   support_shared_allocate \
   support_small_stack_thread_attribute \
@@ -90,6 +92,7 @@ libsupport-routines = \
   xchdir \
   xchroot \
   xclock_gettime \
+  xclone \
   xclose \
   xchmod \
   xconnect \
@@ -139,6 +142,7 @@ libsupport-routines = \
   xpthread_join \
   xpthread_key_create \
   xpthread_key_delete \
+  xpthread_kill \
   xpthread_mutex_consistent \
   xpthread_mutex_destroy \
   xpthread_mutex_init \
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
index 8969d4a99..4be430f09 100644
--- a/support/capture_subprocess.h
+++ b/support/capture_subprocess.h
@@ -41,6 +41,12 @@ struct support_capture_subprocess support_capture_subprocess
 struct support_capture_subprocess support_capture_subprogram
   (const char *file, char *const argv[]);
 
+/* Copy the running program into a setgid binary and run it with CHILD_ID
+   argument.  If execution is successful, return the exit status of the child
+   program, otherwise return a non-zero failure exit code.  */
+int support_capture_subprogram_self_sgid
+  (char *child_id);
+
 /* Deallocate the subprocess data captured by
    support_capture_subprocess.  */
 void support_capture_subprocess_free (struct support_capture_subprocess *);
diff --git a/support/subprocess.h b/support/subprocess.h
index 11cfc6a07..40d82c7e4 100644
--- a/support/subprocess.h
+++ b/support/subprocess.h
@@ -38,6 +38,11 @@ struct support_subprocess support_subprocess
 struct support_subprocess support_subprogram
   (const char *file, char *const argv[]);
 
+/* Invoke program FILE with ARGV arguments by using posix_spawn and wait for it
+   to complete.  Return program exit status.  */
+int support_subprogram_wait
+  (const char *file, char *const argv[]);
+
 /* Wait for the subprocess indicated by PROC::PID.  Return the status
    indicate by waitpid call.  */
 int support_process_wait (struct support_subprocess *proc);
diff --git a/support/support.h b/support/support.h
index 9cbc45572..8c7890e0a 100644
--- a/support/support.h
+++ b/support/support.h
@@ -23,6 +23,7 @@
 #ifndef SUPPORT_H
 #define SUPPORT_H
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <sys/cdefs.h>
 /* For mode_t.  */
@@ -129,6 +130,14 @@ extern void support_copy_file (const char *from, const char *to);
 extern ssize_t support_copy_file_range (int, off64_t *, int, off64_t *,
 					size_t, unsigned int);
 
+/* Return true if select modify the timeout to reflect the amount of time
+   no slept.  */
+extern bool support_select_modifies_timeout (void);
+
+/* Return true if select normalize the timeout input by taking in account
+   tv_usec larger than 1000000.  */
+extern bool support_select_normalizes_timeout (void);
+
 __END_DECLS
 
 #endif /* SUPPORT_H */
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index a7afa0e70..27bfd19c9 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -20,11 +20,14 @@
 #include <support/capture_subprocess.h>
 
 #include <errno.h>
+#include <fcntl.h>
 #include <stdlib.h>
 #include <support/check.h>
 #include <support/xunistd.h>
 #include <support/xsocket.h>
 #include <support/xspawn.h>
+#include <support/support.h>
+#include <support/test-driver.h>
 
 static void
 transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream)
@@ -36,7 +39,7 @@ transfer (const char *what, struct pollfd *pfd, struct xmemstream *stream)
       if (ret < 0)
         {
           support_record_failure ();
-          printf ("error: reading from subprocess %s: %m", what);
+          printf ("error: reading from subprocess %s: %m\n", what);
           pfd->events = 0;
           pfd->revents = 0;
         }
@@ -102,6 +105,129 @@ support_capture_subprogram (const char *file, char *const argv[])
   return result;
 }
 
+/* Copies the executable into a restricted directory, so that we can
+   safely make it SGID with the TARGET group ID.  Then runs the
+   executable.  */
+static int
+copy_and_spawn_sgid (char *child_id, gid_t gid)
+{
+  char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
+			     test_dir, (intmax_t) getpid ());
+  char *execname = xasprintf ("%s/bin", dirname);
+  int infd = -1;
+  int outfd = -1;
+  int ret = 1, status = 1;
+
+  TEST_VERIFY (mkdir (dirname, 0700) == 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+
+  infd = open ("/proc/self/exe", O_RDONLY);
+  if (infd < 0)
+    FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
+
+  outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
+  TEST_VERIFY (outfd >= 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+
+  char buf[4096];
+  for (;;)
+    {
+      ssize_t rdcount = read (infd, buf, sizeof (buf));
+      TEST_VERIFY (rdcount >= 0);
+      if (support_record_failure_is_failed ())
+	goto err;
+      if (rdcount == 0)
+	break;
+      char *p = buf;
+      char *end = buf + rdcount;
+      while (p != end)
+	{
+	  ssize_t wrcount = write (outfd, buf, end - p);
+	  if (wrcount == 0)
+	    errno = ENOSPC;
+	  TEST_VERIFY (wrcount > 0);
+	  if (support_record_failure_is_failed ())
+	    goto err;
+	  p += wrcount;
+	}
+    }
+  TEST_VERIFY (fchown (outfd, getuid (), gid) == 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+  TEST_VERIFY (fchmod (outfd, 02750) == 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+  TEST_VERIFY (close (outfd) == 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+  TEST_VERIFY (close (infd) == 0);
+  if (support_record_failure_is_failed ())
+    goto err;
+
+  /* We have the binary, now spawn the subprocess.  Avoid using
+     support_subprogram because we only want the program exit status, not the
+     contents.  */
+  ret = 0;
+
+  char * const args[] = {execname, child_id, NULL};
+
+  status = support_subprogram_wait (args[0], args);
+
+err:
+  if (outfd >= 0)
+    close (outfd);
+  if (infd >= 0)
+    close (infd);
+  if (execname != NULL)
+    {
+      unlink (execname);
+      free (execname);
+    }
+  if (dirname != NULL)
+    {
+      rmdir (dirname);
+      free (dirname);
+    }
+
+  if (ret != 0)
+    FAIL_EXIT1("Failed to make sgid executable for test\n");
+
+  return status;
+}
+
+int
+support_capture_subprogram_self_sgid (char *child_id)
+{
+  gid_t target = 0;
+  const int count = 64;
+  gid_t groups[count];
+
+  /* Get a GID which is not our current GID, but is present in the
+     supplementary group list.  */
+  int ret = getgroups (count, groups);
+  if (ret < 0)
+    FAIL_UNSUPPORTED("Could not get group list for user %jd\n",
+		     (intmax_t) getuid ());
+
+  gid_t current = getgid ();
+  for (int i = 0; i < ret; ++i)
+    {
+      if (groups[i] != current)
+	{
+	  target = groups[i];
+	  break;
+	}
+    }
+
+  if (target == 0)
+    FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
+		     (intmax_t) getuid ());
+
+  return copy_and_spawn_sgid (child_id, target);
+}
+
 void
 support_capture_subprocess_free (struct support_capture_subprocess *p)
 {
diff --git a/support/support_select_modifies_timeout.c b/support/support_select_modifies_timeout.c
new file mode 100644
index 000000000..653ea2cc9
--- /dev/null
+++ b/support/support_select_modifies_timeout.c
@@ -0,0 +1,29 @@
+/* Return whether select modifies the timeout.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+
+bool
+support_select_modifies_timeout (void)
+{
+#ifdef __linux__
+  return true;
+#else
+  return false;
+#endif
+}
diff --git a/support/support_select_normalizes_timeout.c b/support/support_select_normalizes_timeout.c
new file mode 100644
index 000000000..987f9b035
--- /dev/null
+++ b/support/support_select_normalizes_timeout.c
@@ -0,0 +1,29 @@
+/* Return whether select normalizes the timeout.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/support.h>
+
+bool
+support_select_normalizes_timeout (void)
+{
+#ifdef __linux__
+  return true;
+#else
+  return false;
+#endif
+}
diff --git a/support/support_subprocess.c b/support/support_subprocess.c
index 88489a335..89e767ae4 100644
--- a/support/support_subprocess.c
+++ b/support/support_subprocess.c
@@ -27,7 +27,7 @@
 #include <support/subprocess.h>
 
 static struct support_subprocess
-support_suprocess_init (void)
+support_subprocess_init (void)
 {
   struct support_subprocess result;
 
@@ -48,7 +48,7 @@ support_suprocess_init (void)
 struct support_subprocess
 support_subprocess (void (*callback) (void *), void *closure)
 {
-  struct support_subprocess result = support_suprocess_init ();
+  struct support_subprocess result = support_subprocess_init ();
 
   result.pid = xfork ();
   if (result.pid == 0)
@@ -71,7 +71,7 @@ support_subprocess (void (*callback) (void *), void *closure)
 struct support_subprocess
 support_subprogram (const char *file, char *const argv[])
 {
-  struct support_subprocess result = support_suprocess_init ();
+  struct support_subprocess result = support_subprocess_init ();
 
   posix_spawn_file_actions_t fa;
   /* posix_spawn_file_actions_init does not fail.  */
@@ -84,7 +84,7 @@ support_subprogram (const char *file, char *const argv[])
   xposix_spawn_file_actions_addclose (&fa, result.stdout_pipe[1]);
   xposix_spawn_file_actions_addclose (&fa, result.stderr_pipe[1]);
 
-  result.pid = xposix_spawn (file, &fa, NULL, argv, NULL);
+  result.pid = xposix_spawn (file, &fa, NULL, argv, environ);
 
   xclose (result.stdout_pipe[1]);
   xclose (result.stderr_pipe[1]);
@@ -92,6 +92,19 @@ support_subprogram (const char *file, char *const argv[])
   return result;
 }
 
+int
+support_subprogram_wait (const char *file, char *const argv[])
+{
+  posix_spawn_file_actions_t fa;
+
+  posix_spawn_file_actions_init (&fa);
+  struct support_subprocess res = support_subprocess_init ();
+
+  res.pid = xposix_spawn (file, &fa, NULL, argv, environ);
+
+  return support_process_wait (&res);
+}
+
 int
 support_process_wait (struct support_subprocess *proc)
 {
diff --git a/support/temp_file.c b/support/temp_file.c
index c6df64187..e41128c2d 100644
--- a/support/temp_file.c
+++ b/support/temp_file.c
@@ -1,5 +1,6 @@
 /* Temporary file handling for tests.
-   Copyright (C) 1998-2021 Free Software Foundation, Inc.
+   Copyright (C) 1998-2022 Free Software Foundation, Inc.
+   Copyright The GNU Tools Authors.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,15 +21,17 @@
    some 32-bit platforms. */
 #define _FILE_OFFSET_BITS 64
 
+#include <support/check.h>
 #include <support/temp_file.h>
 #include <support/temp_file-internal.h>
 #include <support/support.h>
 
+#include <errno.h>
 #include <paths.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <unistd.h>
+#include <xunistd.h>
 
 /* List of temporary files.  */
 static struct temp_name_list
@@ -36,14 +39,20 @@ static struct temp_name_list
   struct temp_name_list *next;
   char *name;
   pid_t owner;
+  bool toolong;
 } *temp_name_list;
 
 /* Location of the temporary files.  Set by the test skeleton via
    support_set_test_dir.  The string is not be freed.  */
 static const char *test_dir = _PATH_TMP;
 
-void
-add_temp_file (const char *name)
+/* Name of subdirectories in a too long temporary directory tree.  */
+static char toolong_subdir[NAME_MAX + 1];
+static bool toolong_initialized;
+static size_t toolong_path_max;
+
+static void
+add_temp_file_internal (const char *name, bool toolong)
 {
   struct temp_name_list *newp
     = (struct temp_name_list *) xcalloc (sizeof (*newp), 1);
@@ -53,12 +62,19 @@ add_temp_file (const char *name)
       newp->name = newname;
       newp->next = temp_name_list;
       newp->owner = getpid ();
+      newp->toolong = toolong;
       temp_name_list = newp;
     }
   else
     free (newp);
 }
 
+void
+add_temp_file (const char *name)
+{
+  add_temp_file_internal (name, false);
+}
+
 int
 create_temp_file_in_dir (const char *base, const char *dir, char **filename)
 {
@@ -90,8 +106,8 @@ create_temp_file (const char *base, char **filename)
   return create_temp_file_in_dir (base, test_dir, filename);
 }
 
-char *
-support_create_temp_directory (const char *base)
+static char *
+create_temp_directory_internal (const char *base, bool toolong)
 {
   char *path = xasprintf ("%s/%sXXXXXX", test_dir, base);
   if (mkdtemp (path) == NULL)
@@ -99,16 +115,132 @@ support_create_temp_directory (const char *base)
       printf ("error: mkdtemp (\"%s\"): %m", path);
       exit (1);
     }
-  add_temp_file (path);
+  add_temp_file_internal (path, toolong);
   return path;
 }
 
-/* Helper functions called by the test skeleton follow.  */
+char *
+support_create_temp_directory (const char *base)
+{
+  return create_temp_directory_internal (base, false);
+}
+
+static void
+ensure_toolong_initialized (void)
+{
+  if (!toolong_initialized)
+    FAIL_EXIT1 ("uninitialized toolong directory tree\n");
+}
+
+static void
+initialize_toolong (const char *base)
+{
+  long name_max = pathconf (base, _PC_NAME_MAX);
+  name_max = (name_max < 0 ? 64
+	      : (name_max < sizeof (toolong_subdir) ? name_max
+		 : sizeof (toolong_subdir) - 1));
+
+  long path_max = pathconf (base, _PC_PATH_MAX);
+  path_max = (path_max < 0 ? 1024
+	      : path_max <= PTRDIFF_MAX ? path_max : PTRDIFF_MAX);
+
+  /* Sanity check to ensure that the test does not create temporary directories
+     in different filesystems because this API doesn't support it.  */
+  if (toolong_initialized)
+    {
+      if (name_max != strlen (toolong_subdir))
+	FAIL_UNSUPPORTED ("name_max: Temporary directories in different"
+			  " filesystems not supported yet\n");
+      if (path_max != toolong_path_max)
+	FAIL_UNSUPPORTED ("path_max: Temporary directories in different"
+			  " filesystems not supported yet\n");
+      return;
+    }
+
+  toolong_path_max = path_max;
+
+  size_t len = name_max;
+  memset (toolong_subdir, 'X', len);
+  toolong_initialized = true;
+}
+
+char *
+support_create_and_chdir_toolong_temp_directory (const char *basename)
+{
+  char *base = create_temp_directory_internal (basename, true);
+  xchdir (base);
+
+  initialize_toolong (base);
+
+  size_t sz = strlen (toolong_subdir);
+
+  /* Create directories and descend into them so that the final path is larger
+     than PATH_MAX.  */
+  for (size_t i = 0; i <= toolong_path_max / sz; i++)
+    {
+      int ret = mkdir (toolong_subdir, S_IRWXU);
+      if (ret != 0 && errno == ENAMETOOLONG)
+	FAIL_UNSUPPORTED ("Filesystem does not support creating too long "
+			  "directory trees\n");
+      else if (ret != 0)
+	FAIL_EXIT1 ("Failed to create directory tree: %m\n");
+      xchdir (toolong_subdir);
+    }
+  return base;
+}
 
 void
-support_set_test_dir (const char *path)
+support_chdir_toolong_temp_directory (const char *base)
 {
-  test_dir = path;
+  ensure_toolong_initialized ();
+
+  xchdir (base);
+
+  size_t sz = strlen (toolong_subdir);
+  for (size_t i = 0; i <= toolong_path_max / sz; i++)
+    xchdir (toolong_subdir);
+}
+
+/* Helper functions called by the test skeleton follow.  */
+
+static void
+remove_toolong_subdirs (const char *base)
+{
+  ensure_toolong_initialized ();
+
+  if (chdir (base) != 0)
+    {
+      printf ("warning: toolong cleanup base failed: chdir (\"%s\"): %m\n",
+	      base);
+      return;
+    }
+
+  /* Descend.  */
+  int levels = 0;
+  size_t sz = strlen (toolong_subdir);
+  for (levels = 0; levels <= toolong_path_max / sz; levels++)
+    if (chdir (toolong_subdir) != 0)
+      {
+	printf ("warning: toolong cleanup failed: chdir (\"%s\"): %m\n",
+		toolong_subdir);
+	break;
+      }
+
+  /* Ascend and remove.  */
+  while (--levels >= 0)
+    {
+      if (chdir ("..") != 0)
+	{
+	  printf ("warning: toolong cleanup failed: chdir (\"..\"): %m\n");
+	  return;
+	}
+      if (remove (toolong_subdir) != 0)
+	{
+	  printf ("warning: could not remove subdirectory: %s: %m\n",
+		  toolong_subdir);
+	  return;
+	}
+    }
 }
 
 void
@@ -123,6 +255,9 @@ support_delete_temp_files (void)
 	 around, to prevent PID reuse.)  */
       if (temp_name_list->owner == pid)
 	{
+	  if (temp_name_list->toolong)
+	    remove_toolong_subdirs (temp_name_list->name);
+
 	  if (remove (temp_name_list->name) != 0)
 	    printf ("warning: could not remove temporary file: %s: %m\n",
 		    temp_name_list->name);
@@ -147,3 +282,9 @@ support_print_temp_files (FILE *f)
       fprintf (f, ")\n");
     }
 }
+
+void
+support_set_test_dir (const char *path)
+{
+  test_dir = path;
+}
diff --git a/support/temp_file.h b/support/temp_file.h
index f3a7fb6f9..a22964c6f 100644
--- a/support/temp_file.h
+++ b/support/temp_file.h
@@ -44,6 +44,15 @@ int create_temp_file_in_dir (const char *base, const char *dir,
    returns.  The caller should free this string.  */
 char *support_create_temp_directory (const char *base);
 
+/* Create a temporary directory tree that is longer than PATH_MAX and schedule
+   it for deletion.  BASENAME is used as a prefix for the unique directory
+   name, which the function returns.  The caller should free this string.  */
+char *support_create_and_chdir_toolong_temp_directory (const char *basename);
+
+/* Change into the innermost directory of the directory tree BASE, which was
+   created using support_create_and_chdir_toolong_temp_directory.  */
+void support_chdir_toolong_temp_directory (const char *base);
+
 __END_DECLS
 
 #endif /* SUPPORT_TEMP_FILE_H */
diff --git a/support/test-container.c b/support/test-container.c
index 28cc44d9f..94498d390 100644
--- a/support/test-container.c
+++ b/support/test-container.c
@@ -481,7 +481,7 @@ need_sync (char *ap, char *bp, struct stat *a, struct stat *b)
 }
 
 static void
-rsync_1 (path_buf * src, path_buf * dest, int and_delete)
+rsync_1 (path_buf * src, path_buf * dest, int and_delete, int force_copies)
 {
   DIR *dir;
   struct dirent *de;
@@ -491,8 +491,9 @@ rsync_1 (path_buf * src, path_buf * dest, int and_delete)
   r_append ("/", dest);
 
   if (verbose)
-    printf ("sync %s to %s %s\n", src->buf, dest->buf,
-	    and_delete ? "and delete" : "");
+    printf ("sync %s to %s%s%s\n", src->buf, dest->buf,
+	    and_delete ? " and delete" : "",
+	    force_copies ? " (forced)" : "");
 
   size_t staillen = src->len;
 
@@ -521,10 +522,10 @@ rsync_1 (path_buf * src, path_buf * dest, int and_delete)
 	 missing.  */
       lstat (dest->buf, &d);
 
-      if (! need_sync (src->buf, dest->buf, &s, &d))
+      if (! force_copies && ! need_sync (src->buf, dest->buf, &s, &d))
 	{
 	  if (S_ISDIR (s.st_mode))
-	    rsync_1 (src, dest, and_delete);
+	    rsync_1 (src, dest, and_delete, force_copies);
 	  continue;
 	}
 
@@ -559,7 +560,7 @@ rsync_1 (path_buf * src, path_buf * dest, int and_delete)
 	  if (verbose)
 	    printf ("+D %s\n", dest->buf);
 	  maybe_xmkdir (dest->buf, (s.st_mode & 0777) | 0700);
-	  rsync_1 (src, dest, and_delete);
+	  rsync_1 (src, dest, and_delete, force_copies);
 	  break;
 
 	case S_IFLNK:
@@ -639,12 +640,12 @@ rsync_1 (path_buf * src, path_buf * dest, int and_delete)
 }
 
 static void
-rsync (char *src, char *dest, int and_delete)
+rsync (char *src, char *dest, int and_delete, int force_copies)
 {
   r_setup (src, &spath);
   r_setup (dest, &dpath);
 
-  rsync_1 (&spath, &dpath, and_delete);
+  rsync_1 (&spath, &dpath, and_delete, force_copies);
 }
 
 
@@ -846,11 +847,11 @@ main (int argc, char **argv)
     do_ldconfig = true;
 
   rsync (pristine_root_path, new_root_path,
-	 file_exists (concat (command_root, "/preclean.req", NULL)));
+	 file_exists (concat (command_root, "/preclean.req", NULL)), 0);
 
   if (stat (command_root, &st) >= 0
       && S_ISDIR (st.st_mode))
-    rsync (command_root, new_root_path, 0);
+    rsync (command_root, new_root_path, 0, 1);
 
   new_objdir_path = xstrdup (concat (new_root_path,
 				    support_objdir_root, NULL));
@@ -1044,7 +1045,7 @@ main (int argc, char **argv)
 
 	  /* Child has exited, we can post-clean the test root.  */
 	  printf("running post-clean rsync\n");
-	  rsync (pristine_root_path, new_root_path, 1);
+	  rsync (pristine_root_path, new_root_path, 1, 0);
 
 	  if (WIFEXITED (status))
 	    exit (WEXITSTATUS (status));
diff --git a/support/xclone.c b/support/xclone.c
new file mode 100644
index 000000000..243eee8b2
--- /dev/null
+++ b/support/xclone.c
@@ -0,0 +1,49 @@
+/* Auxiliary functions to issue the clone syscall.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef __linux__
+# include <support/check.h>
+# include <stackinfo.h>  /* For _STACK_GROWS_{UP,DOWN}.  */
+# include <xsched.h>
+
+pid_t
+xclone (int (*fn) (void *arg), void *arg, void *stack, size_t stack_size,
+	int flags)
+{
+  pid_t r = -1;
+
+# ifdef __ia64__
+  extern int __clone2 (int (*fn) (void *arg), void *stack, size_t stack_size,
+		       int flags, void *arg, ...);
+  r = __clone2 (fn, stack, stack_size, flags, arg, /* ptid */ NULL,
+		/* tls */ NULL, /* ctid  */ NULL);
+# else
+#  if _STACK_GROWS_DOWN
+  r = clone (fn, stack + stack_size, flags, arg, /* ptid */ NULL,
+	     /* tls */ NULL, /* ctid */  NULL);
+#  elif _STACK_GROWS_UP
+  r = clone (fn, stack, flags, arg, /* ptid */ NULL, /* tls */ NULL, NULL);
+#  endif
+# endif
+
+  if (r < 0)
+    FAIL_EXIT1 ("clone: %m");
+
+  return r;
+}
+#endif
diff --git a/support/xpthread_kill.c b/support/xpthread_kill.c
new file mode 100644
index 000000000..111a75d85
--- /dev/null
+++ b/support/xpthread_kill.c
@@ -0,0 +1,26 @@
+/* pthread_kill with error checking.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <signal.h>
+#include <support/xthread.h>
+
+void
+xpthread_kill (pthread_t thr, int signo)
+{
+  xpthread_check_return ("pthread_kill", pthread_kill (thr, signo));
+}
diff --git a/support/xsched.h b/support/xsched.h
new file mode 100644
index 000000000..eefd73194
--- /dev/null
+++ b/support/xsched.h
@@ -0,0 +1,34 @@
+/* Wrapper for sched.h functions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef SUPPORT_XSCHED_H
+#define SUPPORT_XSCHED_H
+
+__BEGIN_DECLS
+
+#include <sched.h>
+#include <sys/types.h>
+
+#ifdef __linux__
+pid_t xclone (int (*fn) (void *arg), void *arg, void *stack,
+	      size_t stack_size, int flags);
+#endif
+
+__END_DECLS
+
+#endif
diff --git a/support/xthread.h b/support/xthread.h
index c2086db34..1ba3f133a 100644
--- a/support/xthread.h
+++ b/support/xthread.h
@@ -75,6 +75,8 @@ void xpthread_attr_setstacksize (pthread_attr_t *attr,
 void xpthread_attr_setguardsize (pthread_attr_t *attr,
 				 size_t guardsize);
 
+void xpthread_kill (pthread_t thr, int signo);
+
 /* Return the stack size used on support_set_small_thread_stack_size.  */
 size_t support_small_thread_stack_size (void);
 /* Set the stack size in ATTR to a small value, but still large enough
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index aab7245e9..d552a7886 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -893,6 +893,17 @@ extern int _dl_catch_error (const char **objname, const char **errstring,
 			    void *args);
 libc_hidden_proto (_dl_catch_error)
 
+
+/* libdl in a secondary namespace (after dlopen) must use
+   _dl_catch_error from the main namespace, so it has to be exported
+   in some way.  Initialized to _rtld_catch_error in rtld.c.  Not in
+   _rtld_global_ro to preserve structure layout.  */
+extern __typeof (_dl_catch_error) *_dl_catch_error_ptr attribute_relro;
+rtld_hidden_proto (_dl_catch_error_ptr)
+
+/* Used for initializing _dl_catch_error_ptr.  */
+extern __typeof__ (_dl_catch_error) _rtld_catch_error attribute_hidden;
+
 /* Call OPERATE (ARGS).  If no error occurs, set *EXCEPTION to zero.
    Otherwise, store a copy of the raised exception in *EXCEPTION,
    which has to be freed by _dl_exception_free.  As a special case, if
diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
index 62ef68b62..cd4f77c0e 100644
--- a/sysdeps/hppa/dl-fptr.c
+++ b/sysdeps/hppa/dl-fptr.c
@@ -26,6 +26,7 @@
 #include <ldsodefs.h>
 #include <elf/dynamic-link.h>
 #include <dl-fptr.h>
+#include <dl-runtime.h>
 #include <dl-unmap-segments.h>
 #include <atomic.h>
 #include <libc-pointer-arith.h>
@@ -351,21 +352,20 @@ _dl_lookup_address (const void *address)
 {
   ElfW(Addr) addr = (ElfW(Addr)) address;
   ElfW(Word) reloc_arg;
-  volatile unsigned int *desc;
-  unsigned int *gptr;
+  unsigned int *desc, *gptr;
 
   /* Return ADDR if the least-significant two bits of ADDR are not consistent
      with ADDR being a linker defined function pointer.  The normal value for
      a code address in a backtrace is 3.  */
-  if (((unsigned int) addr & 3) != 2)
+  if (((uintptr_t) addr & 3) != 2)
     return addr;
 
   /* Handle special case where ADDR points to page 0.  */
-  if ((unsigned int) addr < 4096)
+  if ((uintptr_t) addr < 4096)
     return addr;
 
   /* Clear least-significant two bits from descriptor address.  */
-  desc = (unsigned int *) ((unsigned int) addr & ~3);
+  desc = (unsigned int *) ((uintptr_t) addr & ~3);
   if (!_dl_read_access_allowed (desc))
     return addr;
 
@@ -376,7 +376,7 @@ _dl_lookup_address (const void *address)
   /* Then load first word of candidate descriptor.  It should be a pointer
      with word alignment and point to memory that can be read.  */
   gptr = (unsigned int *) desc[0];
-  if (((unsigned int) gptr & 3) != 0
+  if (((uintptr_t) gptr & 3) != 0
       || !_dl_read_access_allowed (gptr))
     return addr;
 
@@ -400,10 +400,11 @@ _dl_lookup_address (const void *address)
 
       /* If gp has been resolved, we need to hunt for relocation offset.  */
       if (!(reloc_arg & PA_GP_RELOC))
-	reloc_arg = _dl_fix_reloc_arg (addr, l);
+	reloc_arg = _dl_fix_reloc_arg ((struct fdesc *) addr, l);
 
       _dl_fixup (l, reloc_arg);
     }
 
   return (ElfW(Addr)) desc[0];
 }
+rtld_hidden_def (_dl_lookup_address)
diff --git a/sysdeps/hppa/dl-lookupcfg.h b/sysdeps/hppa/dl-lookupcfg.h
index 29d994bfc..34456aae4 100644
--- a/sysdeps/hppa/dl-lookupcfg.h
+++ b/sysdeps/hppa/dl-lookupcfg.h
@@ -30,6 +30,7 @@ rtld_hidden_proto (_dl_symbol_address)
 #define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref)
 
 Elf32_Addr _dl_lookup_address (const void *address);
+rtld_hidden_proto (_dl_lookup_address)
 
 #define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address ((const void *) addr)
 
diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h
index f048fd207..24f0f47d8 100644
--- a/sysdeps/hppa/dl-machine.h
+++ b/sysdeps/hppa/dl-machine.h
@@ -27,6 +27,7 @@
 #include <string.h>
 #include <link.h>
 #include <errno.h>
+#include <ldsodefs.h>
 #include <dl-fptr.h>
 #include <abort-instr.h>
 #include <tls.h>
@@ -159,6 +160,24 @@ elf_machine_plt_value (struct link_map *map, const Elf32_Rela *reloc,
   return (struct fdesc) { value.ip + reloc->r_addend, value.gp };
 }
 
+static inline struct link_map *
+elf_machine_main_map (void)
+{
+  struct link_map *main_map;
+
+#if defined SHARED && IS_IN (rtld)
+  asm (
+"	bl	1f,%0\n"
+"	addil	L'_rtld_local - ($PIC_pcrel$0 - 1),%0\n"
+"1:	ldw	R'_rtld_local - ($PIC_pcrel$0 - 5)(%%r1),%0\n"
+   : "=r" (main_map) : : "r1");
+#else
+  main_map = NULL;
+#endif
+
+  return main_map;
+}
+
 /* Set up the loaded object described by L so its unrelocated PLT
    entries will jump to the on-demand fixup code in dl-runtime.c.  */
 
@@ -174,6 +193,15 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
     Elf32_Addr i[2];
   } sig = {{0x00,0xc0,0xff,0xee, 0xde,0xad,0xbe,0xef}};
 
+  /* Initialize dp register for main executable.  */
+  if (l == elf_machine_main_map ())
+    {
+      register Elf32_Addr dp asm ("%r27");
+
+      dp = D_PTR (l, l_info[DT_PLTGOT]);
+      asm volatile ("" : : "r" (dp));
+    }
+
   /* If we don't have a PLT we can just skip all this... */
   if (__builtin_expect (l->l_info[DT_JMPREL] == NULL,0))
     return lazy;
@@ -336,16 +364,6 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
    its return value is the user program's entry point.  */
 
 #define RTLD_START \
-/* Set up dp for any non-PIC lib constructors that may be called.  */	\
-static struct link_map * __attribute__((used))				\
-set_dp (struct link_map *map)						\
-{									\
-  register Elf32_Addr dp asm ("%r27");					\
-  dp = D_PTR (map, l_info[DT_PLTGOT]);					\
-  asm volatile ("" : : "r" (dp));					\
-  return map;								\
-}									\
-									\
 asm (									\
 "	.text\n"							\
 "	.globl _start\n"						\
@@ -445,14 +463,11 @@ asm (									\
 "	stw	%r24,-44(%sp)\n"					\
 									\
 ".Lnofix:\n"								\
+	/* Call _dl_init(main_map, argc, argv, envp). */		\
 "	addil	LT'_rtld_local,%r19\n"					\
 "	ldw	RT'_rtld_local(%r1),%r26\n"				\
-"	bl	set_dp, %r2\n"						\
 "	ldw	0(%r26),%r26\n"						\
 									\
-	/* Call _dl_init(_dl_loaded, argc, argv, envp). */		\
-"	copy	%r28,%r26\n"						\
-									\
 	/* envp = argv + argc + 1 */					\
 "	sh2add	%r25,%r24,%r23\n"					\
 "	bl	_dl_init,%r2\n"						\
diff --git a/sysdeps/hppa/dl-runtime.c b/sysdeps/hppa/dl-runtime.c
index e7fbb7417..a71b5b201 100644
--- a/sysdeps/hppa/dl-runtime.c
+++ b/sysdeps/hppa/dl-runtime.c
@@ -25,8 +25,7 @@
    return that to the caller.  The caller will continue on to call
    _dl_fixup with the relocation offset.  */
 
-ElfW(Word)
-attribute_hidden __attribute ((noinline)) ARCH_FIXUP_ATTRIBUTE
+ElfW(Word) __attribute ((noinline)) ARCH_FIXUP_ATTRIBUTE
 _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
 {
   Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type;
@@ -52,3 +51,4 @@ _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
   ABORT_INSTRUCTION;
   return 0;
 }
+rtld_hidden_def (_dl_fix_reloc_arg)
diff --git a/sysdeps/hppa/dl-runtime.h b/sysdeps/hppa/dl-runtime.h
index 5d6ee53b0..9913539b5 100644
--- a/sysdeps/hppa/dl-runtime.h
+++ b/sysdeps/hppa/dl-runtime.h
@@ -17,6 +17,9 @@
    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
    02111-1307 USA.  */
 
+ElfW(Word) _dl_fix_reloc_arg (struct fdesc *, struct link_map *);
+rtld_hidden_proto (_dl_fix_reloc_arg)
+
 /* Clear PA_GP_RELOC bit in relocation offset.  */
 static inline uintptr_t
 reloc_offset (uintptr_t plt0, uintptr_t pltn)
diff --git a/sysdeps/mach/hurd/if_index.c b/sysdeps/mach/hurd/if_index.c
index 56e63a4a9..5e566da82 100644
--- a/sysdeps/mach/hurd/if_index.c
+++ b/sysdeps/mach/hurd/if_index.c
@@ -32,7 +32,7 @@ unsigned int
 __if_nametoindex (const char *ifname)
 {
   struct ifreq ifr;
-  int fd = __opensock ();
+  int fd = __socket (AF_INET, SOCK_DGRAM, 0);
 
   if (fd < 0)
     return 0;
@@ -84,7 +84,7 @@ __if_nameindex (void)
   error_t err = 0;
   char data[2048];
   file_t server;
-  int fd = __opensock ();
+  int fd = __socket (AF_INET, SOCK_DGRAM, 0);
   struct ifconf ifc;
   unsigned int nifs, i;
   struct if_nameindex *idx = NULL;
@@ -169,7 +169,7 @@ char *
 __if_indextoname (unsigned int ifindex, char *ifname)
 {
   struct ifreq ifr;
-  int fd = __opensock ();
+  int fd = __socket (AF_INET, SOCK_DGRAM, 0);
 
   if (fd < 0)
     return NULL;
diff --git a/sysdeps/nios2/libm-test-ulps b/sysdeps/nios2/libm-test-ulps
index 8c3e9df54..9315ba82f 100644
--- a/sysdeps/nios2/libm-test-ulps
+++ b/sysdeps/nios2/libm-test-ulps
@@ -12,7 +12,7 @@ Function: "asin":
 float: 1
 
 Function: "asinh":
-double: 1
+double: 2
 float: 2
 
 Function: "atan":
@@ -80,7 +80,7 @@ double: 1
 float: 1
 
 Function: "cbrt":
-double: 3
+double: 4
 float: 1
 
 Function: Real part of "ccos":
@@ -127,7 +127,7 @@ double: 1
 float: 1
 
 Function: "cosh":
-double: 1
+double: 2
 float: 2
 
 Function: Real part of "cpow":
@@ -177,10 +177,11 @@ double: 1
 float: 1
 
 Function: "erfc":
-double: 3
+double: 5
 float: 3
 
 Function: "exp":
+double: 1
 float: 1
 
 Function: "exp10":
@@ -256,7 +257,7 @@ double: 2
 float: 2
 
 Function: "tgamma":
-double: 5
+double: 9
 float: 8
 
 Function: "y0":
diff --git a/sysdeps/nptl/lowlevellock-futex.h b/sysdeps/nptl/lowlevellock-futex.h
index ecb729da6..ca96397a4 100644
--- a/sysdeps/nptl/lowlevellock-futex.h
+++ b/sysdeps/nptl/lowlevellock-futex.h
@@ -50,20 +50,8 @@
 #define LLL_SHARED	FUTEX_PRIVATE_FLAG
 
 #ifndef __ASSEMBLER__
-
-# if IS_IN (libc) || IS_IN (rtld)
-/* In libc.so or ld.so all futexes are private.  */
-#  define __lll_private_flag(fl, private)			\
-  ({								\
-    /* Prevent warnings in callers of this macro.  */		\
-    int __lll_private_flag_priv __attribute__ ((unused));	\
-    __lll_private_flag_priv = (private);			\
-    ((fl) | FUTEX_PRIVATE_FLAG);				\
-  })
-# else
-#  define __lll_private_flag(fl, private) \
+# define __lll_private_flag(fl, private) \
   (((fl) | FUTEX_PRIVATE_FLAG) ^ (private))
-# endif
 
 # define lll_futex_syscall(nargs, futexp, op, ...)                      \
   ({                                                                    \
diff --git a/sysdeps/posix/getcwd.c b/sysdeps/posix/getcwd.c
index f11644aae..242e4dbe4 100644
--- a/sysdeps/posix/getcwd.c
+++ b/sysdeps/posix/getcwd.c
@@ -187,6 +187,13 @@ __getcwd_generic (char *buf, size_t size)
   size_t allocated = size;
   size_t used;
 
+  /* A size of 1 byte is never useful.  */
+  if (allocated == 1)
+    {
+      __set_errno (ERANGE);
+      return NULL;
+    }
+
 #if HAVE_MINIMALLY_WORKING_GETCWD
   /* If AT_FDCWD is not defined, the algorithm below is O(N**2) and
      this is much slower than the system getcwd (at least on
diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile
index d1c71a0ca..06491b0ef 100644
--- a/sysdeps/powerpc/Makefile
+++ b/sysdeps/powerpc/Makefile
@@ -62,11 +62,6 @@ ifeq ($(subdir),misc)
 sysdep_headers += sys/platform/ppc.h
 tests += test-gettimebase
 tests += tst-set_ppr
-
-# This test is expected to run and exit with EXIT_UNSUPPORTED on
-# processors that do not implement the Power ISA 2.06 or greater.
-# But the test makes use of instructions from Power ISA 2.06 and 2.07.
-CFLAGS-tst-set_ppr.c += -Wa,-many
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 291941c1e..5421525ac 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -352,7 +352,7 @@ L(zero_padding_loop):
 	cmpldi	cr6,r5,16	/* Check if length was reached.  */
 	ble	cr6,L(zero_padding_end)
 
-	stxv	v18,0(r11)
+	stxv	32+v18,0(r11)
 	addi	r11,r11,16
 	addi	r5,r5,-16
 
@@ -360,7 +360,7 @@ L(zero_padding_loop):
 
 L(zero_padding_end):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
-	stxvl	v18,r11,r10	/* Partial store  */
+	stxvl	32+v18,r11,r10	/* Partial store  */
 	blr
 
 	.align	4
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index c57bb1c05..83fdc8e83 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -275,12 +275,14 @@ LT_LABELSUFFIX(name,_name_end): ; \
 /* Allocate frame and save register */
 #define NVOLREG_SAVE \
     stdu r1,-SCV_FRAME_SIZE(r1); \
+    cfi_adjust_cfa_offset(SCV_FRAME_SIZE); \
     std r31,SCV_FRAME_NVOLREG_SAVE(r1); \
-    cfi_adjust_cfa_offset(SCV_FRAME_SIZE);
+    cfi_rel_offset(r31,SCV_FRAME_NVOLREG_SAVE);
 
 /* Restore register and destroy frame */
 #define NVOLREG_RESTORE	\
     ld r31,SCV_FRAME_NVOLREG_SAVE(r1); \
+    cfi_restore(r31); \
     addi r1,r1,SCV_FRAME_SIZE; \
     cfi_adjust_cfa_offset(-SCV_FRAME_SIZE);
 
@@ -331,13 +333,13 @@ LT_LABELSUFFIX(name,_name_end): ; \
 
 #define DO_CALL_SCV \
     mflr r9; \
-    std r9,FRAME_LR_SAVE(r1); \
-    cfi_offset(lr,FRAME_LR_SAVE); \
+    std r9,SCV_FRAME_SIZE+FRAME_LR_SAVE(r1);   \
+    cfi_rel_offset(lr,SCV_FRAME_SIZE+FRAME_LR_SAVE); \
     .machine "push"; \
     .machine "power9"; \
     scv 0; \
     .machine "pop"; \
-    ld r9,FRAME_LR_SAVE(r1); \
+    ld r9,SCV_FRAME_SIZE+FRAME_LR_SAVE(r1);      \
     mtlr r9; \
     cfi_restore(lr);
 
@@ -398,8 +400,9 @@ LT_LABELSUFFIX(name,_name_end): ; \
 #endif
 
 #define RET_SCV \
-    cmpdi r3,0; \
-    bgelr+; \
+    li r9,-4095; \
+    cmpld r3,r9; \
+    bltlr+; \
     neg r3,r3;
 
 #define RET_SC \
diff --git a/sysdeps/powerpc/powerpc64/tst-ucontext-ppc64-vscr.c b/sysdeps/powerpc/powerpc64/tst-ucontext-ppc64-vscr.c
index 28c87fcef..d3fc4ab58 100644
--- a/sysdeps/powerpc/powerpc64/tst-ucontext-ppc64-vscr.c
+++ b/sysdeps/powerpc/powerpc64/tst-ucontext-ppc64-vscr.c
@@ -50,6 +50,7 @@ do_test (void)
   /* Set SAT bit in VSCR register.  */
   asm volatile (".machine push;\n"
 		".machine \"power5\";\n"
+		".machine altivec;\n"
 		"vspltisb %0,0;\n"
 		"vspltisb %1,-1;\n"
 		"vpkuwus %0,%0,%1;\n"
diff --git a/sysdeps/powerpc/tst-set_ppr.c b/sysdeps/powerpc/tst-set_ppr.c
index 7684f5d6e..e80da1532 100644
--- a/sysdeps/powerpc/tst-set_ppr.c
+++ b/sysdeps/powerpc/tst-set_ppr.c
@@ -44,7 +44,8 @@ get_thread_priority (void)
 {
   /* Read the PPR.  */
   ppr_t ppr;
-  asm volatile (MFPPR" %0" : "=r"(ppr));
+  asm volatile (".machine push; .machine power7; "MFPPR" %0; .machine pop"
+		: "=r"(ppr));
   /* Return the thread priority value.  */
   return EXTRACT_THREAD_PRIORITY (ppr);
 }
diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile
index eeb64f9fb..2df947cb0 100644
--- a/sysdeps/pthread/Makefile
+++ b/sysdeps/pthread/Makefile
@@ -108,6 +108,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \
 	 tst-unload \
 	 tst-unwind-thread \
 	 tst-pt-vfork1 tst-pt-vfork2 tst-vfork1x tst-vfork2x \
+	 tst-pthread-exit-signal \
 
 
 # Files which must not be linked with libpthread.
@@ -231,7 +232,7 @@ generated += $(objpfx)tst-atfork2.mtrace \
 
 tests-internal += tst-cancel25 tst-robust8
 
-tests += tst-oncex3 tst-oncex4
+tests += tst-oncex3 tst-oncex4 tst-oncey3 tst-oncey4
 
 modules-names += tst-join7mod
 
@@ -242,6 +243,8 @@ endif
 
 CFLAGS-tst-oncex3.c += -fexceptions
 CFLAGS-tst-oncex4.c += -fexceptions
+CFLAGS-tst-oncey3.c += -fno-exceptions -fno-asynchronous-unwind-tables
+CFLAGS-tst-oncey4.c += -fno-exceptions -fno-asynchronous-unwind-tables
 
 $(objpfx)tst-join7: $(libdl) $(shared-thread-library)
 $(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so
diff --git a/sysdeps/pthread/tst-oncey3.c b/sysdeps/pthread/tst-oncey3.c
new file mode 100644
index 000000000..08225b88d
--- /dev/null
+++ b/sysdeps/pthread/tst-oncey3.c
@@ -0,0 +1 @@
+#include "tst-once3.c"
diff --git a/sysdeps/pthread/tst-oncey4.c b/sysdeps/pthread/tst-oncey4.c
new file mode 100644
index 000000000..9b4d98f3f
--- /dev/null
+++ b/sysdeps/pthread/tst-oncey4.c
@@ -0,0 +1 @@
+#include "tst-once4.c"
diff --git a/sysdeps/pthread/tst-pthread-exit-signal.c b/sysdeps/pthread/tst-pthread-exit-signal.c
new file mode 100644
index 000000000..b4526fe66
--- /dev/null
+++ b/sysdeps/pthread/tst-pthread-exit-signal.c
@@ -0,0 +1,45 @@
+/* Test that pending signals are not delivered on thread exit (bug 28607).
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Due to bug 28607, pthread_kill (or pthread_cancel) restored the
+   signal mask during during thread exit, triggering the delivery of a
+   blocked pending signal (SIGUSR1 in this test).  */
+
+#include <support/xthread.h>
+#include <support/xsignal.h>
+
+static void *
+threadfunc (void *closure)
+{
+  sigset_t sigmask;
+  sigfillset (&sigmask);
+  xpthread_sigmask (SIG_SETMASK, &sigmask, NULL);
+  xpthread_kill (pthread_self (), SIGUSR1);
+  pthread_exit (NULL);
+  return NULL;
+}
+
+static int
+do_test (void)
+{
+  pthread_t thr = xpthread_create (NULL, threadfunc, NULL);
+  xpthread_join (thr);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/riscv/rv64/rvd/libm-test-ulps b/sysdeps/riscv/rv64/rvd/libm-test-ulps
index 5b6f121ac..a0085bcdb 100644
--- a/sysdeps/riscv/rv64/rvd/libm-test-ulps
+++ b/sysdeps/riscv/rv64/rvd/libm-test-ulps
@@ -60,7 +60,7 @@ float: 1
 ldouble: 2
 
 Function: "asinh":
-double: 1
+double: 2
 float: 2
 ldouble: 3
 
@@ -413,7 +413,7 @@ float: 1
 ldouble: 2
 
 Function: "cbrt":
-double: 3
+double: 4
 float: 1
 ldouble: 1
 
@@ -652,17 +652,17 @@ float: 1
 ldouble: 2
 
 Function: "cosh":
-double: 1
+double: 2
 float: 2
 ldouble: 1
 
 Function: "cosh_downward":
-double: 2
+double: 3
 float: 1
 ldouble: 2
 
 Function: "cosh_towardzero":
-double: 2
+double: 3
 float: 1
 ldouble: 2
 
@@ -948,6 +948,7 @@ float: 4
 ldouble: 5
 
 Function: "exp":
+double: 1
 float: 1
 ldouble: 1
 
@@ -1068,7 +1069,7 @@ ldouble: 4
 Function: "j0_towardzero":
 double: 5
 float: 6
-ldouble: 2
+ldouble: 4
 
 Function: "j0_upward":
 double: 4
@@ -1136,6 +1137,7 @@ float: 5
 ldouble: 8
 
 Function: "log":
+double: 1
 ldouble: 1
 
 Function: "log10":
@@ -1274,7 +1276,7 @@ float: 3
 ldouble: 3
 
 Function: "sinh_towardzero":
-double: 2
+double: 3
 float: 2
 ldouble: 3
 
@@ -1323,22 +1325,22 @@ float: 3
 ldouble: 3
 
 Function: "tgamma":
-double: 5
+double: 9
 float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
-double: 5
+double: 8
 float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
-double: 5
+double: 9
 float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
-double: 4
+double: 9
 float: 8
 ldouble: 4
 
diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure
index 5f98640d0..7eaefbabc 100644
--- a/sysdeps/s390/configure
+++ b/sysdeps/s390/configure
@@ -123,7 +123,9 @@ void testinsn (char *buf)
     __asm__ (".machine \"arch13\" \n\t"
 	     ".machinemode \"zarch_nohighgprs\" \n\t"
 	     "lghi %%r0,16 \n\t"
-	     "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+	     "mvcrl 0(%0),32(%0) \n\t"
+	     "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+	     : : "a" (buf) : "memory", "r0");
 }
 EOF
 if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
@@ -271,7 +273,9 @@ else
 void testinsn (char *buf)
 {
     __asm__ ("lghi %%r0,16 \n\t"
-	     "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+	     "mvcrl 0(%0),32(%0) \n\t"
+	     "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+	     : : "a" (buf) : "memory", "r0");
 }
 EOF
 if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac
index dfe007a77..e6df62491 100644
--- a/sysdeps/s390/configure.ac
+++ b/sysdeps/s390/configure.ac
@@ -88,7 +88,9 @@ void testinsn (char *buf)
     __asm__ (".machine \"arch13\" \n\t"
 	     ".machinemode \"zarch_nohighgprs\" \n\t"
 	     "lghi %%r0,16 \n\t"
-	     "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+	     "mvcrl 0(%0),32(%0) \n\t"
+	     "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+	     : : "a" (buf) : "memory", "r0");
 }
 EOF
 dnl test, if assembler supports S390 arch13 instructions
@@ -195,7 +197,9 @@ cat > conftest.c <<\EOF
 void testinsn (char *buf)
 {
     __asm__ ("lghi %%r0,16 \n\t"
-	     "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+	     "mvcrl 0(%0),32(%0) \n\t"
+	     "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+	     : : "a" (buf) : "memory", "r0");
 }
 EOF
 dnl test, if assembler supports S390 arch13 zarch instructions as default
diff --git a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c
index 0c334a255..755b54ff1 100644
--- a/sysdeps/s390/dl-procinfo.c
+++ b/sysdeps/s390/dl-procinfo.c
@@ -46,12 +46,13 @@
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_s390_cap_flags
 #else
-PROCINFO_CLASS const char _dl_s390_cap_flags[19][9]
+PROCINFO_CLASS const char _dl_s390_cap_flags[23][9]
 #endif
 #ifndef PROCINFO_DECL
 = {
      "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", "edat", "etf3eh",
-     "highgprs", "te", "vx", "vxd", "vxe", "gs", "vxe2", "vxp", "sort", "dflt"
+     "highgprs", "te", "vx", "vxd", "vxe", "gs", "vxe2", "vxp", "sort", "dflt",
+     "vxp2", "nnpa", "pcimio", "sie"
   }
 #endif
 #if !defined SHARED || defined PROCINFO_DECL
@@ -63,11 +64,12 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[19][9]
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_s390_platforms
 #else
-PROCINFO_CLASS const char _dl_s390_platforms[10][7]
+PROCINFO_CLASS const char _dl_s390_platforms[11][7]
 #endif
 #ifndef PROCINFO_DECL
 = {
-    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15"
+    "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15",
+    "z16"
   }
 #endif
 #if !defined SHARED || defined PROCINFO_DECL
diff --git a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h
index 9e1a8c7ba..d44e1dd97 100644
--- a/sysdeps/s390/dl-procinfo.h
+++ b/sysdeps/s390/dl-procinfo.h
@@ -21,9 +21,9 @@
 #define _DL_PROCINFO_H	1
 #include <ldsodefs.h>
 
-#define _DL_HWCAP_COUNT 19
+#define _DL_HWCAP_COUNT 23
 
-#define _DL_PLATFORMS_COUNT	10
+#define _DL_PLATFORMS_COUNT	11
 
 /* The kernel provides up to 32 capability bits with elf_hwcap.  */
 #define _DL_FIRST_PLATFORM	32
@@ -61,6 +61,10 @@ enum
   HWCAP_S390_VXRS_PDE = 1 << 16,
   HWCAP_S390_SORT = 1 << 17,
   HWCAP_S390_DFLT = 1 << 18,
+  HWCAP_S390_VXRS_PDE2 = 1 << 19,
+  HWCAP_S390_NNPA = 1 << 20,
+  HWCAP_S390_PCI_MIO = 1 << 21,
+  HWCAP_S390_SIE = 1 << 22,
 };
 
 #define HWCAP_IMPORTANT (HWCAP_S390_ZARCH | HWCAP_S390_LDISP \
diff --git a/sysdeps/s390/memmem-arch13.S b/sysdeps/s390/memmem-arch13.S
index c5c8d8c97..58df8cdb1 100644
--- a/sysdeps/s390/memmem-arch13.S
+++ b/sysdeps/s390/memmem-arch13.S
@@ -41,7 +41,7 @@ ENTRY(MEMMEM_ARCH13)
 #  error The arch13 variant of memmem needs the z13 variant of memmem!
 # endif
 	clgfi	%r5,9
-	jh	MEMMEM_Z13
+	jgh	MEMMEM_Z13
 
 	aghik	%r0,%r5,-1		/* vll needs highest index.  */
 	bc	4,0(%r14)		/* cc==1: return if needle-len == 0.  */
diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c
index f88ea79d9..1a7d3369f 100644
--- a/sysdeps/s390/memmove.c
+++ b/sysdeps/s390/memmove.c
@@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden;
 s390_libc_ifunc_expr (__redirect_memmove, memmove,
 		      ({
 			s390_libc_ifunc_expr_stfle_init ();
-			(HAVE_MEMMOVE_ARCH13
+			(HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2)
 			 && S390_IS_ARCH13_MIE3 (stfle_bits))
 			  ? MEMMOVE_ARCH13
 			  : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX))
diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
index 4b170e445..2ef38b72d 100644
--- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
@@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
     IFUNC_IMPL (i, name, memmove,
 # if HAVE_MEMMOVE_ARCH13
 		IFUNC_IMPL_ADD (array, i, memmove,
-				S390_IS_ARCH13_MIE3 (stfle_bits),
+				((dl_hwcap & HWCAP_S390_VXRS_EXT2)
+				 && S390_IS_ARCH13_MIE3 (stfle_bits)),
 				MEMMOVE_ARCH13)
 # endif
 # if HAVE_MEMMOVE_Z13
diff --git a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile
index e5da26871..66ed844e6 100644
--- a/sysdeps/s390/s390-64/Makefile
+++ b/sysdeps/s390/s390-64/Makefile
@@ -7,8 +7,11 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused
 CFLAGS-dl-load.c += -Wno-unused
 CFLAGS-dl-reloc.c += -Wno-unused
 
-$(objpfx)tst-glibc-hwcaps: $(objpfx)libmarkermod2-1.so \
-  $(objpfx)libmarkermod3-1.so $(objpfx)libmarkermod4-1.so
+$(objpfx)tst-glibc-hwcaps: \
+  $(objpfx)libmarkermod2-1.so \
+  $(objpfx)libmarkermod3-1.so \
+  $(objpfx)libmarkermod4-1.so \
+  $(objpfx)libmarkermod5-1.so
 $(objpfx)tst-glibc-hwcaps.out: \
   $(objpfx)libmarkermod2.so \
     $(objpfx)glibc-hwcaps/z13/libmarkermod2.so \
@@ -19,6 +22,11 @@ $(objpfx)tst-glibc-hwcaps.out: \
     $(objpfx)glibc-hwcaps/z13/libmarkermod4.so \
     $(objpfx)glibc-hwcaps/z14/libmarkermod4.so \
     $(objpfx)glibc-hwcaps/z15/libmarkermod4.so \
+  $(objpfx)libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z13/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z14/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z15/libmarkermod5.so \
+    $(objpfx)glibc-hwcaps/z16/libmarkermod5.so
 
 $(objpfx)glibc-hwcaps/z13/libmarkermod2.so: $(objpfx)libmarkermod2-2.so
 	$(make-target-directory)
@@ -38,6 +46,19 @@ $(objpfx)glibc-hwcaps/z14/libmarkermod4.so: $(objpfx)libmarkermod4-3.so
 $(objpfx)glibc-hwcaps/z15/libmarkermod4.so: $(objpfx)libmarkermod4-4.so
 	$(make-target-directory)
 	cp $< $@
+$(objpfx)glibc-hwcaps/z13/libmarkermod5.so: $(objpfx)libmarkermod5-2.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z14/libmarkermod5.so: $(objpfx)libmarkermod5-3.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z15/libmarkermod5.so: $(objpfx)libmarkermod5-4.so
+	$(make-target-directory)
+	cp $< $@
+$(objpfx)glibc-hwcaps/z16/libmarkermod5.so: $(objpfx)libmarkermod5-5.so
+	$(make-target-directory)
+	cp $< $@
+
 
 ifeq (no,$(build-hardcoded-path-in-tests))
 # This is an ld.so.cache test, and RPATH/RUNPATH in the executable
diff --git a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
index b9d094f3d..187d732d5 100644
--- a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
+++ b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c
@@ -19,8 +19,8 @@
 #include <dl-hwcaps.h>
 #include <ldsodefs.h>
 
-const char _dl_hwcaps_subdirs[] = "z15:z14:z13";
-enum { subdirs_count = 3 }; /* Number of components in _dl_hwcaps_subdirs.  */
+const char _dl_hwcaps_subdirs[] = "z16:z15:z14:z13";
+enum { subdirs_count = 4 }; /* Number of components in _dl_hwcaps_subdirs.  */
 
 uint32_t
 _dl_hwcaps_subdirs_active (void)
@@ -50,5 +50,12 @@ _dl_hwcaps_subdirs_active (void)
     return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
   ++active;
 
+  /* z16.
+   Note: We do not list HWCAP_S390_NNPA here as, according to the Principles of
+   Operation, those instructions may be replaced or removed in future.  */
+  if (!(GLRO (dl_hwcap) & HWCAP_S390_VXRS_PDE2))
+    return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
+  ++active;
+
   return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
 }
diff --git a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
index 02397a478..f3b8ef3de 100644
--- a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
+++ b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c
@@ -25,6 +25,7 @@
 extern int marker2 (void);
 extern int marker3 (void);
 extern int marker4 (void);
+extern int marker5 (void);
 
 /* Return the arch level, 10 for the baseline libmarkermod*.so's.  */
 static int
@@ -63,9 +64,11 @@ compute_level (void)
     return 12;
   if (strcmp (platform, "z15") == 0)
     return 13;
+  if (strcmp (platform, "z16") == 0)
+    return 14;
   printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform);
-  /* Assume that the new platform supports z15.  */
-  return 13;
+  /* Assume that the new platform supports z16.  */
+  return 14;
 }
 
 static int
@@ -76,6 +79,7 @@ do_test (void)
   TEST_COMPARE (marker2 (), MIN (level - 9, 2));
   TEST_COMPARE (marker3 (), MIN (level - 9, 3));
   TEST_COMPARE (marker4 (), MIN (level - 9, 4));
+  TEST_COMPARE (marker5 (), MIN (level - 9, 5));
   return 0;
 }
 
diff --git a/sysdeps/s390/strstr-arch13.S b/sysdeps/s390/strstr-arch13.S
index c7183e627..222a6de91 100644
--- a/sysdeps/s390/strstr-arch13.S
+++ b/sysdeps/s390/strstr-arch13.S
@@ -49,7 +49,7 @@ ENTRY(STRSTR_ARCH13)
 #  error The arch13 variant of strstr needs the z13 variant of strstr!
 # endif
 	clgfi	%r4,9
-	jh	STRSTR_Z13
+	jgh	STRSTR_Z13
 
 	/* In case of a partial match, the vstrs instruction returns the index
 	   of the partial match in a vector-register.  Then we have to
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 472eab700..9531641f8 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -282,7 +282,12 @@ sysdep_routines += xstatconv internal_statvfs internal_statvfs64 \
 
 sysdep_headers += bits/fcntl-linux.h
 
-tests += tst-fallocate tst-fallocate64 tst-o_path-locks
+tests += \
+  tst-fallocate \
+  tst-fallocate64 \
+  tst-getcwd-smallbuff \
+  tst-o_path-locks \
+# tests
 endif
 
 ifeq ($(subdir),elf)
diff --git a/sysdeps/unix/sysv/linux/aarch64/clone.S b/sysdeps/unix/sysv/linux/aarch64/clone.S
index c9e63bae4..fe04bce6b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/clone.S
+++ b/sysdeps/unix/sysv/linux/aarch64/clone.S
@@ -47,6 +47,8 @@ ENTRY(__clone)
 	/* Sanity check args.  */
 	mov	x0, #-EINVAL
 	cbz	x10, .Lsyscall_error
+	/* Align sp.  */
+	and	x1, x1, -16
 	cbz	x1, .Lsyscall_error
 
 	/* Do the system call.  */
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index fe52b6308..db6aa3516 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -104,7 +104,7 @@ init_cpu_features (struct cpu_features *cpu_features)
   cpu_features->mte_state = (GLRO (dl_hwcap2) & HWCAP2_MTE) ? mte_state : 0;
   /* If we lack the MTE feature, disable the tunable, since it will
      otherwise cause instructions that won't run on this CPU to be used.  */
-  TUNABLE_SET (glibc, mem, tagging, unsigned, cpu_features->mte_state);
+  TUNABLE_SET (glibc, mem, tagging, cpu_features->mte_state);
 # endif
 
   if (cpu_features->mte_state & 2)
diff --git a/sysdeps/unix/sysv/linux/dl-diagnostics-kernel.c b/sysdeps/unix/sysv/linux/dl-diagnostics-kernel.c
new file mode 100644
index 000000000..59f6402c5
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/dl-diagnostics-kernel.c
@@ -0,0 +1,77 @@
+/* Print kernel diagnostics data in ld.so.  Linux version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-diagnostics.h>
+#include <ldsodefs.h>
+#include <sys/utsname.h>
+
+/* Dump the auxiliary vector to standard output.  */
+static void
+print_auxv (void)
+{
+  /* See _dl_show_auxv.  The code below follows the general output
+     format for diagnostic dumps.  */
+  unsigned int index = 0;
+  for (ElfW(auxv_t) *av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av)
+    {
+      _dl_printf ("auxv[0x%x].a_type=0x%lx\n"
+                  "auxv[0x%x].a_val=",
+                  index, (unsigned long int) av->a_type, index);
+      if (av->a_type == AT_EXECFN
+          || av->a_type == AT_PLATFORM
+          || av->a_type == AT_BASE_PLATFORM)
+        /* The address of the strings is not useful at all, so print
+           the strings themselvs.  */
+        _dl_diagnostics_print_string ((const char *) av->a_un.a_val);
+      else
+        _dl_printf ("0x%lx", (unsigned long int) av->a_un.a_val);
+      _dl_printf ("\n");
+      ++index;
+    }
+}
+
+/* Print one uname entry.  */
+static void
+print_utsname_entry (const char *field, const char *value)
+{
+  _dl_printf ("uname.");
+  _dl_diagnostics_print_labeled_string (field, value);
+}
+
+/* Print information from uname, including the kernel version.  */
+static void
+print_uname (void)
+{
+  struct utsname uts;
+  if (__uname (&uts) == 0)
+    {
+      print_utsname_entry ("sysname", uts.sysname);
+      print_utsname_entry ("nodename", uts.nodename);
+      print_utsname_entry ("release", uts.release);
+      print_utsname_entry ("version", uts.version);
+      print_utsname_entry ("machine", uts.machine);
+      print_utsname_entry ("domainname", uts.domainname);
+    }
+}
+
+void
+_dl_diagnostics_kernel (void)
+{
+  print_auxv ();
+  print_uname ();
+}
diff --git a/sysdeps/unix/sysv/linux/fstat.c b/sysdeps/unix/sysv/linux/fstat.c
index fd6436220..31a172dcc 100644
--- a/sysdeps/unix/sysv/linux/fstat.c
+++ b/sysdeps/unix/sysv/linux/fstat.c
@@ -19,11 +19,17 @@
 #include <sys/stat.h>
 #include <kernel_stat.h>
 #include <fcntl.h>
+#include <errno.h>
 
 #if !XSTAT_IS_XSTAT64
 int
 __fstat (int fd, struct stat *buf)
 {
+  if (fd < 0)
+    {
+      __set_errno (EBADF);
+      return -1;
+    }
   return __fstatat (fd, "", buf, AT_EMPTY_PATH);
 }
 
diff --git a/sysdeps/unix/sysv/linux/fstat64.c b/sysdeps/unix/sysv/linux/fstat64.c
index 993abcb44..46de80b66 100644
--- a/sysdeps/unix/sysv/linux/fstat64.c
+++ b/sysdeps/unix/sysv/linux/fstat64.c
@@ -22,10 +22,16 @@
 #include <fcntl.h>
 #include <kernel_stat.h>
 #include <stat_t64_cp.h>
+#include <errno.h>
 
 int
 __fstat64_time64 (int fd, struct __stat64_t64 *buf)
 {
+  if (fd < 0)
+    {
+      __set_errno (EBADF);
+      return -1;
+    }
   return __fstatat64_time64 (fd, "", buf, AT_EMPTY_PATH);
 }
 #if __TIMESIZE != 64
@@ -34,6 +40,12 @@ hidden_def (__fstat64_time64)
 int
 __fstat64 (int fd, struct stat64 *buf)
 {
+  if (fd < 0)
+    {
+      __set_errno (EBADF);
+      return -1;
+    }
+
   struct __stat64_t64 st_t64;
   return __fstat64_time64 (fd, &st_t64)
 	 ?: __cp_stat64_t64_stat64 (&st_t64, buf);
diff --git a/sysdeps/unix/sysv/linux/hppa/getcontext.S b/sysdeps/unix/sysv/linux/hppa/getcontext.S
index 1405b4281..4f2e2587d 100644
--- a/sysdeps/unix/sysv/linux/hppa/getcontext.S
+++ b/sysdeps/unix/sysv/linux/hppa/getcontext.S
@@ -22,22 +22,28 @@
 #include "ucontext_i.h"
 
 
-	/* Trampoline function. Non-standard calling ABI.  */
+	/* Trampoline function.  Non-standard calling ABI.  */
 	/* Can not use ENTRY(__getcontext_ret) here.  */
 	.type	__getcontext_ret, @function
 	.hidden	__getcontext_ret
 __getcontext_ret:
 	.proc
 	.callinfo FRAME=0,NO_CALLS
-	/* r26-r23 contain original r3-r6, but because setcontext
-	   does not reload r3-r6 (it's using them as temporaries)
-	   we must save them elsewhere and swap them back in.  */
-	copy	%r23, %r3
-	copy	%r24, %r4
-	copy	%r25, %r5
-	copy	%r26, %r6
-	/* r20 contains original return pointer.  */
-	bv	0(%r20)
+	/* Because setcontext does not reload r3-r6 (it's using them
+	   as temporaries), we must load them ourself.  */
+	ldw	oR3(%r26), %r3
+	ldw	oR4(%r26), %r4
+	ldw	oR5(%r26), %r5
+	ldw	oR6(%r26), %r6
+
+	/* Also reload registers clobbered by $$dyncall.  */
+	ldw	oR21(%r26), %r21
+	ldw	oR22(%r26), %r22
+	ldw	oR31(%r26), %r31
+
+	/* oR0 contains original return pointer.  */
+	ldw	oR0(%r26), %rp
+	bv	0(%rp)
 	copy	%r0, %ret0
 	.procend
 	.size	__getcontext_ret, .-__getcontext_ret
@@ -65,13 +71,13 @@ ENTRY(__getcontext)
 	stw	%r17, oR17(%r26)
 	stw	%r18, oR18(%r26)
 	stw	%r19, oR19(%r26)
-	/* stw	%r20, oR20(%r26) - used for trampoline.  */
+	stw	%r20, oR20(%r26)
 	stw	%r21, oR21(%r26)
 	stw	%r22, oR22(%r26)
-	/* stw	%r23, oR23(%r26) - used for trampoline.  */
-	/* stw	%r24, oR24(%r26) - used for trampoline.  */
-	/* stw	%r25, oR25(%r26) - used for trampoline.  */
-	/* stw	%r26, oR26(%r26) - used for trampoline.  */
+	stw	%r23, oR23(%r26)
+	stw	%r24, oR24(%r26)
+	stw	%r25, oR25(%r26)
+	stw	%r26, oR26(%r26)
 	stw	%r27, oR27(%r26)
 	stw	%r28, oR28(%r26)
 	stw	%r29, oR29(%r26)
@@ -90,7 +96,10 @@ ENTRY(__getcontext)
 	stw	%r0, oIASQ1(%r26)
 	stw	%r0, oIAOQ0(%r26)
 	stw	%r0, oIAOQ1(%r26)
-	stw	%r0, oSAR(%r26) /* used as flag in swapcontext().  */
+
+	/* Save SAR register.  */
+	mfctl	%sar, %r1
+	stw	%r1, oSAR(%r26) /* MSB used as flag in swapcontext().  */
 
 
 	/* Store floating-point regs.  */
@@ -138,15 +147,12 @@ ENTRY(__getcontext)
 	stw	%r19, -32(%sp)
 	.cfi_offset 19, 32
 #endif
+	stw	%ret1, -60(%sp)
+	.cfi_offset 29, 4
 
 	/* Set up the trampoline registers.
-	   r20, r23, r24, r25, r26 and r2 are clobbered
-	   by call to getcontext() anyway. Reuse them.  */
-	stw	%r2, oR20(%r26)
-	stw	%r3, oR23(%r26)
-	stw	%r4, oR24(%r26)
-	stw	%r5, oR25(%r26)
-	stw	%r6, oR26(%r26)
+	   Use oR0 context slot to save return value.  */
+	stw	%r2, oR0(%r26)
 #ifdef PIC
 	addil	LT%__getcontext_ret, %r19
 	ldw     RT%__getcontext_ret(%r1), %r1
@@ -168,6 +174,7 @@ ENTRY(__getcontext)
 #ifdef PIC
 	ldw	-32(%sp), %r19
 #endif
+	ldw	-60(%sp), %ret1
 	bv	%r0(%r2)
 	ldwm	-64(%sp), %r4
 END(__getcontext)
diff --git a/sysdeps/unix/sysv/linux/hppa/setcontext.S b/sysdeps/unix/sysv/linux/hppa/setcontext.S
index 8fc5f5e56..616405b80 100644
--- a/sysdeps/unix/sysv/linux/hppa/setcontext.S
+++ b/sysdeps/unix/sysv/linux/hppa/setcontext.S
@@ -34,6 +34,8 @@ ENTRY(__setcontext)
 	stw	%r19, -32(%sp)
 	.cfi_offset 19, 32
 #endif
+	stw	%ret1, -60(%sp)
+	.cfi_offset 29, 4
 
 	/* Save ucp.  */
 	copy	%r26, %r3
@@ -74,7 +76,7 @@ ENTRY(__setcontext)
 	ldw	oR18(%r3), %r18
 	ldw	oR19(%r3), %r19
 	ldw	oR20(%r3), %r20
-	ldw	oR21(%r3), %r21
+	ldw	oR21(%r3), %r21 /* maybe clobbered by dyncall */
 	/* ldw	oR22(%r3), %r22 - dyncall arg.  */
 	ldw	oR23(%r3), %r23
 	ldw	oR24(%r3), %r24
@@ -86,6 +88,10 @@ ENTRY(__setcontext)
 	ldw	oR30(%r3), %sp
 	/* ldw	oR31(%r3), %r31 - dyncall scratch register */
 
+	/* Restore SAR register.  */
+	ldw	oSAR(%r3), %r22
+	mtsar	%r22
+
 	/* Restore floating-point registers.  */
 	ldo	 oFPREGS31(%r3), %r22
 	fldds	  0(%r22), %fr31
@@ -155,6 +161,7 @@ ENTRY(__setcontext)
 #ifdef PIC
 	ldw	-32(%r30), %r19
 #endif
+	ldw	-60(%r30), %ret1
 	bv	%r0(%r2)
 	ldwm	-64(%r30), %r3
 L(pseudo_end):
diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.S b/sysdeps/unix/sysv/linux/hppa/swapcontext.S
new file mode 100644
index 000000000..fbc22586d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/hppa/swapcontext.S
@@ -0,0 +1,72 @@
+/* Swap to new context.
+   Copyright (C) 2008-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "ucontext_i.h"
+
+	.text
+ENTRY(__swapcontext)
+
+	/* Copy rp to ret0 (r28).  */
+	copy %rp,%ret0
+
+	/* Create a frame.  */
+	ldo 64(%sp),%sp
+	.cfi_def_cfa_offset -64
+
+	/* Save the current machine context to oucp.  */
+	bl __getcontext,%rp
+
+	/* Copy oucp to register ret1 (r29).  __getcontext saves and
+	   restores it on a normal return.  It is restored from oR29
+	   on reactivation.  */
+	copy %r26,%ret1
+
+	/* Pop frame.  */
+	ldo -64(%sp),%sp
+	.cfi_def_cfa_offset 0
+
+	/* Load return pointer from oR28.  */
+	ldw oR28(%ret1),%rp
+
+	/* Return if error.  */
+	or,= %r0,%ret0,%r0
+	bv,n %r0(%rp)
+
+	/* Load sc_sar flag.  */
+	ldb oSAR(%ret1),%r20
+
+	/* Return if oucp context has been reactivated.  */
+	or,= %r0,%r20,%r0
+	bv,n %r0(%rp)
+
+	/* Mark sc_sar flag.  */
+	ldi 1,%r20
+	stb %r20,oSAR(%ret1)
+
+	/* Activate the machine context in ucp.  */
+	bl __setcontext,%rp
+	ldw oR25(%ret1),%r26
+
+	/* Load return pointer.  */
+	ldw oR28(%ret1),%rp
+	bv,n %r0(%rp)
+
+END(__swapcontext)
+
+weak_alias (__swapcontext, swapcontext)
diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.c b/sysdeps/unix/sysv/linux/hppa/swapcontext.c
deleted file mode 100644
index f9a820754..000000000
--- a/sysdeps/unix/sysv/linux/hppa/swapcontext.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Swap to new context.
-   Copyright (C) 2008-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Helge Deller <deller@gmx.de>, 2008.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <ucontext.h>
-
-extern int __getcontext (ucontext_t *ucp);
-extern int __setcontext (const ucontext_t *ucp);
-
-int
-__swapcontext (ucontext_t *oucp, const ucontext_t *ucp)
-{
-  /* Save the current machine context to oucp.  */
-  __getcontext (oucp);
-
-  /* mark sc_sar flag to skip the setcontext call on reactivation.  */
-  if (oucp->uc_mcontext.sc_sar == 0) {
-	oucp->uc_mcontext.sc_sar++;
-
-	/* Restore the machine context in ucp.  */
-	__setcontext (ucp);
-  }
-
-  return 0;
-}
-
-weak_alias (__swapcontext, swapcontext)
diff --git a/sysdeps/unix/sysv/linux/mips/fxstat.c b/sysdeps/unix/sysv/linux/mips/fxstat.c
index 11511d30b..4a6016ff1 100644
--- a/sysdeps/unix/sysv/linux/mips/fxstat.c
+++ b/sysdeps/unix/sysv/linux/mips/fxstat.c
@@ -35,7 +35,9 @@ __fxstat (int vers, int fd, struct stat *buf)
       {
 	struct kernel_stat kbuf;
 	int r = INTERNAL_SYSCALL_CALL (fstat, fd, &kbuf);
-	return r ?: __xstat_conv (vers, &kbuf, buf);
+	if (r == 0)
+	  return  __xstat_conv (vers, &kbuf, buf);
+	return INLINE_SYSCALL_ERROR_RETURN_VALUE (-r);
       }
     }
 }
diff --git a/sysdeps/unix/sysv/linux/mips/lxstat.c b/sysdeps/unix/sysv/linux/mips/lxstat.c
index 871fb6c6c..54f990a25 100644
--- a/sysdeps/unix/sysv/linux/mips/lxstat.c
+++ b/sysdeps/unix/sysv/linux/mips/lxstat.c
@@ -35,7 +35,9 @@ __lxstat (int vers, const char *name, struct stat *buf)
       {
 	struct kernel_stat kbuf;
 	int r = INTERNAL_SYSCALL_CALL (lstat, name, &kbuf);
-	return r ?: __xstat_conv (vers, &kbuf, buf);
+	if (r == 0)
+	  return  __xstat_conv (vers, &kbuf, buf);
+	return INLINE_SYSCALL_ERROR_RETURN_VALUE (-r);
       }
     }
 }
diff --git a/sysdeps/unix/sysv/linux/mips/xstat.c b/sysdeps/unix/sysv/linux/mips/xstat.c
index 9d810b6f6..86f4dc31a 100644
--- a/sysdeps/unix/sysv/linux/mips/xstat.c
+++ b/sysdeps/unix/sysv/linux/mips/xstat.c
@@ -35,7 +35,9 @@ __xstat (int vers, const char *name, struct stat *buf)
       {
 	struct kernel_stat kbuf;
 	int r = INTERNAL_SYSCALL_CALL (stat, name, &kbuf);
-	return r ?: __xstat_conv (vers, &kbuf, buf);
+	if (r == 0)
+	  return  __xstat_conv (vers, &kbuf, buf);
+	return INLINE_SYSCALL_ERROR_RETURN_VALUE (-r);
       }
     }
 }
diff --git a/sysdeps/unix/sysv/linux/mq_notify.c b/sysdeps/unix/sysv/linux/mq_notify.c
index cc575a0cd..1714e1cc5 100644
--- a/sysdeps/unix/sysv/linux/mq_notify.c
+++ b/sysdeps/unix/sysv/linux/mq_notify.c
@@ -132,9 +132,12 @@ helper_thread (void *arg)
 	       to wait until it is done with it.  */
 	    (void) __pthread_barrier_wait (&notify_barrier);
 	}
-      else if (data.raw[NOTIFY_COOKIE_LEN - 1] == NOTIFY_REMOVED)
-	/* The only state we keep is the copy of the thread attributes.  */
-	free (data.attr);
+      else if (data.raw[NOTIFY_COOKIE_LEN - 1] == NOTIFY_REMOVED && data.attr != NULL)
+	{
+	  /* The only state we keep is the copy of the thread attributes.  */
+	  pthread_attr_destroy (data.attr);
+	  free (data.attr);
+	}
     }
   return NULL;
 }
@@ -255,8 +258,14 @@ mq_notify (mqd_t mqdes, const struct sigevent *notification)
       if (data.attr == NULL)
 	return -1;
 
-      memcpy (data.attr, notification->sigev_notify_attributes,
-	      sizeof (pthread_attr_t));
+      int ret = __pthread_attr_copy (data.attr,
+				     notification->sigev_notify_attributes);
+      if (ret != 0)
+	{
+	  free (data.attr);
+	  __set_errno (ret);
+	  return -1;
+	}
     }
 
   /* Construct the new request.  */
@@ -269,8 +278,11 @@ mq_notify (mqd_t mqdes, const struct sigevent *notification)
   int retval = INLINE_SYSCALL (mq_notify, 2, mqdes, &se);
 
   /* If it failed, free the allocated memory.  */
-  if (__glibc_unlikely (retval != 0))
-    free (data.attr);
+  if (retval != 0 && data.attr != NULL)
+    {
+      pthread_attr_destroy (data.attr);
+      free (data.attr);
+    }
 
   return retval;
 }
diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c
index eb948ccc1..d3dde7a58 100644
--- a/sysdeps/unix/sysv/linux/mq_timedreceive.c
+++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c
@@ -33,7 +33,7 @@ __mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len,
                             msg_prio, abs_timeout);
 
 #ifndef __ASSUME_TIME64_SYSCALLS
-  if (ret == 0 || errno != ENOSYS)
+  if (ret >= 0 || errno != ENOSYS)
     return ret;
 
   struct timespec ts32;
diff --git a/sysdeps/unix/sysv/linux/opensock.c b/sysdeps/unix/sysv/linux/opensock.c
deleted file mode 100644
index e87d6e58b..000000000
--- a/sysdeps/unix/sysv/linux/opensock.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <assert.h>
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/socket.h>
-
-/* Return a socket of any type.  The socket can be used in subsequent
-   ioctl calls to talk to the kernel.  */
-int
-__opensock (void)
-{
-  static int last_family;	/* Available socket family we will use.  */
-  static int last_type;
-  static const struct
-  {
-    int family;
-    const char procname[15];
-  } afs[] =
-    {
-      { AF_UNIX, "net/unix" },
-      { AF_INET, "" },
-      { AF_INET6, "net/if_inet6" },
-      { AF_AX25, "net/ax25" },
-      { AF_NETROM, "net/nr" },
-      { AF_ROSE, "net/rose" },
-      { AF_IPX, "net/ipx" },
-      { AF_APPLETALK, "net/appletalk" },
-      { AF_ECONET, "sys/net/econet" },
-      { AF_ASH, "sys/net/ash" },
-      { AF_X25, "net/x25" },
-#ifdef NEED_AF_IUCV
-      { AF_IUCV, "net/iucv" }
-#endif
-    };
-#define nafs (sizeof (afs) / sizeof (afs[0]))
-  char fname[sizeof "/proc/" + 14];
-  int result;
-  int has_proc;
-  size_t cnt;
-
-  /* We already know which family to use from the last call.  Use it
-     again.  */
-  if (last_family != 0)
-    {
-      assert (last_type != 0);
-
-      result = __socket (last_family, last_type | SOCK_CLOEXEC, 0);
-      if (result != -1 || errno != EAFNOSUPPORT)
-	/* Maybe the socket type isn't supported anymore (module is
-	   unloaded).  In this case again try to find the type.  */
-	return result;
-
-      /* Reset the values.  They seem not valid anymore.  */
-      last_family = 0;
-      last_type = 0;
-    }
-
-  /* Check whether the /proc filesystem is available.  */
-  has_proc = __access ("/proc/net", R_OK) != -1;
-  strcpy (fname, "/proc/");
-
-  /* Iterate over the interface families and find one which is
-     available.  */
-  for (cnt = 0; cnt < nafs; ++cnt)
-    {
-      int type = SOCK_DGRAM;
-
-      if (has_proc && afs[cnt].procname[0] != '\0')
-	{
-	  strcpy (fname + 6, afs[cnt].procname);
-	  if (__access (fname, R_OK) == -1)
-	    /* The /proc entry is not available.  I.e., we cannot
-	       create a socket of this type (without loading the
-	       module).  Don't look for it since this might trigger
-	       loading the module.  */
-	    continue;
-	}
-
-      if (afs[cnt].family == AF_NETROM || afs[cnt].family == AF_X25)
-	type = SOCK_SEQPACKET;
-
-      result = __socket (afs[cnt].family, type | SOCK_CLOEXEC, 0);
-      if (result != -1)
-	{
-	  /* Found an available family.  */
-	  last_type = type;
-	  last_family = afs[cnt].family;
-	  return result;
-	}
-    }
-
-  /* None of the protocol families is available.  It is unclear what kind
-     of error is returned.  ENOENT seems like a reasonable choice.  */
-  __set_errno (ENOENT);
-  return -1;
-}
diff --git a/sysdeps/unix/sysv/linux/powerpc/syscall.S b/sysdeps/unix/sysv/linux/powerpc/syscall.S
index d6ec87f00..d06e776ca 100644
--- a/sysdeps/unix/sysv/linux/powerpc/syscall.S
+++ b/sysdeps/unix/sysv/linux/powerpc/syscall.S
@@ -27,7 +27,11 @@ ENTRY (syscall)
 	mr   r8,r9
 #if !IS_IN(rtld) && (defined(__PPC64__) || defined(__powerpc64__))
 	CHECK_SCV_SUPPORT r9 0f
+	stdu r1,-SCV_FRAME_SIZE(r1)
+	cfi_adjust_cfa_offset(SCV_FRAME_SIZE)
 	DO_CALL_SCV
+	addi r1,r1,SCV_FRAME_SIZE
+	cfi_adjust_cfa_offset(-SCV_FRAME_SIZE)
 	RET_SCV
 	b 1f
 #endif
diff --git a/sysdeps/unix/sysv/linux/s390/bits/hwcap.h b/sysdeps/unix/sysv/linux/s390/bits/hwcap.h
index 696616e77..00e73a3e3 100644
--- a/sysdeps/unix/sysv/linux/s390/bits/hwcap.h
+++ b/sysdeps/unix/sysv/linux/s390/bits/hwcap.h
@@ -22,6 +22,11 @@
 
 /*
  * The following must match the kernels asm/elf.h.
+ * Note: The kernel commit 511ad531afd4090625def4d9aba1f5227bd44b8e
+ * "s390/hwcaps: shorten HWCAP defines" has shortened the prefix of the macros
+ * from "HWCAP_S390_" to "HWCAP_".  For compatibility reasons, we do not
+ * change the prefix in public glibc header file.
+ *
  * Note that these are *not* the same as the STORE FACILITY LIST bits.
  */
 #define HWCAP_S390_ESAN3        1
@@ -46,3 +51,7 @@
 #define HWCAP_S390_VXRS_PDE     65536
 #define HWCAP_S390_SORT         131072
 #define HWCAP_S390_DFLT         262144
+#define HWCAP_S390_VXRS_PDE2    524288
+#define HWCAP_S390_NNPA         1048576
+#define HWCAP_S390_PCI_MIO      2097152
+#define HWCAP_S390_SIE          4194304
diff --git a/sysdeps/unix/sysv/linux/s390/opensock.c b/sysdeps/unix/sysv/linux/s390/opensock.c
deleted file mode 100644
index f099d651f..000000000
--- a/sysdeps/unix/sysv/linux/s390/opensock.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define NEED_AF_IUCV 1
-#include "../opensock.c"
diff --git a/sysdeps/unix/sysv/linux/select.c b/sysdeps/unix/sysv/linux/select.c
index 415aa87d3..dc16a816e 100644
--- a/sysdeps/unix/sysv/linux/select.c
+++ b/sysdeps/unix/sysv/linux/select.c
@@ -33,13 +33,35 @@ int
 __select64 (int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
 	    struct __timeval64 *timeout)
 {
-  struct __timespec64 ts64, *pts64 = NULL;
-  if (timeout != NULL)
+  __time64_t s = timeout != NULL ? timeout->tv_sec : 0;
+  int32_t us = timeout != NULL ? timeout->tv_usec : 0;
+  int32_t ns;
+
+  if (s < 0 || us < 0)
+    return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+
+  /* Normalize the timeout, as legacy Linux __NR_select and __NR__newselect.
+     Different than syscall, it also handle possible overflow.  */
+  if (us / USEC_PER_SEC > INT64_MAX - s)
+    {
+      s = INT64_MAX;
+      ns = NSEC_PER_SEC - 1;
+    }
+  else
     {
-      ts64 = timeval64_to_timespec64 (*timeout);
-      pts64 = &ts64;
+      s += us / USEC_PER_SEC;
+      us = us % USEC_PER_SEC;
+      ns = us * NSEC_PER_USEC;
     }
 
+  struct __timespec64 ts64, *pts64 = NULL;
+   if (timeout != NULL)
+     {
+       ts64.tv_sec = s;
+       ts64.tv_nsec = ns;
+       pts64 = &ts64;
+     }
+
 #ifndef __NR_pselect6_time64
 # define __NR_pselect6_time64 __NR_pselect6
 #endif
@@ -52,10 +74,10 @@ __select64 (int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
          (though the pselect() glibc call suppresses this behavior).
          Since select() on Linux has the same behavior as the pselect6
          syscall, we update the timeout here.  */
-      if (r == 0 || errno != ENOSYS)
+      if (r >= 0 || errno != ENOSYS)
 	{
 	  if (timeout != NULL)
-	    TIMEVAL_TO_TIMESPEC (timeout, &ts64);
+	    TIMESPEC_TO_TIMEVAL (timeout, &ts64);
 	  return r;
 	}
 
@@ -64,14 +86,15 @@ __select64 (int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
 
 #ifndef __ASSUME_TIME64_SYSCALLS
   struct timespec ts32, *pts32 = NULL;
-  if (timeout != NULL)
+  if (pts64 != NULL)
     {
-      if (! in_time_t_range (timeout->tv_sec))
+      if (! in_time_t_range (pts64->tv_sec))
 	{
 	  __set_errno (EINVAL);
 	  return -1;
 	}
-      ts32 = valid_timespec64_to_timespec (ts64);
+      ts32.tv_sec = s;
+      ts32.tv_nsec = ns;
       pts32 = &ts32;
     }
 # ifndef __ASSUME_PSELECT
@@ -84,7 +107,7 @@ __select64 (int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
   r = SYSCALL_CANCEL (pselect6, nfds, readfds, writefds, exceptfds, pts32,
 		      NULL);
 # endif
-  if (r >= 0 && timeout != NULL)
+  if (timeout != NULL)
     *timeout = valid_timespec_to_timeval64 (ts32);
 #endif
 
@@ -105,7 +128,7 @@ __select (int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
       ptv64 = &tv64;
     }
   int r = __select64 (nfds, readfds, writefds, exceptfds, ptv64);
-  if (r >= 0 && timeout != NULL)
+  if (timeout != NULL)
     /* The remanining timeout will be always less the input TIMEOUT.  */
     *timeout = valid_timeval64_to_timeval (tv64);
   return r;
diff --git a/sysdeps/unix/sysv/linux/sys/prctl.h b/sysdeps/unix/sysv/linux/sys/prctl.h
index 00817ff0f..c9048c7cd 100644
--- a/sysdeps/unix/sysv/linux/sys/prctl.h
+++ b/sysdeps/unix/sysv/linux/sys/prctl.h
@@ -25,10 +25,6 @@
    we're picking up...  */
 
 /* Memory tagging control operations (for AArch64).  */
-#ifndef PR_TAGGED_ADDR_ENABLE
-# define PR_TAGGED_ADDR_ENABLE	(1UL << 8)
-#endif
-
 #ifndef PR_MTE_TCF_SHIFT
 # define PR_MTE_TCF_SHIFT	1
 # define PR_MTE_TCF_NONE	(0UL << PR_MTE_TCF_SHIFT)
diff --git a/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c b/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c
new file mode 100644
index 000000000..55362f606
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c
@@ -0,0 +1,259 @@
+/* Verify that getcwd returns ERANGE for size 1 byte and does not underflow
+   buffer when the CWD is too long and is also a mount target of /.  See bug
+   #28769 or CVE-2021-3999 for more context.
+   Copyright The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <intprops.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <support/check.h>
+#include <support/temp_file.h>
+#include <support/test-driver.h>
+#include <support/xsched.h>
+#include <support/xunistd.h>
+
+static char *base;
+#define BASENAME "tst-getcwd-smallbuff"
+#define MOUNT_NAME "mpoint"
+static int sockfd[2];
+
+static void
+do_cleanup (void)
+{
+  support_chdir_toolong_temp_directory (base);
+  TEST_VERIFY_EXIT (rmdir (MOUNT_NAME) == 0);
+  free (base);
+}
+
+static void
+send_fd (const int sock, const int fd)
+{
+  struct msghdr msg = {0};
+  union
+    {
+      struct cmsghdr hdr;
+      char buf[CMSG_SPACE (sizeof (int))];
+    } cmsgbuf = {0};
+  struct cmsghdr *cmsg;
+  struct iovec vec;
+  char ch = 'A';
+  ssize_t n;
+
+  msg.msg_control = &cmsgbuf.buf;
+  msg.msg_controllen = sizeof (cmsgbuf.buf);
+
+  cmsg = CMSG_FIRSTHDR (&msg);
+  cmsg->cmsg_len = CMSG_LEN (sizeof (int));
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type = SCM_RIGHTS;
+  memcpy (CMSG_DATA (cmsg), &fd, sizeof (fd));
+
+  vec.iov_base = &ch;
+  vec.iov_len = 1;
+  msg.msg_iov = &vec;
+  msg.msg_iovlen = 1;
+
+  while ((n = sendmsg (sock, &msg, 0)) == -1 && errno == EINTR);
+
+  TEST_VERIFY_EXIT (n == 1);
+}
+
+static int
+recv_fd (const int sock)
+{
+  struct msghdr msg = {0};
+  union
+    {
+      struct cmsghdr hdr;
+      char buf[CMSG_SPACE(sizeof(int))];
+    } cmsgbuf = {0};
+  struct cmsghdr *cmsg;
+  struct iovec vec;
+  ssize_t n;
+  char ch = '\0';
+  int fd = -1;
+
+  vec.iov_base = &ch;
+  vec.iov_len = 1;
+  msg.msg_iov = &vec;
+  msg.msg_iovlen = 1;
+
+  msg.msg_control = &cmsgbuf.buf;
+  msg.msg_controllen = sizeof (cmsgbuf.buf);
+
+  while ((n = recvmsg (sock, &msg, 0)) == -1 && errno == EINTR);
+  if (n != 1 || ch != 'A')
+    return -1;
+
+  cmsg = CMSG_FIRSTHDR (&msg);
+  if (cmsg == NULL)
+    return -1;
+  if (cmsg->cmsg_type != SCM_RIGHTS)
+    return -1;
+  memcpy (&fd, CMSG_DATA (cmsg), sizeof (fd));
+  if (fd < 0)
+    return -1;
+  return fd;
+}
+
+static int
+child_func (void * const arg)
+{
+  xclose (sockfd[0]);
+  const int sock = sockfd[1];
+  char ch;
+
+  TEST_VERIFY_EXIT (read (sock, &ch, 1) == 1);
+  TEST_VERIFY_EXIT (ch == '1');
+
+  if (mount ("/", MOUNT_NAME, NULL, MS_BIND | MS_REC, NULL))
+    FAIL_EXIT1 ("mount failed: %m\n");
+  const int fd = xopen ("mpoint",
+			O_RDONLY | O_PATH | O_DIRECTORY | O_NOFOLLOW, 0);
+
+  send_fd (sock, fd);
+  xclose (fd);
+
+  TEST_VERIFY_EXIT (read (sock, &ch, 1) == 1);
+  TEST_VERIFY_EXIT (ch == 'a');
+
+  xclose (sock);
+  return 0;
+}
+
+static void
+update_map (char * const mapping, const char * const map_file)
+{
+  const size_t map_len = strlen (mapping);
+
+  const int fd = xopen (map_file, O_WRONLY, 0);
+  xwrite (fd, mapping, map_len);
+  xclose (fd);
+}
+
+static void
+proc_setgroups_write (const long child_pid, const char * const str)
+{
+  const size_t str_len = strlen(str);
+
+  char setgroups_path[sizeof ("/proc//setgroups") + INT_STRLEN_BOUND (long)];
+
+  snprintf (setgroups_path, sizeof (setgroups_path),
+	    "/proc/%ld/setgroups", child_pid);
+
+  const int fd = open (setgroups_path, O_WRONLY);
+
+  if (fd < 0)
+    {
+      TEST_VERIFY_EXIT (errno == ENOENT);
+      FAIL_UNSUPPORTED ("/proc/%ld/setgroups not found\n", child_pid);
+    }
+
+  xwrite (fd, str, str_len);
+  xclose(fd);
+}
+
+static char child_stack[1024 * 1024];
+
+int
+do_test (void)
+{
+  base = support_create_and_chdir_toolong_temp_directory (BASENAME);
+
+  xmkdir (MOUNT_NAME, S_IRWXU);
+  atexit (do_cleanup);
+
+  /* Check whether user namespaces are supported.  */
+  {
+    pid_t pid = xfork ();
+    if (pid == 0)
+      {
+	if (unshare (CLONE_NEWUSER | CLONE_NEWNS) != 0)
+	  _exit (EXIT_UNSUPPORTED);
+	else
+	  _exit (0);
+      }
+    int status;
+    xwaitpid (pid, &status, 0);
+    TEST_VERIFY_EXIT (WIFEXITED (status));
+    if (WEXITSTATUS (status) != 0)
+      return WEXITSTATUS (status);
+  }
+
+  TEST_VERIFY_EXIT (socketpair (AF_UNIX, SOCK_STREAM, 0, sockfd) == 0);
+  pid_t child_pid = xclone (child_func, NULL, child_stack,
+			    sizeof (child_stack),
+			    CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
+
+  xclose (sockfd[1]);
+  const int sock = sockfd[0];
+
+  char map_path[sizeof ("/proc//uid_map") + INT_STRLEN_BOUND (long)];
+  char map_buf[sizeof ("0  1") + INT_STRLEN_BOUND (long)];
+
+  snprintf (map_path, sizeof (map_path), "/proc/%ld/uid_map",
+	    (long) child_pid);
+  snprintf (map_buf, sizeof (map_buf), "0 %ld 1", (long) getuid());
+  update_map (map_buf, map_path);
+
+  proc_setgroups_write ((long) child_pid, "deny");
+  snprintf (map_path, sizeof (map_path), "/proc/%ld/gid_map",
+	    (long) child_pid);
+  snprintf (map_buf, sizeof (map_buf), "0 %ld 1", (long) getgid());
+  update_map (map_buf, map_path);
+
+  TEST_VERIFY_EXIT (send (sock, "1", 1, MSG_NOSIGNAL) == 1);
+  const int fd = recv_fd (sock);
+  TEST_VERIFY_EXIT (fd >= 0);
+  TEST_VERIFY_EXIT (fchdir (fd) == 0);
+
+  static char buf[2 * 10 + 1];
+  memset (buf, 'A', sizeof (buf));
+
+  /* Finally, call getcwd and check if it resulted in a buffer underflow.  */
+  char * cwd = getcwd (buf + sizeof (buf) / 2, 1);
+  TEST_VERIFY (cwd == NULL);
+  TEST_VERIFY (errno == ERANGE);
+
+  for (int i = 0; i < sizeof (buf); i++)
+    if (buf[i] != 'A')
+      {
+	printf ("buf[%d] = %02x\n", i, (unsigned int) buf[i]);
+	support_record_failure ();
+      }
+
+  TEST_VERIFY_EXIT (send (sock, "a", 1, MSG_NOSIGNAL) == 1);
+  xclose (sock);
+  TEST_VERIFY_EXIT (xwaitpid (child_pid, NULL, 0) == child_pid);
+
+  return 0;
+}
+
+#define CLEANUP_HANDLER do_cleanup
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c b/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c
index 2f05f21e4..110a7c143 100644
--- a/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c
+++ b/sysdeps/unix/sysv/linux/tst-sysvshm-linux.c
@@ -122,18 +122,21 @@ do_test (void)
   if (shmid == -1)
     FAIL_EXIT1 ("shmget failed: %m");
 
+  /* It does not check shmmax because kernel clamp its value to INT_MAX for:
+
+     1. Compat symbols with IPC_64, i.e, 32-bit binaries running on 64-bit
+        kernels.
+
+     2. Default symbol without IPC_64 (defined as IPC_OLD within Linux) and
+        glibc always use IPC_64 for 32-bit ABIs (to support 64-bit time_t).
+        It means that 32-bit binaries running on 32-bit kernels will not see
+        shmmax being clamped.
+
+     And finding out whether the compat symbol is used would require checking
+     the underlying kernel against the current ABI.  The shmall and shmmni
+     already provided enough coverage.  */
+
   struct test_shminfo tipcinfo;
-  {
-    uint64_t v = read_proc_file ("/proc/sys/kernel/shmmax");
-#if LONG_MAX == INT_MAX
-    /* Kernel explicit clamp the value for shmmax on compat symbol (32-bit
-       binaries running on 64-bit kernels).  */
-    if (sizeof (__syscall_ulong_t) == sizeof (unsigned long int)
-        && v > INT_MAX)
-      v = INT_MAX;
-#endif
-    tipcinfo.shmmax = v;
-  }
   tipcinfo.shmall = read_proc_file ("/proc/sys/kernel/shmall");
   tipcinfo.shmmni = read_proc_file ("/proc/sys/kernel/shmmni");
 
@@ -152,7 +155,6 @@ do_test (void)
       FAIL_EXIT1 ("shmctl with IPC_INFO failed: %m");
 
     TEST_COMPARE (ipcinfo.shmall, tipcinfo.shmall);
-    TEST_COMPARE (ipcinfo.shmmax, tipcinfo.shmmax);
     TEST_COMPARE (ipcinfo.shmmni, tipcinfo.shmmni);
   }
 
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index dd8267434..184660916 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -73,6 +73,32 @@ endif
 
 ifeq ($(subdir),string)
 sysdep_routines += cacheinfo
+
+tests += \
+  tst-memchr-rtm \
+  tst-memcmp-rtm \
+  tst-memmove-rtm \
+  tst-memrchr-rtm \
+  tst-memset-rtm \
+  tst-strchr-rtm \
+  tst-strcpy-rtm \
+  tst-strlen-rtm \
+  tst-strncmp-rtm \
+  tst-strrchr-rtm \
+  tst-wcsncmp-rtm \
+# tests
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
 endif
 
 ifneq ($(enable-cet),no)
@@ -208,3 +234,11 @@ $(objpfx)check-cet.out: $(..)sysdeps/x86/check-cet.awk \
 generated += check-cet.out
 endif
 endif
+
+ifeq ($(subdir),posix)
+tests += \
+  tst-sysconf-cache-linesize \
+  tst-sysconf-cache-linesize-static
+tests-static += \
+  tst-sysconf-cache-linesize-static
+endif
diff --git a/sysdeps/x86/bits/platform/x86.h b/sysdeps/x86/bits/platform/x86.h
index 8f423ae72..1b4d6dab6 100644
--- a/sysdeps/x86/bits/platform/x86.h
+++ b/sysdeps/x86/bits/platform/x86.h
@@ -210,7 +210,7 @@ enum
   x86_cpu_AVX512_VP2INTERSECT	= x86_cpu_index_7_edx + 8,
   x86_cpu_INDEX_7_EDX_9		= x86_cpu_index_7_edx + 9,
   x86_cpu_MD_CLEAR		= x86_cpu_index_7_edx + 10,
-  x86_cpu_INDEX_7_EDX_11	= x86_cpu_index_7_edx + 11,
+  x86_cpu_RTM_ALWAYS_ABORT	= x86_cpu_index_7_edx + 11,
   x86_cpu_INDEX_7_EDX_12	= x86_cpu_index_7_edx + 12,
   x86_cpu_INDEX_7_EDX_13	= x86_cpu_index_7_edx + 13,
   x86_cpu_SERIALIZE		= x86_cpu_index_7_edx + 14,
@@ -244,7 +244,7 @@ enum
   x86_cpu_XOP			= x86_cpu_index_80000001_ecx + 11,
   x86_cpu_LWP			= x86_cpu_index_80000001_ecx + 15,
   x86_cpu_FMA4			= x86_cpu_index_80000001_ecx + 16,
-  x86_cpu_TBM			= x86_cpu_index_80000001_ecx + 20,
+  x86_cpu_TBM			= x86_cpu_index_80000001_ecx + 21,
 
   x86_cpu_index_80000001_edx
     = (CPUID_INDEX_80000001 * 8 * 4 * sizeof (unsigned int)
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 7b8df45e3..5ea4723ca 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -32,6 +32,9 @@ __cache_sysconf (int name)
     case _SC_LEVEL1_ICACHE_SIZE:
       return cpu_features->level1_icache_size;
 
+    case _SC_LEVEL1_ICACHE_LINESIZE:
+      return cpu_features->level1_icache_linesize;
+
     case _SC_LEVEL1_DCACHE_SIZE:
       return cpu_features->level1_dcache_size;
 
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index 68c253542..58ae0c5dd 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -54,6 +54,14 @@ long int __x86_rep_movsb_threshold attribute_hidden = 2048;
 /* Threshold to use Enhanced REP STOSB.  */
 long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 
+/* Threshold to stop using Enhanced REP MOVSB.  */
+long int __x86_rep_movsb_stop_threshold attribute_hidden;
+
+/* A bit-wise OR of string/memory requirements for optimal performance
+   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
+   are used at runtime to tune implementation behavior.  */
+int __x86_string_control attribute_hidden;
+
 static void
 init_cacheinfo (void)
 {
@@ -63,21 +71,32 @@ init_cacheinfo (void)
   __x86_raw_data_cache_size = data;
   /* Round data cache size to multiple of 256 bytes.  */
   data = data & ~255L;
-  __x86_data_cache_size_half = data / 2;
-  __x86_data_cache_size = data;
+  if (data > 0)
+    {
+      __x86_data_cache_size_half = data / 2;
+      __x86_data_cache_size = data;
+    }
 
   long int shared = cpu_features->shared_cache_size;
   __x86_raw_shared_cache_size_half = shared / 2;
   __x86_raw_shared_cache_size = shared;
   /* Round shared cache size to multiple of 256 bytes.  */
   shared = shared & ~255L;
-  __x86_shared_cache_size_half = shared / 2;
-  __x86_shared_cache_size = shared;
+  if (shared > 0)
+    {
+      __x86_shared_cache_size_half = shared / 2;
+      __x86_shared_cache_size = shared;
+    }
 
   __x86_shared_non_temporal_threshold
     = cpu_features->non_temporal_threshold;
 
   __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
+  __x86_rep_movsb_stop_threshold =  cpu_features->rep_movsb_stop_threshold;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
 }
 #endif
diff --git a/sysdeps/x86/configure b/sysdeps/x86/configure
index 5e32dc62b..ead1295c3 100644
--- a/sysdeps/x86/configure
+++ b/sysdeps/x86/configure
@@ -126,6 +126,8 @@ cat > conftest2.S <<EOF
 4:
 EOF
 libc_cv_include_x86_isa_level=no
+libc_cv_have_x86_lahf_sahf=no
+libc_cv_have_x86_movbe=no
 if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -nostartfiles -nostdlib -r -o conftest conftest1.S conftest2.S'
   { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
   (eval $ac_try) 2>&5
@@ -135,6 +137,24 @@ if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -nostartfiles -nostdlib -r -o conftest c
   count=`LC_ALL=C $READELF -n conftest | grep NT_GNU_PROPERTY_TYPE_0 | wc -l`
   if test "$count" = 1; then
     libc_cv_include_x86_isa_level=yes
+    cat > conftest.c <<EOF
+EOF
+    if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fverbose-asm -S -o - conftest.c'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } | grep -q "\-msahf"; then
+      libc_cv_have_x86_lahf_sahf=yes
+    fi
+    if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fverbose-asm -S -o - conftest.c'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; } | grep -q "\-mmovbe"; then
+      libc_cv_have_x86_movbe=yes
+    fi
   fi
 fi
 rm -f conftest*
@@ -144,6 +164,14 @@ $as_echo "$libc_cv_include_x86_isa_level" >&6; }
 if test $libc_cv_include_x86_isa_level = yes; then
   $as_echo "#define INCLUDE_X86_ISA_LEVEL 1" >>confdefs.h
 
+fi
+if test $libc_cv_have_x86_lahf_sahf = yes; then
+  $as_echo "#define HAVE_X86_LAHF_SAHF 1" >>confdefs.h
+
+fi
+if test $libc_cv_have_x86_movbe = yes; then
+  $as_echo "#define HAVE_X86_MOVBE 1" >>confdefs.h
+
 fi
 config_vars="$config_vars
 enable-x86-isa-level = $libc_cv_include_x86_isa_level"
diff --git a/sysdeps/x86/configure.ac b/sysdeps/x86/configure.ac
index f94088f37..bca97fdc2 100644
--- a/sysdeps/x86/configure.ac
+++ b/sysdeps/x86/configure.ac
@@ -98,14 +98,30 @@ cat > conftest2.S <<EOF
 4:
 EOF
 libc_cv_include_x86_isa_level=no
+libc_cv_have_x86_lahf_sahf=no
+libc_cv_have_x86_movbe=no
 if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS -nostartfiles -nostdlib -r -o conftest conftest1.S conftest2.S); then
   count=`LC_ALL=C $READELF -n conftest | grep NT_GNU_PROPERTY_TYPE_0 | wc -l`
   if test "$count" = 1; then
     libc_cv_include_x86_isa_level=yes
+    cat > conftest.c <<EOF
+EOF
+    if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS -fverbose-asm -S -o - conftest.c) | grep -q "\-msahf"; then
+      libc_cv_have_x86_lahf_sahf=yes
+    fi
+    if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS -fverbose-asm -S -o - conftest.c) | grep -q "\-mmovbe"; then
+      libc_cv_have_x86_movbe=yes
+    fi
   fi
 fi
 rm -f conftest*])
 if test $libc_cv_include_x86_isa_level = yes; then
   AC_DEFINE(INCLUDE_X86_ISA_LEVEL)
 fi
+if test $libc_cv_have_x86_lahf_sahf = yes; then
+  AC_DEFINE(HAVE_X86_LAHF_SAHF)
+fi
+if test $libc_cv_have_x86_movbe = yes; then
+  AC_DEFINE(HAVE_X86_MOVBE)
+fi
 LIBC_CONFIG_VAR([enable-x86-isa-level], [$libc_cv_include_x86_isa_level])
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 73b0a4dc9..42b02d1d3 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -66,7 +66,6 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
@@ -75,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, PREFETCHWT1);
   CPU_FEATURE_SET_USABLE (cpu_features, OSPKE);
   CPU_FEATURE_SET_USABLE (cpu_features, WAITPKG);
-  CPU_FEATURE_SET_USABLE (cpu_features, SHSTK);
   CPU_FEATURE_SET_USABLE (cpu_features, GFNI);
   CPU_FEATURE_SET_USABLE (cpu_features, RDPID);
   CPU_FEATURE_SET_USABLE (cpu_features, RDRAND);
@@ -83,9 +81,9 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
+  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
-  CPU_FEATURE_SET_USABLE (cpu_features, IBT);
   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
   CPU_FEATURE_SET_USABLE (cpu_features, LZCNT);
   CPU_FEATURE_SET_USABLE (cpu_features, SSE4A);
@@ -97,6 +95,14 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, FSRS);
   CPU_FEATURE_SET_USABLE (cpu_features, FSRCS);
 
+  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+
+#if CET_ENABLED
+  CPU_FEATURE_SET_USABLE (cpu_features, IBT);
+  CPU_FEATURE_SET_USABLE (cpu_features, SHSTK);
+#endif
+
   /* Can we call xgetbv?  */
   if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE))
     {
@@ -490,11 +496,39 @@ init_cpu_features (struct cpu_features *cpu_features)
 	      break;
 	    }
 
-	 /* Disable TSX on some Haswell processors to avoid TSX on kernels that
-	    weren't updated with the latest microcode package (which disables
-	    broken feature by default).  */
+	 /* Disable TSX on some processors to avoid TSX on kernels that
+	    weren't updated with the latest microcode package (which
+	    disables broken feature by default).  */
 	 switch (model)
 	    {
+	    case 0x55:
+	      if (stepping <= 5)
+		goto disable_tsx;
+	      break;
+	    case 0x8e:
+	      /* NB: Although the errata documents that for model == 0x8e,
+		 only 0xb stepping or lower are impacted, the intention of
+		 the errata was to disable TSX on all client processors on
+		 all steppings.  Include 0xc stepping which is an Intel
+		 Core i7-8665U, a client mobile processor.  */
+	    case 0x9e:
+	      if (stepping > 0xc)
+		break;
+	      /* Fall through.  */
+	    case 0x4e:
+	    case 0x5e:
+	      {
+		/* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+		   processors listed in:
+
+https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+		 */
+disable_tsx:
+		CPU_FEATURE_UNSET (cpu_features, HLE);
+		CPU_FEATURE_UNSET (cpu_features, RTM);
+		CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+	      }
+	      break;
 	    case 0x3f:
 	      /* Xeon E7 v3 with stepping >= 4 has working TSX.  */
 	      if (stepping >= 4)
@@ -520,8 +554,24 @@ init_cpu_features (struct cpu_features *cpu_features)
 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 	  |= bit_arch_Prefer_No_VZEROUPPER;
       else
-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
-	  |= bit_arch_Prefer_No_AVX512;
+	{
+	  /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+	     when ZMM load and store instructions are used.  */
+	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+	    cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	      |= bit_arch_Prefer_No_AVX512;
+
+	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+	     transactionally executing RTM region.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+	      |= bit_arch_Prefer_No_VZEROUPPER;
+	}
+
+      /* Avoid avoid short distance REP MOVSB on processor with FSRM.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
     }
   /* This spells out "AuthenticAMD" or "HygonGenuine".  */
   else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a31fa0783..2e43e67e4 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -704,9 +704,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   int max_cpuid_ex;
   long int data = -1;
   long int shared = -1;
-  long int core;
+  long int core = -1;
   unsigned int threads = 0;
   unsigned long int level1_icache_size = -1;
+  unsigned long int level1_icache_linesize = -1;
   unsigned long int level1_dcache_size = -1;
   unsigned long int level1_dcache_assoc = -1;
   unsigned long int level1_dcache_linesize = -1;
@@ -726,6 +727,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
 
       level1_icache_size
 	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
+      level1_icache_linesize
+	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
       level1_dcache_size = data;
       level1_dcache_assoc
 	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
@@ -753,6 +756,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
 
       level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
+      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
       level1_dcache_size = data;
       level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
@@ -772,6 +776,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
+      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
       level1_dcache_size = data;
       level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
       level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
@@ -833,6 +838,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
     }
 
   cpu_features->level1_icache_size = level1_icache_size;
+  cpu_features->level1_icache_linesize = level1_icache_linesize;
   cpu_features->level1_dcache_size = level1_dcache_size;
   cpu_features->level1_dcache_assoc = level1_dcache_assoc;
   cpu_features->level1_dcache_linesize = level1_dcache_linesize;
@@ -860,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -873,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
@@ -885,6 +893,22 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
     }
+  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
+     short REP MOVSB (FSRM).  */
+  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+    rep_movsb_threshold = 2112;
+
+  unsigned long int rep_movsb_stop_threshold;
+  /* ERMS feature is implemented from AMD Zen3 architecture and it is
+     performing poorly for data above L2 cache size. Henceforth, adding
+     an upper bound threshold parameter to limit the usage of Enhanced
+     REP MOVSB operations and setting its value to L2 cache size.  */
+  if (cpu_features->basic.kind == arch_kind_amd)
+    rep_movsb_stop_threshold = core;
+  /* Setting the upper bound of ERMS to the computed value of
+     non-temporal threshold for architectures other than AMD.  */
+  else
+    rep_movsb_stop_threshold = non_temporal_threshold;
 
   /* The default threshold to use Enhanced REP STOSB.  */
   unsigned long int rep_stosb_threshold = 2048;
@@ -917,17 +941,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
 				     long int, NULL);
 
-  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, long int, data,
-			   0, (long int) -1);
-  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, long int, shared,
-			   0, (long int) -1);
-  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, long int,
-			   non_temporal_threshold, 0, (long int) -1);
-  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, long int,
-			   rep_movsb_threshold,
-			   minimum_rep_movsb_threshold, (long int) -1);
-  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, long int,
-			   rep_stosb_threshold, 1, (long int) -1);
+  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
+  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
+  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
+			   0, SIZE_MAX);
+  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
+			   minimum_rep_movsb_threshold, SIZE_MAX);
+  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+			   SIZE_MAX);
 #endif
 
   cpu_features->data_cache_size = data;
@@ -935,4 +956,5 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   cpu_features->non_temporal_threshold = non_temporal_threshold;
   cpu_features->rep_movsb_threshold = rep_movsb_threshold;
   cpu_features->rep_stosb_threshold = rep_stosb_threshold;
+  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
 }
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
new file mode 100644
index 000000000..af8486470
--- /dev/null
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -0,0 +1,118 @@
+/* Print CPU diagnostics data in ld.so.  x86 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-diagnostics.h>
+#include <ldsodefs.h>
+
+static void
+print_cpu_features_value (const char *label, uint64_t value)
+{
+  _dl_printf ("x86.cpu_features.");
+  _dl_diagnostics_print_labeled_value (label, value);
+}
+
+static void
+print_cpu_feature_internal (unsigned int index, const char *kind,
+                            unsigned int reg, uint32_t value)
+{
+  _dl_printf ("x86.cpu_features.features[0x%x].%s[0x%x]=0x%x\n",
+              index, kind, reg, value);
+}
+
+static void
+print_cpu_feature_preferred (const char *label, unsigned int flag)
+{
+  _dl_printf("x86.cpu_features.preferred.%s=0x%x\n", label, flag);
+}
+
+void
+_dl_diagnostics_cpu (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  print_cpu_features_value ("basic.kind", cpu_features->basic.kind);
+  print_cpu_features_value ("basic.max_cpuid", cpu_features->basic.max_cpuid);
+  print_cpu_features_value ("basic.family", cpu_features->basic.family);
+  print_cpu_features_value ("basic.model", cpu_features->basic.model);
+  print_cpu_features_value ("basic.stepping", cpu_features->basic.stepping);
+
+  for (unsigned int index = 0; index < CPUID_INDEX_MAX; ++index)
+    {
+      /* The index values are part of the ABI via
+         <sys/platform/x86.h>, so translating them to strings is not
+         necessary.  */
+      for (unsigned int reg = 0; reg < 4; ++reg)
+        print_cpu_feature_internal
+          (index, "cpuid", reg,
+           cpu_features->features[index].cpuid_array[reg]);
+      for (unsigned int reg = 0; reg < 4; ++reg)
+        print_cpu_feature_internal
+          (index, "usable", reg,
+           cpu_features->features[index].usable_array[reg]);
+    }
+
+  /* The preferred indicators are not part of the ABI and need to be
+     translated.  */
+#define BIT(x) \
+  print_cpu_feature_preferred (#x, CPU_FEATURE_PREFERRED_P (cpu_features, x));
+#include "cpu-features-preferred_feature_index_1.def"
+#undef BIT
+
+  print_cpu_features_value ("isa_1", cpu_features->isa_1);
+  print_cpu_features_value ("xsave_state_size",
+                            cpu_features->xsave_state_size);
+  print_cpu_features_value ("xsave_state_full_size",
+                            cpu_features->xsave_state_full_size);
+  print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
+  print_cpu_features_value ("shared_cache_size",
+                            cpu_features->shared_cache_size);
+  print_cpu_features_value ("non_temporal_threshold",
+                            cpu_features->non_temporal_threshold);
+  print_cpu_features_value ("rep_movsb_threshold",
+                            cpu_features->rep_movsb_threshold);
+  print_cpu_features_value ("rep_stosb_threshold",
+                            cpu_features->rep_stosb_threshold);
+  print_cpu_features_value ("level1_icache_size",
+                            cpu_features->level1_icache_size);
+  print_cpu_features_value ("level1_icache_linesize",
+                            cpu_features->level1_icache_linesize);
+  print_cpu_features_value ("level1_dcache_size",
+                            cpu_features->level1_dcache_size);
+  print_cpu_features_value ("level1_dcache_assoc",
+                            cpu_features->level1_dcache_assoc);
+  print_cpu_features_value ("level1_dcache_linesize",
+                            cpu_features->level1_dcache_linesize);
+  print_cpu_features_value ("level2_cache_size",
+                            cpu_features->level2_cache_size);
+  print_cpu_features_value ("level2_cache_assoc",
+                            cpu_features->level2_cache_assoc);
+  print_cpu_features_value ("level2_cache_linesize",
+                            cpu_features->level2_cache_linesize);
+  print_cpu_features_value ("level3_cache_size",
+                            cpu_features->level3_cache_size);
+  print_cpu_features_value ("level3_cache_assoc",
+                            cpu_features->level3_cache_assoc);
+  print_cpu_features_value ("level3_cache_linesize",
+                            cpu_features->level3_cache_linesize);
+  print_cpu_features_value ("level4_cache_size",
+                            cpu_features->level4_cache_size);
+  _Static_assert (offsetof (struct cpu_features, level4_cache_size)
+                  + sizeof (cpu_features->level4_cache_size)
+                  == sizeof (*cpu_features),
+                  "last cpu_features field has been printed");
+}
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c..419313804 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
-      # shows that 2KB is the approximate value above which REP MOVSB
-      # becomes faster than SSE2 optimization on processors with Enhanced
-      # REP MOVSB.  Since larger register size can move more data with a
-      # single load and store, the threshold is higher with larger register
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
-      # times of vector size and the default value is 2048 * (vector size
-      # / 16), the default value and the minimum value must be updated at
-      # run-time.  NB: Don't set the default value since we can't tell if
-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster apprximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
new file mode 100644
index 000000000..1530d594b
--- /dev/null
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -0,0 +1,35 @@
+/* Bits in the PREFERRED_FEATURE_INDEX_1 bitfield of <cpu-features.h>.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+BIT (Fast_Rep_String)
+BIT (Fast_Copy_Backward)
+BIT (Slow_BSF)
+BIT (Fast_Unaligned_Load)
+BIT (Prefer_PMINUB_for_stringop)
+BIT (Fast_Unaligned_Copy)
+BIT (I586)
+BIT (I686)
+BIT (Slow_SSE4_2)
+BIT (AVX_Fast_Unaligned_Load)
+BIT (Prefer_MAP_32BIT_EXEC)
+BIT (Prefer_No_VZEROUPPER)
+BIT (Prefer_ERMS)
+BIT (Prefer_No_AVX512)
+BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_FSRM)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 624736b40..74408fde5 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -229,7 +229,7 @@ enum
 #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
 #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
 #define bit_cpu_MD_CLEAR	(1u << 10)
-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
 #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
 #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
 #define bit_cpu_SERIALIZE	(1u << 14)
@@ -454,7 +454,7 @@ enum
 #define index_cpu_AVX512_VP2INTERSECT CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_9	CPUID_INDEX_7
 #define index_cpu_MD_CLEAR	CPUID_INDEX_7
-#define index_cpu_INDEX_7_EDX_11 CPUID_INDEX_7
+#define index_cpu_RTM_ALWAYS_ABORT CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_12 CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_13 CPUID_INDEX_7
 #define index_cpu_SERIALIZE	CPUID_INDEX_7
@@ -679,7 +679,7 @@ enum
 #define reg_AVX512_VP2INTERSECT	edx
 #define reg_INDEX_7_EDX_9	edx
 #define reg_MD_CLEAR		edx
-#define reg_INDEX_7_EDX_11	edx
+#define reg_RTM_ALWAYS_ABORT	edx
 #define reg_INDEX_7_EDX_12	edx
 #define reg_INDEX_7_EDX_13	edx
 #define reg_SERIALIZE		edx
@@ -757,40 +757,23 @@ enum
 #define reg_AESKLE		ebx
 #define reg_WIDE_KL		ebx
 
-/* PREFERRED_FEATURE_INDEX_1.  */
-#define bit_arch_I586				(1u << 0)
-#define bit_arch_I686				(1u << 1)
-#define bit_arch_Fast_Rep_String		(1u << 2)
-#define bit_arch_Fast_Copy_Backward		(1u << 3)
-#define bit_arch_Fast_Unaligned_Load		(1u << 4)
-#define bit_arch_Fast_Unaligned_Copy		(1u << 5)
-#define bit_arch_Slow_BSF			(1u << 6)
-#define bit_arch_Slow_SSE4_2			(1u << 7)
-#define bit_arch_AVX_Fast_Unaligned_Load	(1u << 8)
-#define bit_arch_Prefer_MAP_32BIT_EXEC		(1u << 9)
-#define bit_arch_Prefer_PMINUB_for_stringop	(1u << 10)
-#define bit_arch_Prefer_No_VZEROUPPER		(1u << 11)
-#define bit_arch_Prefer_ERMS			(1u << 12)
-#define bit_arch_Prefer_FSRM			(1u << 13)
-#define bit_arch_Prefer_No_AVX512		(1u << 14)
-#define bit_arch_MathVec_Prefer_No_AVX512	(1u << 15)
-
-#define index_arch_Fast_Rep_String		PREFERRED_FEATURE_INDEX_1
-#define index_arch_Fast_Copy_Backward		PREFERRED_FEATURE_INDEX_1
-#define index_arch_Slow_BSF			PREFERRED_FEATURE_INDEX_1
-#define index_arch_Fast_Unaligned_Load		PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_PMINUB_for_stringop 	PREFERRED_FEATURE_INDEX_1
-#define index_arch_Fast_Unaligned_Copy		PREFERRED_FEATURE_INDEX_1
-#define index_arch_I586				PREFERRED_FEATURE_INDEX_1
-#define index_arch_I686				PREFERRED_FEATURE_INDEX_1
-#define index_arch_Slow_SSE4_2			PREFERRED_FEATURE_INDEX_1
-#define index_arch_AVX_Fast_Unaligned_Load	PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_MAP_32BIT_EXEC	PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_No_VZEROUPPER		PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_ERMS			PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_No_AVX512		PREFERRED_FEATURE_INDEX_1
-#define index_arch_MathVec_Prefer_No_AVX512	PREFERRED_FEATURE_INDEX_1
-#define index_arch_Prefer_FSRM			PREFERRED_FEATURE_INDEX_1
+/* PREFERRED_FEATURE_INDEX_1.  First define the bitindex values
+   sequentially, then define the bit_arch* and index_arch_* lookup
+   constants.  */
+enum
+  {
+#define BIT(x) _bitindex_arch_##x ,
+#include "cpu-features-preferred_feature_index_1.def"
+#undef BIT
+  };
+enum
+  {
+#define BIT(x)					\
+    bit_arch_##x = 1u << _bitindex_arch_##x ,	\
+    index_arch_##x = PREFERRED_FEATURE_INDEX_1,
+#include "cpu-features-preferred_feature_index_1.def"
+#undef BIT
+  };
 
 /* XCR0 Feature flags.  */
 #define bit_XMM_state		(1u << 1)
@@ -841,6 +824,8 @@ struct cpuid_feature_internal
     };
 };
 
+/* NB: When adding new fields, update sysdeps/x86/dl-diagnostics-cpu.c
+   to print them.  */
 struct cpu_features
 {
   struct cpu_features_basic basic;
@@ -870,10 +855,14 @@ struct cpu_features
   unsigned long int non_temporal_threshold;
   /* Threshold to use "rep movsb".  */
   unsigned long int rep_movsb_threshold;
+  /* Threshold to stop using "rep movsb".  */
+  unsigned long int rep_movsb_stop_threshold;
   /* Threshold to use "rep stosb".  */
   unsigned long int rep_stosb_threshold;
   /* _SC_LEVEL1_ICACHE_SIZE.  */
   unsigned long int level1_icache_size;
+  /* _SC_LEVEL1_ICACHE_LINESIZE.  */
+  unsigned long int level1_icache_linesize;
   /* _SC_LEVEL1_DCACHE_SIZE.  */
   unsigned long int level1_dcache_size;
   /* _SC_LEVEL1_DCACHE_ASSOC.  */
diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c
index aaf524cb5..49ef4aa61 100644
--- a/sysdeps/x86/isa-level.c
+++ b/sysdeps/x86/isa-level.c
@@ -29,32 +29,35 @@
 
 /* ELF program property for x86 ISA level.  */
 #ifdef INCLUDE_X86_ISA_LEVEL
-# if defined __x86_64__ || defined __FXSR__ || !defined _SOFT_FLOAT \
-     || defined  __MMX__ || defined __SSE__ || defined __SSE2__
+# if defined __SSE__ && defined __SSE2__
+/* NB: ISAs, excluding MMX, in x86-64 ISA level baseline are used.  */
 #  define ISA_BASELINE	GNU_PROPERTY_X86_ISA_1_BASELINE
 # else
 #  define ISA_BASELINE	0
 # endif
 
-# if defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 \
-     || (defined __x86_64__ && defined __LAHF_SAHF__) \
-     || defined __POPCNT__ || defined __SSE3__ \
-     || defined __SSSE3__ || defined __SSE4_1__ || defined __SSE4_2__
+# if ISA_BASELINE && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 \
+     && defined HAVE_X86_LAHF_SAHF && defined __POPCNT__ \
+     && defined __SSE3__ && defined __SSSE3__ && defined __SSE4_1__ \
+     && defined __SSE4_2__
+/* NB: ISAs in x86-64 ISA level v2 are used.  */
 #  define ISA_V2	GNU_PROPERTY_X86_ISA_1_V2
 # else
 #  define ISA_V2	0
 # endif
 
-# if defined __AVX__ || defined __AVX2__ || defined __F16C__ \
-     || defined __FMA__ || defined __LZCNT__ || defined __MOVBE__ \
-     || defined __XSAVE__
+# if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
+     && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE
+/* NB: ISAs in x86-64 ISA level v3 are used.  */
 #  define ISA_V3	GNU_PROPERTY_X86_ISA_1_V3
 # else
 #  define ISA_V3	0
 # endif
 
-# if defined __AVX512F__ || defined __AVX512BW__ || defined __AVX512CD__ \
-     || defined __AVX512DQ__ || defined __AVX512VL__
+# if ISA_V3 && defined __AVX512F__ && defined __AVX512BW__ \
+     && defined __AVX512CD__ && defined __AVX512DQ__ \
+     && defined __AVX512VL__
+/* NB: ISAs in x86-64 ISA level v4 are used.  */
 #  define ISA_V4	GNU_PROPERTY_X86_ISA_1_V4
 # else
 #  define ISA_V4	0
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 51c069bfe..deda1c4e4 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,11 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Constants for bits in __x86_string_control:  */
+
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
@@ -73,15 +78,18 @@ enum cf_protection_level
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
 
 /* Define an entry point visible from C.  */
-#define	ENTRY(name)							      \
+#define	ENTRY_P2ALIGN(name, alignment)					      \
   .globl C_SYMBOL_NAME(name);						      \
   .type C_SYMBOL_NAME(name),@function;					      \
-  .align ALIGNARG(4);							      \
+  .align ALIGNARG(alignment);						      \
   C_LABEL(name)								      \
   cfi_startproc;							      \
   _CET_ENDBR;								      \
   CALL_MCOUNT
 
+/* Common entry 16 byte aligns.  */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
@@ -103,7 +111,8 @@ enum cf_protection_level
 /* Local label name for asm code. */
 #ifndef L
 /* ELF-like local names start with `.L'.  */
-# define L(name)	.L##name
+# define LOCAL_LABEL(name) .L##name
+# define L(name)	LOCAL_LABEL(name)
 #endif
 
 #define atom_text_section .section ".text.atom", "ax"
diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
index 79d803eb2..e79d19b5a 100644
--- a/sysdeps/x86/tst-cpu-features-supports.c
+++ b/sysdeps/x86/tst-cpu-features-supports.c
@@ -59,9 +59,9 @@ do_test (int argc, char **argv)
   fails += CHECK_SUPPORTS (aes, AES);
 #endif
 #if __GNUC_PREREQ (11, 1)
-  fails += CHECK_SUPPORTS (amx_bf16, AMX_BF16);
-  fails += CHECK_SUPPORTS (amx_int8, AMX_INT8);
-  fails += CHECK_SUPPORTS (amx_tile, AMX_TILE);
+  fails += CHECK_SUPPORTS (amx-bf16, AMX_BF16);
+  fails += CHECK_SUPPORTS (amx-int8, AMX_INT8);
+  fails += CHECK_SUPPORTS (amx-tile, AMX_TILE);
 #endif
   fails += CHECK_SUPPORTS (avx, AVX);
   fails += CHECK_SUPPORTS (avx2, AVX2);
@@ -130,7 +130,7 @@ do_test (int argc, char **argv)
   fails += CHECK_SUPPORTS (gfni, GFNI);
 #endif
 #if __GNUC_PREREQ (11, 0)
-  fails += CHECK_SUPPORTS (hle, HLE);
+  fails += CHECK_CPU_SUPPORTS (hle, HLE);
   fails += CHECK_CPU_SUPPORTS (ibt, IBT);
   fails += CHECK_SUPPORTS (lahf_lm, LAHF64_SAHF64);
   fails += CHECK_CPU_SUPPORTS (lm, LM);
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
   fails += CHECK_SUPPORTS (rdpid, RDPID);
   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
   fails += CHECK_SUPPORTS (rdseed, RDSEED);
-  fails += CHECK_SUPPORTS (rtm, RTM);
+  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
   fails += CHECK_SUPPORTS (sha, SHA);
   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
index b5e7f6e7b..37d9ec9d8 100644
--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -158,6 +158,7 @@ do_test (void)
   CHECK_CPU_FEATURE (UINTR);
   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
   CHECK_CPU_FEATURE (MD_CLEAR);
+  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
   CHECK_CPU_FEATURE (SERIALIZE);
   CHECK_CPU_FEATURE (HYBRID);
   CHECK_CPU_FEATURE (TSXLDTRK);
@@ -321,6 +322,7 @@ do_test (void)
   CHECK_CPU_FEATURE_USABLE (FSRM);
   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
+  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
   CHECK_CPU_FEATURE_USABLE (HYBRID);
   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 000000000..e47494011
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 000000000..e4c8a623b
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  memset (string2, 'a', STRING_SIZE);
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 000000000..4bf97ef1e
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 000000000..a57a5a8eb
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 000000000..bf343a4da
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  memset (string1, 'a', STRING_SIZE);
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 000000000..a82e29c07
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[100] = 'c';
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 000000000..2b2a583fb
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 000000000..d2470afa1
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+	   int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int i;
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (i = 0; i < loop; i++)
+    {
+      failed |= function ();
+      if (_xbegin() == _XBEGIN_STARTED)
+	{
+	  failed |= function ();
+	  _xend();
+	}
+      else
+	{
+	  failed |= function ();
+	  ++naborts;
+	}
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+	 TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+		    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 000000000..0dcf14db8
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 000000000..ba6543be8
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,96 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <tst-string-rtm.h>
+
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME "wcsncmp"
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME "strncmp"
+#endif /* !WIDE */
+
+
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  MEMSET (string1, 'a', STRING_SIZE - 1);
+  MEMSET (string2, 'a', STRING_SIZE - 1);
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+  if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+  if (status != EXIT_SUCCESS)
+    return status;
+  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+  if (status != EXIT_SUCCESS)
+    return status;
+  return status;
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 000000000..e32bfaf5f
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = 'c';
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)
+{
+  return do_test_1 ("strrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-sysconf-cache-linesize-static.c b/sysdeps/x86/tst-sysconf-cache-linesize-static.c
new file mode 100644
index 000000000..152ae6882
--- /dev/null
+++ b/sysdeps/x86/tst-sysconf-cache-linesize-static.c
@@ -0,0 +1 @@
+#include "tst-sysconf-cache-linesize.c"
diff --git a/sysdeps/x86/tst-sysconf-cache-linesize.c b/sysdeps/x86/tst-sysconf-cache-linesize.c
new file mode 100644
index 000000000..642dbde5d
--- /dev/null
+++ b/sysdeps/x86/tst-sysconf-cache-linesize.c
@@ -0,0 +1,57 @@
+/* Test system cache line sizes.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <array_length.h>
+
+static struct
+{
+  const char *name;
+  int _SC_val;
+} sc_options[] =
+  {
+#define N(name) { "_SC_"#name, _SC_##name }
+    N (LEVEL1_ICACHE_LINESIZE),
+    N (LEVEL1_DCACHE_LINESIZE),
+    N (LEVEL2_CACHE_LINESIZE)
+  };
+
+static int
+do_test (void)
+{
+  int result = EXIT_SUCCESS;
+
+  for (int i = 0; i < array_length (sc_options); ++i)
+    {
+      long int scret = sysconf (sc_options[i]._SC_val);
+      if (scret < 0)
+	{
+	  printf ("sysconf (%s) returned < 0 (%ld)\n",
+		  sc_options[i].name, scret);
+	  result = EXIT_FAILURE;
+	}
+      else
+	printf ("sysconf (%s): %ld\n", sc_options[i].name, scret);
+    }
+
+  return result;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
new file mode 100644
index 000000000..bad3b8637
--- /dev/null
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
@@ -0,0 +1,21 @@
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index d1d7cb9d2..2dd9fd516 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -20,6 +20,8 @@ endif
 ifeq ($(subdir),string)
 sysdep_routines += strcasecmp_l-nonascii strncase_l-nonascii
 gen-as-const-headers += locale-defines.sym
+tests += \
+  tst-rsi-strlen
 endif
 
 ifeq ($(subdir),elf)
@@ -189,6 +191,11 @@ ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
 endif
 
+ifeq ($(subdir),wcsmbs)
+tests += \
+  tst-rsi-wcslen
+endif
+
 $(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os
 	$(make-target-directory)
 	rm -f $@
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
old mode 100644
new mode 100755
index 198554d78..75c96d60d
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -107,39 +107,6 @@ if test x"$build_mathvec" = xnotset; then
   build_mathvec=yes
 fi
 
-if test "$static_pie" = yes; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for linker static PIE support" >&5
-$as_echo_n "checking for linker static PIE support... " >&6; }
-if ${libc_cv_ld_static_pie+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat > conftest.s <<\EOF
-	.text
-	.global _start
-	.weak foo
-_start:
-	leaq	foo(%rip), %rax
-EOF
-  libc_cv_pie_option="-Wl,-pie"
-  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&5'
-  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
-  (eval $ac_try) 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then
-    libc_cv_ld_static_pie=yes
-  else
-    libc_cv_ld_static_pie=no
-  fi
-rm -f conftest*
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_ld_static_pie" >&5
-$as_echo "$libc_cv_ld_static_pie" >&6; }
-  if test "$libc_cv_ld_static_pie" != yes; then
-    as_fn_error $? "linker support for static PIE needed" "$LINENO" 5
-  fi
-fi
-
 $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
 
 
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index ec776274a..66219e7ce 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -53,31 +53,6 @@ if test x"$build_mathvec" = xnotset; then
   build_mathvec=yes
 fi
 
-dnl Check if linker supports static PIE with the fix for
-dnl
-dnl https://sourceware.org/bugzilla/show_bug.cgi?id=21782
-dnl
-if test "$static_pie" = yes; then
-  AC_CACHE_CHECK(for linker static PIE support, libc_cv_ld_static_pie, [dnl
-cat > conftest.s <<\EOF
-	.text
-	.global _start
-	.weak foo
-_start:
-	leaq	foo(%rip), %rax
-EOF
-  libc_cv_pie_option="-Wl,-pie"
-  if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&AS_MESSAGE_LOG_FD); then
-    libc_cv_ld_static_pie=yes
-  else
-    libc_cv_ld_static_pie=no
-  fi
-rm -f conftest*])
-  if test "$libc_cv_ld_static_pie" != yes; then
-    AC_MSG_ERROR([linker support for static PIE needed])
-  fi
-fi
-
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
index e68fcdbb1..58e588a3d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
         vmovaps   %zmm0, %zmm8
 
 /* Check for large arguments path */
-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
 
 /*
   ARGUMENT RANGE REDUCTION:
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_cos_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.16:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.16,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
index dfa2acafc..f5f117d47 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
 
 /* preserve mantissa, set input exponent to 2^(-10) */
         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
         vpsrlq    $32, %zmm4, %zmm6
 
 /* reciprocal approximation good to at least 11 bits */
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_log_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.12:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.12,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
index be8ab7c6e..48d251db1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
         vmovups __dAbsMask(%rax), %zmm7
         vmovups __dInvPI(%rax), %zmm2
         vmovups __dRShifter(%rax), %zmm1
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN8v_sin_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.14:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.14,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index 611887082..a4944a4fe 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
 
 /* SinPoly = SinR*SinPoly */
         vfmadd213pd %zmm5, %zmm5, %zmm4
-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 
 /* Update Cos result's sign */
         vxorpd    %zmm2, %zmm1, %zmm1
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
 ENTRY (_ZGVeN8vvv_sincos_skx)
 WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
 END (_ZGVeN8vvv_sincos_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.15:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.15,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
index f671d60d5..fe8474fed 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
   X = X - Y*PI1 - Y*PI2 - Y*PI3
  */
         vmovaps   %zmm0, %zmm6
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
+        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
         vmovups __sRShifter(%rax), %zmm3
         vmovups __sPI1_FMA(%rax), %zmm5
         vmovups __sA9_FMA(%rax), %zmm9
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16v_cosf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index 637bfe3c0..229b7828c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
         vmovaps   %zmm0, %zmm7
 
 /* compare against threshold */
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
         vmovups __sInvLn2(%rax), %zmm4
         vmovups __sShifter(%rax), %zmm1
         vmovups __sLn2hi(%rax), %zmm6
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
 
 #endif
 END (_ZGVeN16v_expf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index 9d790fbf0..fa2aae986 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
+        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
         vmovups _iBrkValue(%rax), %zmm4
         vmovups _sPoly_7(%rax), %zmm8
 
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
 
 #endif
 END (_ZGVeN16v_logf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.7:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.7,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
index c5c43c46f..6aea2a4f1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         vpsrlq    $32, %zmm3, %zmm2
         vpmovqd   %zmm2, %ymm11
         vcvtps2pd %ymm14, %zmm13
-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovaps   %zmm14, %zmm26
         vpandd _ABSMASK(%rax), %zmm1, %zmm8
         vpcmpd    $1, _INF(%rax), %zmm8, %k2
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         vpmovqd   %zmm11, %ymm5
         vpxord    %zmm10, %zmm10, %zmm10
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
         vpxord    %zmm11, %zmm11, %zmm11
         vcvtdq2pd %ymm7, %zmm7
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16vv_powf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.23:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.23,@object
-.L_2il0floatpacket.24:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.24,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index 9cf359c86..a446c504f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
 
 /* Result sign calculations */
         vpternlogd $150, %zmm0, %zmm14, %zmm1
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 
 /* Add correction term 0.5 for cos() part */
         vaddps    %zmm8, %zmm5, %zmm15
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
 ENTRY (_ZGVeN16vvv_sincosf_skx)
 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
 END (_ZGVeN16vvv_sincosf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
index bd05109a6..c1b352d0a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
 
 /* Check for large and special values */
-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovups __sAbsMask(%rax), %zmm5
         vmovups __sInvPI(%rax), %zmm1
         vmovups __sRShifter(%rax), %zmm2
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
         jmp       .LBL_2_7
 #endif
 END (_ZGVeN16v_sinf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.11:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.11,@object
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index beff2708d..3ddc4655c 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
 #endif
 
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
-	shl	$2, %RDX_LP
 #else
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
 
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 
 	PCMPEQ	%xmm1, %xmm0
-/* Check if there is a match.  */
+	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
 	bsf	%eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 
 	.p2align 4
 L(unaligned_no_match):
-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
 	add	%rcx, %rdx
 
 	.p2align 4
 L(align64_loop):
-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
 	sub	%rax, %rdx
+#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index db106a7a1..b2b318084 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -25,7 +25,7 @@
 /* Use movups and movaps for smaller code sizes.  */
 #define VMOVU		movups
 #define VMOVA		movaps
-
+#define MOV_SIZE	3
 #define SECTION(p)		p
 
 #ifdef USE_MULTIARCH
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 7d4a327eb..0358210c7 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,25 +18,36 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define USE_WITH_SSE2	1
 
 #define VEC_SIZE	16
+#define MOV_SIZE	3
+#define RET_SIZE	1
+
 #define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
-   alignment.  */
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#define VMOVU     movups
+#define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define BZERO_ZERO_VEC0() \
+  pxor %xmm0, %xmm0
+
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
@@ -45,6 +56,10 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)	__bzero
+#endif
+
 #ifndef WMEMSET_SYMBOL
 # define WMEMSET_CHK_SYMBOL(p,s) p
 # define WMEMSET_SYMBOL(p,s)	__wmemset
@@ -55,6 +70,7 @@
 libc_hidden_builtin_def (memset)
 
 #if IS_IN (libc)
+weak_alias (__bzero, bzero)
 libc_hidden_def (__wmemset)
 weak_alias (__wmemset, wmemset)
 libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9477538af..b503e4b81 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,45 +1,133 @@
 ifeq ($(subdir),string)
 
-sysdep_routines += strncat-c stpncpy-c strncpy-c \
-		   strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3  \
-		   strcmp-sse4_2 strcmp-avx2 \
-		   strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
-		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
-		   memrchr-sse2 memrchr-avx2 \
-		   memcmp-sse2 \
-		   memcmp-avx2-movbe \
-		   memcmp-sse4 memcpy-ssse3 \
-		   memmove-ssse3 \
-		   memcpy-ssse3-back \
-		   memmove-ssse3-back \
-		   memmove-avx512-no-vzeroupper \
-		   strcasecmp_l-sse2 strcasecmp_l-ssse3 \
-		   strcasecmp_l-sse4_2 strcasecmp_l-avx \
-		   strncase_l-sse2 strncase_l-ssse3 \
-		   strncase_l-sse4_2 strncase_l-avx \
-		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
-		   strrchr-sse2 strrchr-avx2 \
-		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
-		   strcat-avx2 strncat-avx2 \
-		   strcat-ssse3 strncat-ssse3\
-		   strcpy-avx2 strncpy-avx2 \
-		   strcpy-sse2 stpcpy-sse2 \
-		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
-		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
-		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
-		   stpcpy-avx2 stpncpy-avx2 \
-		   strcat-sse2 \
-		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
-		   strcspn-sse2 strpbrk-sse2 strspn-sse2 \
-		   strcspn-c strpbrk-c strspn-c varshift \
-		   memset-avx512-no-vzeroupper \
-		   memmove-sse2-unaligned-erms \
-		   memmove-avx-unaligned-erms \
-		   memmove-avx512-unaligned-erms \
-		   memset-sse2-unaligned-erms \
-		   memset-avx2-unaligned-erms \
-		   memset-avx512-unaligned-erms
+sysdep_routines += \
+  bzero \
+  memchr-avx2 \
+  memchr-avx2-rtm \
+  memchr-evex \
+  memchr-evex-rtm \
+  memchr-sse2 \
+  memcmp-avx2-movbe \
+  memcmp-avx2-movbe-rtm \
+  memcmp-evex-movbe \
+  memcmp-sse2 \
+  memcmp-ssse3 \
+  memcpy-ssse3 \
+  memcpy-ssse3-back \
+  memmove-avx-unaligned-erms \
+  memmove-avx-unaligned-erms-rtm \
+  memmove-avx512-no-vzeroupper \
+  memmove-avx512-unaligned-erms \
+  memmove-evex-unaligned-erms \
+  memmove-sse2-unaligned-erms \
+  memmove-ssse3 \
+  memmove-ssse3-back \
+  memrchr-avx2 \
+  memrchr-avx2-rtm \
+  memrchr-evex \
+  memrchr-sse2 \
+  memset-avx2-unaligned-erms \
+  memset-avx2-unaligned-erms-rtm \
+  memset-avx512-no-vzeroupper \
+  memset-avx512-unaligned-erms \
+  memset-evex-unaligned-erms \
+  memset-sse2-unaligned-erms \
+  rawmemchr-avx2 \
+  rawmemchr-avx2-rtm \
+  rawmemchr-evex \
+  rawmemchr-evex-rtm \
+  rawmemchr-sse2 \
+  stpcpy-avx2 \
+  stpcpy-avx2-rtm \
+  stpcpy-evex \
+  stpcpy-sse2 \
+  stpcpy-sse2-unaligned \
+  stpcpy-ssse3 \
+  stpncpy-avx2 \
+  stpncpy-avx2-rtm \
+  stpncpy-c \
+  stpncpy-evex \
+  stpncpy-sse2-unaligned \
+  stpncpy-ssse3 \
+  strcasecmp_l-avx2 \
+  strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
+  strcasecmp_l-sse2 \
+  strcasecmp_l-sse4_2 \
+  strcasecmp_l-ssse3 \
+  strcat-avx2 \
+  strcat-avx2-rtm \
+  strcat-evex \
+  strcat-sse2 \
+  strcat-sse2-unaligned \
+  strcat-ssse3 \
+  strchr-avx2 \
+  strchr-avx2-rtm \
+  strchr-evex \
+  strchr-sse2 \
+  strchr-sse2-no-bsf \
+  strchrnul-avx2 \
+  strchrnul-avx2-rtm \
+  strchrnul-evex \
+  strchrnul-sse2 \
+  strcmp-avx2 \
+  strcmp-avx2-rtm \
+  strcmp-evex \
+  strcmp-sse2 \
+  strcmp-sse2-unaligned \
+  strcmp-sse4_2 \
+  strcmp-ssse3 \
+  strcpy-avx2 \
+  strcpy-avx2-rtm \
+  strcpy-evex \
+  strcpy-sse2 \
+  strcpy-sse2-unaligned \
+  strcpy-ssse3 \
+  strcspn-c \
+  strcspn-sse2 \
+  strlen-avx2 \
+  strlen-avx2-rtm \
+  strlen-evex \
+  strlen-sse2 \
+  strncase_l-avx2 \
+  strncase_l-avx2-rtm \
+  strncase_l-evex \
+  strncase_l-sse2 \
+  strncase_l-sse4_2 \
+  strncase_l-ssse3 \
+  strncat-avx2 \
+  strncat-avx2-rtm \
+  strncat-c \
+  strncat-evex \
+  strncat-sse2-unaligned \
+  strncat-ssse3 \
+  strncmp-avx2 \
+  strncmp-avx2-rtm \
+  strncmp-evex \
+  strncmp-sse2 \
+  strncmp-sse4_2 \
+  strncmp-ssse3 \
+  strncpy-avx2 \
+  strncpy-avx2-rtm \
+  strncpy-c \
+  strncpy-evex \
+  strncpy-sse2-unaligned \
+  strncpy-ssse3 \
+  strnlen-avx2 \
+  strnlen-avx2-rtm \
+  strnlen-evex \
+  strnlen-sse2 \
+  strpbrk-c \
+  strpbrk-sse2 \
+  strrchr-avx2 \
+  strrchr-avx2-rtm \
+  strrchr-evex \
+  strrchr-sse2 \
+  strspn-c \
+  strspn-sse2 \
+  strstr-sse2-unaligned \
+  varshift \
+# sysdep_routines
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
@@ -47,20 +135,54 @@ CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
-		   wmemcmp-avx2-movbe \
-		   wmemchr-sse2 wmemchr-avx2 \
-		   wcscmp-sse2 wcscmp-avx2 \
-		   wcsncmp-sse2 wcsncmp-avx2 \
-		   wcscpy-ssse3 wcscpy-c \
-		   wcschr-sse2 wcschr-avx2 \
-		   wcsrchr-sse2 wcsrchr-avx2 \
-		   wcsnlen-sse4_1 wcsnlen-c \
-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2
+sysdep_routines += \
+  wcschr-avx2 \
+  wcschr-avx2-rtm \
+  wcschr-evex \
+  wcschr-sse2 \
+  wcscmp-avx2 \
+  wcscmp-avx2-rtm \
+  wcscmp-evex \
+  wcscmp-sse2 \
+  wcscpy-c \
+  wcscpy-ssse3 \
+  wcslen-avx2 \
+  wcslen-avx2-rtm \
+  wcslen-evex \
+  wcslen-sse2 \
+  wcslen-sse4_1 \
+  wcsncmp-avx2 \
+  wcsncmp-avx2-rtm \
+  wcsncmp-evex \
+  wcsncmp-sse2 \
+  wcsnlen-avx2 \
+  wcsnlen-avx2-rtm \
+  wcsnlen-c \
+  wcsnlen-evex \
+  wcsnlen-sse4_1 \
+  wcsrchr-avx2 \
+  wcsrchr-avx2-rtm \
+  wcsrchr-evex \
+  wcsrchr-sse2 \
+  wmemchr-avx2 \
+  wmemchr-avx2-rtm \
+  wmemchr-evex \
+  wmemchr-evex-rtm \
+  wmemchr-sse2 \
+  wmemcmp-avx2-movbe \
+  wmemcmp-avx2-movbe-rtm \
+  wmemcmp-c \
+  wmemcmp-evex-movbe \
+  wmemcmp-ssse3 \
+# sysdep_routines
 endif
 
 ifeq ($(subdir),debug)
-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
-		   memmove_chk-nonshared memset_chk-nonshared \
-		   wmemset_chk-nonshared
+sysdep_routines += \
+  memcpy_chk-nonshared \
+  memmove_chk-nonshared \
+  mempcpy_chk-nonshared \
+  memset_chk-nonshared \
+  wmemset_chk-nonshared \
+# sysdep_routines
 endif
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
deleted file mode 100644
index 639f02bde..000000000
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <sysdep.h>
-
-	.text
-ENTRY(bcopy)
-	xchg	%rdi, %rsi
-	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
-END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
new file mode 100644
index 000000000..13e399a9a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bzero.c
@@ -0,0 +1,108 @@
+/* Multiple versions of bzero.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __bzero __redirect___bzero
+# include <string.h>
+# undef __bzero
+
+/* OPTIMIZE1 definition required for bzero patch.  */
+# define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
+# define SYMBOL_NAME __bzero
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
+  attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx512_unaligned_erms);
+
+	  return OPTIMIZE1 (avx512_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (evex_unaligned_erms);
+
+	  return OPTIMIZE1 (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE1 (avx2_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms);
+
+	  return OPTIMIZE1 (avx2_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    return OPTIMIZE1 (sse2_unaligned_erms);
+
+  return OPTIMIZE1 (sse2_unaligned);
+}
+
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
+
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index bbaf5dcf1..6de72f727 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -21,16 +21,28 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
new file mode 100644
index 000000000..fc391edb8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    return OPTIMIZE (evex_rtm);
+
+	  return OPTIMIZE (evex);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1be5dd032..e5e48b36c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -43,16 +43,41 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memchr_evex)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
   IFUNC_IMPL (i, name, memcmp,
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_avx2_movbe)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __memcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (MOVBE)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memcmp_avx2_movbe_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -64,10 +89,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -75,6 +100,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memmove_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memmove_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_chk_ssse3_back)
@@ -97,14 +136,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memmove_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memmove_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memmove,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_ssse3_back)
@@ -121,6 +174,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memrchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, memrchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, memrchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memrchr_evex)
+
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
 
 #ifdef SHARED
@@ -139,10 +201,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_chk_avx2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memset_chk_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memset_chk_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memset_chk_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
@@ -164,35 +248,135 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_avx2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memset_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memset_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memset_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_avx512_no_vzeroupper)
 	     )
 
+  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned)
+	     )
+
   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __rawmemchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __rawmemchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __rawmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __rawmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strnlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -201,6 +385,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __stpncpy_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -211,14 +403,30 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __stpcpy_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, stpcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_avx)
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_sse42)
@@ -229,9 +437,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strcasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strcasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_l_sse42)
@@ -245,6 +461,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
 			      __strcat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcat,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcat_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strcat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcat_evex)
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -253,16 +477,38 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
 	      IFUNC_IMPL_ADD (array, i, strchr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, strchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strchr_evex)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchrnul_avx2)
+	      IFUNC_IMPL_ADD (array, i, strchrnul,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strchrnul_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strchrnul,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strchrnul_evex)
 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strrchr.c.  */
@@ -270,6 +516,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strrchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strrchr_evex)
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcmp.c.  */
@@ -277,6 +531,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strcmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
 			      __strcmp_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -288,6 +551,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcpy,
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
 			      __strcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcpy,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strcpy_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -302,8 +573,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_avx)
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_sse42)
@@ -315,9 +594,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      CPU_FEATURE_USABLE (AVX),
-			      __strncasecmp_l_avx)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __strncasecmp_l_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncasecmp_l_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_l_sse42)
@@ -331,6 +618,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncat,
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
 			      __strncat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncat,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncat_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strncat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncat_evex)
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -341,6 +636,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
 			      __strncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncpy_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -368,8 +671,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
 	      IFUNC_IMPL_ADD (array, i, wcschr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcschr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcschr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcschr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcschr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcschr_evex)
 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcsrchr.c.  */
@@ -377,6 +691,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wcsrchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wcsrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcsrchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsrchr_evex)
 	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
@@ -384,6 +707,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wcscmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wcscmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcscmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcscmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcscmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcscmp_evex)
 	      IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcsncmp.c.  */
@@ -391,6 +723,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wcsncmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wcsncmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcsncmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsncmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsncmp_evex)
 	      IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
@@ -402,15 +743,40 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      IFUNC_IMPL_ADD (array, i, wcslen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcslen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      CPU_FEATURE_USABLE (SSE4_1),
+			      __wcslen_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
   IFUNC_IMPL (i, name, wcsnlen,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wcsnlen_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
@@ -421,16 +787,41 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wmemchr,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wmemchr_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
   IFUNC_IMPL (i, name, wmemcmp,
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __wmemcmp_sse4_1)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (MOVBE)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wmemcmp_avx2_movbe_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
@@ -443,7 +834,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __wmemset_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_avx512_unaligned))
 
 #ifdef SHARED
@@ -453,10 +855,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -464,6 +866,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memcpy_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memcpy_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_chk_ssse3_back)
@@ -486,6 +902,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memcpy_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __memcpy_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -494,10 +924,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -511,10 +941,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __mempcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -522,6 +952,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __mempcpy_chk_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __mempcpy_chk_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
@@ -542,10 +986,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __mempcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX),
@@ -553,6 +997,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __mempcpy_avx_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      (CPU_FEATURE_USABLE (AVX)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __mempcpy_avx_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -568,6 +1026,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncmp_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncmp,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __strncmp_avx2_rtm)
+	      IFUNC_IMPL_ADD (array, i, strncmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
 			      __strncmp_sse42)
 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
@@ -583,7 +1049,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_chk_avx512_unaligned))
 #endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index d5df541ec..5b9259409 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,22 +21,30 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2_movbe);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex_movbe);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
-    return OPTIMIZE (sse4_1);
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_movbe_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2_movbe);
+    }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index bf42a555d..a14718a97 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -48,21 +56,42 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
 
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
 
-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
     }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx_unaligned_erms);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms_rtm);
+
+	  return OPTIMIZE (avx_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms);
 
-      return OPTIMIZE (avx_unaligned);
+	  return OPTIMIZE (avx_unaligned);
+	}
     }
 
   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 0ac6b1188..eda564054 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -45,21 +53,46 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
 
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
 
-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
     }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (avx2_unaligned_erms);
-      else
-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE (avx2_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
     }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 931770e07..40819caf5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -22,15 +22,28 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
-    return OPTIMIZE (avx);
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+        return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+        return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 1100cd23c..39568f480 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 000000000..39e334737
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+    return OPTIMIZE (sse4_1);
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index c1b0c2254..756f0ccdb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,9 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 
 static inline void *
@@ -27,14 +30,21 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
-	return OPTIMIZE (avx512_unaligned);
-      else
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	    return OPTIMIZE (avx512_unaligned);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_unaligned_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }
 
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
new file mode 100644
index 000000000..87b076c7c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 77a952316..afdb95650 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,319 +26,407 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
 # endif
 
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
 	test	%RDX_LP, %RDX_LP
+#  endif
 	jz	L(null)
 # endif
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
+	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-	vpbroadcastd %xmm0, %ymm0
-# else
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
-#  endif
-	vpbroadcastb %xmm0, %ymm0
-# endif
+	VPBROADCAST %xmm0, %ymm0
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
+L(null):
+	xorl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
-
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
+# endif
 	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
 # endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
 	addq	%rdi, %rax
-	addq	%rcx, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 4
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	/* Check if at last VEC_SIZE * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
+# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
-L(zero):
-	VZEROUPPER
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x2):
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2.  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+L(zero_end2):
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+# endif
 
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
new file mode 100644
index 000000000..198718827
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
new file mode 100644
index 000000000..4d0ed6d13
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -0,0 +1,567 @@
+/* memchr/wmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+#  define MEMCHR	__memchr_evex
+# endif
+
+# ifdef USE_AS_WMEMCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+# endif
+
+	/* In the 4x loop the RTM and non-RTM versions have data pointer
+	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+	   This is represented by BASE_OFFSET. As well because the RTM
+	   version uses vpcmp which stores a bit per element compared where
+	   the non-RTM version uses vpcmpeq which stores a bit per byte
+	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
+	   version.  */
+# ifdef USE_IN_RTM
+#  define VZEROUPPER
+#  define BASE_OFFSET	(VEC_SIZE * 4)
+#  define RET_SCALE	CHAR_SIZE
+# else
+#  define VZEROUPPER	vzeroupper
+#  define BASE_OFFSET	0
+#  define RET_SCALE	1
+# endif
+
+	/* In the return from 4x loop memchr and rawmemchr versions have
+	   data pointers off by VEC_SIZE * 4 with memchr version being
+	   VEC_SIZE * 4 greater.  */
+# ifdef USE_AS_RAWMEMCHR
+#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RET_OFFSET	BASE_OFFSET
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XMMZERO	xmm23
+# define YMMZERO	ymm23
+# define XMMMATCH	xmm16
+# define YMMMATCH	ymm16
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# ifndef SECTION
+#  define SECTION(p)	p##.evex
+# endif
+
+# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
+
+	.section SECTION(.text),"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+	/* Check for zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	L(zero)
+
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+# endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	VPBROADCAST %esi, %YMMMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  */
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+	ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment. So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
+	.p2align 4
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+	   bytes.  */
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
+# endif
+# ifdef USE_AS_WMEMCHR
+	andl	$(CHAR_PER_VEC - 1), %eax
+# endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5
+L(aligned_more):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subl	%edi, %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
+# endif
+# ifdef USE_IN_RTM
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# else
+	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
+	   encodable with EVEX registers (ymm16-ymm31).  */
+	vmovdqa64 %YMMMATCH, %ymm0
+# endif
+
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	/* Two versions of the loop. One that does not require
+	   vzeroupper by not using ymm0-ymm15 and another does that require
+	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+	   is used at all is because there is no EVEX encoding vpcmpeq and
+	   with vpcmpeq this loop can be performed more efficiently. The
+	   non-vzeroupper version is safe for RTM while the vzeroupper
+	   version should be prefered if RTM are not supported.  */
+# ifdef USE_IN_RTM
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but bottleneck on port 5 makes it not woth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes match esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
+# else
+	/* Since vptern can only take 3x vectors fastest to do 1 vec
+	   seperately with EVEX vpcmp.  */
+#  ifdef USE_AS_WMEMCHR
+	/* vptern can only accept masks for epi32/epi64 so can only save
+	   instruction using not equals mask on vptern with wmemchr.  */
+	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+#  else
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+#  endif
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+#  ifdef USE_AS_WMEMCHR
+	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
+	   combines result from VEC0 with zero mask.  */
+	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+	vpmovmskb %ymm4, %ecx
+#  else
+	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
+	vpternlogd $254, %ymm2, %ymm3, %ymm4
+	vpmovmskb %ymm4, %ecx
+	kmovd	%k1, %eax
+#  endif
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+	subq	$-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
+	kortestd %k2, %k3
+# else
+#  ifdef USE_AS_WMEMCHR
+	/* ecx contains not of matches. All 1s means no matches. incl will
+	   overflow and set zeroflag if that is the case.  */
+	incl	%ecx
+#  else
+	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
+	   to ecx is not an issue because if eax is non-zero it will be
+	   used for returning the match. If it is zero the add does
+	   nothing.  */
+	addq	%rax, %rcx
+#  endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
+	jz	L(loop_4x_vec)
+# else
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_4x_vec)
+
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
+	kmovd	%k0, %eax
+	VZEROUPPER
+
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
+
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
+
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
+	/* k1 has not of matches with VEC1.  */
+	kmovd	%k1, %eax
+#  ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+#  else
+	incl	%eax
+#  endif
+# else
+	/* eax already has matches for VEC1.  */
+	testl	%eax, %eax
+# endif
+	jnz	L(last_vec_x1_return)
+
+# ifdef USE_IN_RTM
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %eax
+# else
+	vpmovmskb %ymm2, %eax
+# endif
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
+
+# ifdef USE_IN_RTM
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3_return)
+
+	kmovd	%k3, %eax
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	vpmovmskb %ymm3, %eax
+	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
+	salq	$VEC_SIZE, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+	VZEROUPPER
+# endif
+	ret
+
+	.p2align 4
+L(last_vec_x1_return):
+	tzcntl	%eax, %eax
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x2_return):
+	tzcntl	%eax, %eax
+	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
+	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
+	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
+	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+	VZEROUPPER
+	ret
+
+# ifdef USE_IN_RTM
+	.p2align 4
+L(last_vec_x3_return):
+	tzcntl	%eax, %eax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+# endif
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
+	.p2align 4
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2.  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
+	ret
+
+L(last_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+# endif
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
index 6ad94f577..5a4131cb8 100644
--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -24,7 +24,7 @@
 # undef memchr
 
 # define SYMBOL_NAME memchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
 strong_alias (memchr, __memchr)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
new file mode 100644
index 000000000..cf4eff5d4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMCMP
+# define MEMCMP __memcmp_avx2_movbe_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index cf9c9b8c1..ec9cf0852 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -19,17 +19,23 @@
 #if IS_IN (libc)
 
 /* memcmp/wmemcmp is implemented as:
-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
-      to avoid branches.
-   2. Use overlapping compare to avoid branch.
-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
-      bytes for wmemcmp.
-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < VEC_SIZE
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * VEC_SIZE one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
 
 # include <sysdep.h>
 
@@ -38,8 +44,10 @@
 # endif
 
 # ifdef USE_AS_WMEMCMP
+#  define CHAR_SIZE	4
 #  define VPCMPEQ	vpcmpeqd
 # else
+#  define CHAR_SIZE	1
 #  define VPCMPEQ	vpcmpeqb
 # endif
 
@@ -47,15 +55,19 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
+# define PAGE_SIZE	4096
 
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
            memcmp has to use UNSIGNED comparison for elemnts.
 */
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 	shl	$2, %RDX_LP
@@ -67,118 +79,360 @@ ENTRY (MEMCMP)
 	jb	L(less_vec)
 
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	incl	%eax
+	jnz	L(return_vec_0)
 
 	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_vec)
+	jbe	L(last_1x_vec)
+
+	/* Check second VEC no matter what.  */
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	/* If all 4 VEC where equal eax will be all 1s so incl will
+	   overflow and set zero flag.  */
+	incl	%eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
 
-	VPCMPEQ	%ymm0, %ymm0, %ymm0
-	/* More than 2 * VEC.  */
+	/* Check third and fourth VEC no matter what.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpmovmskb %ymm4, %ecx
+	incl	%ecx
+	jnz	L(return_vec_3)
+
+	/* Go to 4x VEC loop.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
 
-	/* From 4 * VEC to 8 * VEC, inclusively. */
-	vmovdqu	(%rsi), %ymm1
-	VPCMPEQ (%rdi), %ymm1, %ymm1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
 
-	vmovdqu	VEC_SIZE(%rsi), %ymm2
-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
+	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 
-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
 
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 
+	/* Reduce VEC0 - VEC4.  */
 	vpand	%ymm1, %ymm2, %ymm5
 	vpand	%ymm3, %ymm4, %ymm6
-	vpand	%ymm5, %ymm6, %ymm5
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
 
-	vptest	%ymm0, %ymm5
-	jnc	L(4x_vec_end)
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %ymm1
-	VPCMPEQ (%rdi), %ymm1, %ymm1
+	.p2align 4
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	vmovdqu	VEC_SIZE(%rsi), %ymm2
-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-	vpand	%ymm2, %ymm1, %ymm5
+	.p2align 4
+L(return_vec_2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
+	.p2align 5
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0)
 
-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-	vpand	%ymm3, %ymm5, %ymm5
+	vpmovmskb %ymm2, %eax
+	incl	%eax
+	jnz	L(return_vec_1)
 
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-	vpand	%ymm4, %ymm5, %ymm5
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	vptest	%ymm0, %ymm5
-	jnc	L(4x_vec_end)
-	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	.p2align 4
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
+	 */
+	vmovdqu	(%rsi, %rdi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check if s1 pointer at end.  */
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	/* Check last 4 VEC.  */
+	vmovdqu	(%rsi, %rdx), %ymm1
+	VPCMPEQ	(%rdx), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	/* Check second to last VEC. rdx store end pointer of s1 and
+	   ymm3 has already been loaded with second to last VEC from s2.
+	 */
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_2)
+	/* Check last VEC.  */
+	.p2align 4
+L(8x_last_1x_vec):
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+	vpmovmskb %ymm4, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_3)
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_2x_vec):
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+	/* Check second to last VEC.  */
+	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_1_end)
+	/* Check last VEC.  */
+L(last_1x_vec):
+	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0_end)
+	VZEROUPPER_RETURN
 
-L(last_vec):
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-	VZEROUPPER
-	ret
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+	addq	%rdx, %rax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec):
-	/* A byte or int32 is different within 16 or 32 bytes.  */
-	tzcntl	%eax, %ecx
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rcx), %edx
-	cmpl	(%rsi, %rcx), %edx
-L(wmemcmp_return):
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
+	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %edx
-	sub	%edx, %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
+	.p2align 4
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
 	.p2align 4
-L(4):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$CHAR_SIZE, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ	(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	incl	%eax
+	/* Result will be zero if s1 and s2 match. Otherwise first set
+	   bit will be first mismatch.  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0)
 	xorl	%eax, %eax
-	movl	(%rdi), %edx
-	cmpl	(%rsi), %edx
-	jne	L(wmemcmp_return)
-	ret
-# else
+	VZEROUPPER_RETURN
+
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.  */
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$16, %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
+	cmpl	$4, %edx
+	jb	L(between_2_3)
+
 	movbe	(%rdi), %eax
 	movbe	(%rsi), %ecx
 	shlq	$32, %rax
@@ -188,245 +442,127 @@ L(between_4_7):
 	orq	%rdi, %rax
 	orq	%rsi, %rcx
 	subq	%rcx, %rax
-	je	L(exit)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-	ret
-
-	.p2align 4
-L(exit):
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
 	ret
 
 	.p2align 4
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movb	-1(%rdi, %rdx), %al
-	movb	-1(%rsi, %rdx), %cl
-	/* Subtraction is okay because the upper 8 bits are zero.  */
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
+	/* No ymm register was touched.  */
 	ret
 
-	.p2align 4
-L(1):
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	subl	%ecx, %eax
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
 	ret
-# endif
 
-	.p2align 4
+	.p2align 4,, 2
 L(zero):
 	xorl	%eax, %eax
+	/* No ymm register was touched.  */
 	ret
 
 	.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
-	cmpb	$4, %dl
-	je	L(4)
-	jb	L(zero)
+L(between_8_15):
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
 # else
-	cmpb	$1, %dl
-	je	L(1)
-	jb	L(zero)
-	cmpb	$4, %dl
-	jb	L(between_2_3)
-	cmpb	$8, %dl
-	jb	L(between_4_7)
-# endif
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
-	subl    $0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
 	leaq	-8(%rdi, %rdx), %rdi
 	leaq	-8(%rsi, %rdx), %rsi
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
-	subl    $0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
 	ret
+# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
-	subl    $0xffff, %eax
-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
 
 	/* Use overlapping loads to avoid branches.  */
+
+	vmovdqu	-16(%rsi, %rdx), %xmm2
 	leaq	-16(%rdi, %rdx), %rdi
 	leaq	-16(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %xmm2
-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
-	subl    $0xffff, %eax
-	jnz	L(first_vec)
-	ret
-
-	.p2align 4
-L(more_8x_vec):
-	/* More than 8 * VEC.  Check the first VEC.  */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Align the first memory area for aligned loads in the loop.
-	   Compute how much the first memory area is misaligned.  */
-	movq	%rdi, %rcx
-	andl	$(VEC_SIZE - 1), %ecx
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %rcx
-	/* Adjust the second memory area.  */
-	subq	%rcx, %rsi
-	/* Adjust the first memory area which should be aligned now.  */
-	subq	%rcx, %rdi
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqu	(%rsi), %ymm1
-	VPCMPEQ (%rdi), %ymm1, %ymm1
-
-	vmovdqu	VEC_SIZE(%rsi), %ymm2
-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-	vpand	%ymm2, %ymm1, %ymm5
-
-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-	vpand	%ymm3, %ymm5, %ymm5
-
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-	vpand	%ymm4, %ymm5, %ymm5
-
-	vptest	%ymm0, %ymm5
-	jnc	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rsi
-
-	subq	$(VEC_SIZE * 4), %rdx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jae	L(loop_4x_vec)
-
-	/* Less than 4 * VEC.  */
-	cmpq	$VEC_SIZE, %rdx
-	jbe	L(last_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_2x_vec)
-
-L(last_4x_vec):
-	/* From 2 * VEC to 4 * VEC. */
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	vmovdqu	(%rsi), %ymm2
-	VPCMPEQ (%rdi), %ymm2, %ymm2
-	vpmovmskb %ymm2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-	VZEROUPPER
+	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
 	ret
 
-	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	vpmovmskb %ymm2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	subl	$VEC_MASK, %eax
-	tzcntl	%eax, %ecx
 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
-	VZEROUPPER
 	ret
 
 	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rcx), %edx
-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
-	VZEROUPPER
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	/* No ymm register was touched.  */
 	ret
+# else
 
 	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
-	VZEROUPPER
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
+	/* No ymm register was touched.  */
 	ret
+# endif
+
 END (MEMCMP)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 000000000..d2899e7c7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,459 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_evex_movbe
+# endif
+
+# define VMOVU		vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+#  define VMOVU_MASK	vmovdqu32
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
+# else
+#  define VMOVU_MASK	vmovdqu8
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
+# endif
+
+
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+	.section .text.evex,"ax",@progbits
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$CHAR_PER_VEC, %RDX_LP
+	/* Fall through for [0, VEC_SIZE] as its the hottest.  */
+	ja	L(more_1x_vec)
+
+	/* Create mask for CHAR's we want to compare. This allows us to
+	   avoid having to include page cross logic.  */
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k2
+
+	/* Safe to load full ymm with mask.  */
+	VMOVU_MASK (%rsi), %YMM2{%k2}
+	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+	ret
+
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(more_1x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4,(%rdi), %YMM1, %k1
+	kmovd	%k1, %eax
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
+	   must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)
+
+	/* Check second VEC no matter what.  */
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)
+
+	/* Check third and fourth VEC no matter what.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)
+
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)
+
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
+
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+
+	.p2align 4,, 8
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
+
+	.p2align 4
+L(return_vec_1):
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
+L(return_vec_2):
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_end_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4,, 10
+L(8x_last_2x_vec):
+	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
+	ret
+
+
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	/* Don't align. Takes 2-fetch blocks either way and aligning
+	   will cause code to spill into another cacheline.  */
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+	/* 1-byte until next cache line.  */
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index b7ac03456..000000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,1779 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2021 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_sse4_1
-# endif
-
-# define JMPTBL(I, B)	(I - B)
-
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-  lea		TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), %rcx;			\
-  add		%r11, %rcx;					\
-  _CET_NOTRACK jmp *%rcx;					\
-  ud2
-
-/* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-	.section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	pxor	%xmm0, %xmm0
-	cmp	$79, %RDX_LP
-	ja	L(79bytesormore)
-# ifndef USE_AS_WMEMCMP
-	cmp	$1, %RDX_LP
-	je	L(firstbyte)
-# endif
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-# ifndef USE_AS_WMEMCMP
-	.p2align 4
-L(firstbyte):
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-# endif
-
-	.p2align 4
-L(79bytesormore):
-	movdqu	(%rsi), %xmm1
-	movdqu	(%rdi), %xmm2
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-	mov	%rsi, %rcx
-	and	$-16, %rsi
-	add	$16, %rsi
-	sub	%rsi, %rcx
-
-	sub	%rcx, %rdi
-	add	%rcx, %rdx
-	test	$0xf, %rdi
-	jz	L(2aligned)
-
-	cmp	$128, %rdx
-	ja	L(128bytesormore)
-L(less128bytes):
-	sub	$64, %rdx
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-	cmp	$32, %rdx
-	jb	L(less32bytesin64)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin64):
-	add	$64, %rdi
-	add	$64, %rsi
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(128bytesormore):
-	cmp	$512, %rdx
-	ja	L(512bytesormore)
-	cmp	$256, %rdx
-	ja	L(less512bytes)
-L(less256bytes):
-	sub	$128, %rdx
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqu	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqu	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	add	$128, %rsi
-	add	$128, %rdi
-
-	cmp	$64, %rdx
-	jae	L(less128bytes)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin128)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin128):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(less512bytes):
-	sub	$256, %rdx
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqu	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqu	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqu	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqu	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqu	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqu	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	movdqu	128(%rdi), %xmm2
-	pxor	128(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(144bytesin256)
-
-	movdqu	144(%rdi), %xmm2
-	pxor	144(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(160bytesin256)
-
-	movdqu	160(%rdi), %xmm2
-	pxor	160(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(176bytesin256)
-
-	movdqu	176(%rdi), %xmm2
-	pxor	176(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(192bytesin256)
-
-	movdqu	192(%rdi), %xmm2
-	pxor	192(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(208bytesin256)
-
-	movdqu	208(%rdi), %xmm2
-	pxor	208(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(224bytesin256)
-
-	movdqu	224(%rdi), %xmm2
-	pxor	224(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(240bytesin256)
-
-	movdqu	240(%rdi), %xmm2
-	pxor	240(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(256bytesin256)
-
-	add	$256, %rsi
-	add	$256, %rdi
-
-	cmp	$128, %rdx
-	jae	L(less256bytes)
-
-	cmp	$64, %rdx
-	jae	L(less128bytes)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin256)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin256):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(512bytesormore):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	mov	%r8, %r9
-	shr	$1, %r8
-	add	%r9, %r8
-	cmp	%r8, %rdx
-	ja	L(L2_L3_cache_unaglined)
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loop):
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqu	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqu	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqu	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(64bytesormore_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-L(L2_L3_cache_unaglined):
-	sub	$64, %rdx
-	.p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqu	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqu	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqu	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(L2_L3_unaligned_128bytes_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-/*
- * This case is for machines which are sensitive for unaligned instructions.
- */
-	.p2align 4
-L(2aligned):
-	cmp	$128, %rdx
-	ja	L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-	sub	$64, %rdx
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-	cmp	$32, %rdx
-	jb	L(less32bytesin64in2alinged)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin64in2alinged):
-	add	$64, %rdi
-	add	$64, %rsi
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(128bytesormorein2aligned):
-	cmp	$512, %rdx
-	ja	L(512bytesormorein2aligned)
-	cmp	$256, %rdx
-	ja	L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
-	sub	$128, %rdx
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqa	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqa	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	add	$128, %rsi
-	add	$128, %rdi
-
-	cmp	$64, %rdx
-	jae	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin128in2aligned)
-
-	movdqu	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqu	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin128in2aligned):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(256bytesormorein2aligned):
-
-	sub	$256, %rdx
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-
-	movdqa	32(%rdi), %xmm2
-	pxor	32(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(48bytesin256)
-
-	movdqa	48(%rdi), %xmm2
-	pxor	48(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(64bytesin256)
-
-	movdqa	64(%rdi), %xmm2
-	pxor	64(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(80bytesin256)
-
-	movdqa	80(%rdi), %xmm2
-	pxor	80(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(96bytesin256)
-
-	movdqa	96(%rdi), %xmm2
-	pxor	96(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(112bytesin256)
-
-	movdqa	112(%rdi), %xmm2
-	pxor	112(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(128bytesin256)
-
-	movdqa	128(%rdi), %xmm2
-	pxor	128(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(144bytesin256)
-
-	movdqa	144(%rdi), %xmm2
-	pxor	144(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(160bytesin256)
-
-	movdqa	160(%rdi), %xmm2
-	pxor	160(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(176bytesin256)
-
-	movdqa	176(%rdi), %xmm2
-	pxor	176(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(192bytesin256)
-
-	movdqa	192(%rdi), %xmm2
-	pxor	192(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(208bytesin256)
-
-	movdqa	208(%rdi), %xmm2
-	pxor	208(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(224bytesin256)
-
-	movdqa	224(%rdi), %xmm2
-	pxor	224(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(240bytesin256)
-
-	movdqa	240(%rdi), %xmm2
-	pxor	240(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(256bytesin256)
-
-	add	$256, %rsi
-	add	$256, %rdi
-
-	cmp	$128, %rdx
-	jae	L(less256bytesin2alinged)
-
-	cmp	$64, %rdx
-	jae	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	jb	L(less32bytesin256in2alinged)
-
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytesin256)
-
-	movdqa	16(%rdi), %xmm2
-	pxor	16(%rsi), %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(32bytesin256)
-	sub	$32, %rdx
-	add	$32, %rdi
-	add	$32, %rsi
-L(less32bytesin256in2alinged):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-	.p2align 4
-L(512bytesormorein2aligned):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	mov	%r8, %r9
-	shr	$1, %r8
-	add	%r9, %r8
-	cmp	%r8, %rdx
-	ja	L(L2_L3_cache_aglined)
-
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loopin2aligned):
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqa	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqa	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqa	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(64bytesormore_loopin2aligned)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-L(L2_L3_cache_aglined):
-	sub	$64, %rdx
-
-	.p2align 4
-L(L2_L3_aligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqa	(%rdi), %xmm2
-	pxor	(%rsi), %xmm2
-	movdqa	%xmm2, %xmm1
-
-	movdqa	16(%rdi), %xmm3
-	pxor	16(%rsi), %xmm3
-	por	%xmm3, %xmm1
-
-	movdqa	32(%rdi), %xmm4
-	pxor	32(%rsi), %xmm4
-	por	%xmm4, %xmm1
-
-	movdqa	48(%rdi), %xmm5
-	pxor	48(%rsi), %xmm5
-	por	%xmm5, %xmm1
-
-	ptest	%xmm1, %xmm0
-	jnc	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	jae	L(L2_L3_aligned_128bytes_loop)
-
-	add	$64, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
-
-
-	.p2align 4
-L(64bytesormore_loop_end):
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm2, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm3, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	ptest	%xmm4, %xmm0
-	jnc	L(16bytes)
-
-	add	$16, %rdi
-	add	$16, %rsi
-	jmp	L(16bytes)
-
-L(256bytesin256):
-	add	$256, %rdi
-	add	$256, %rsi
-	jmp	L(16bytes)
-L(240bytesin256):
-	add	$240, %rdi
-	add	$240, %rsi
-	jmp	L(16bytes)
-L(224bytesin256):
-	add	$224, %rdi
-	add	$224, %rsi
-	jmp	L(16bytes)
-L(208bytesin256):
-	add	$208, %rdi
-	add	$208, %rsi
-	jmp	L(16bytes)
-L(192bytesin256):
-	add	$192, %rdi
-	add	$192, %rsi
-	jmp	L(16bytes)
-L(176bytesin256):
-	add	$176, %rdi
-	add	$176, %rsi
-	jmp	L(16bytes)
-L(160bytesin256):
-	add	$160, %rdi
-	add	$160, %rsi
-	jmp	L(16bytes)
-L(144bytesin256):
-	add	$144, %rdi
-	add	$144, %rsi
-	jmp	L(16bytes)
-L(128bytesin256):
-	add	$128, %rdi
-	add	$128, %rsi
-	jmp	L(16bytes)
-L(112bytesin256):
-	add	$112, %rdi
-	add	$112, %rsi
-	jmp	L(16bytes)
-L(96bytesin256):
-	add	$96, %rdi
-	add	$96, %rsi
-	jmp	L(16bytes)
-L(80bytesin256):
-	add	$80, %rdi
-	add	$80, %rsi
-	jmp	L(16bytes)
-L(64bytesin256):
-	add	$64, %rdi
-	add	$64, %rsi
-	jmp	L(16bytes)
-L(48bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(32bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(16bytesin256):
-	add	$16, %rdi
-	add	$16, %rsi
-L(16bytes):
-	mov	-16(%rdi), %rax
-	mov	-16(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(8bytes):
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(12bytes):
-	mov	-12(%rdi), %rax
-	mov	-12(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(4bytes):
-	mov	-4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal case for wmemcmp */
-	.p2align 4
-L(65bytes):
-	movdqu	-65(%rdi), %xmm1
-	movdqu	-65(%rsi), %xmm2
-	mov	$-65, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(49bytes):
-	movdqu	-49(%rdi), %xmm1
-	movdqu	-49(%rsi), %xmm2
-	mov	$-49, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(33bytes):
-	movdqu	-33(%rdi), %xmm1
-	movdqu	-33(%rsi), %xmm2
-	mov	$-33, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(17bytes):
-	mov	-17(%rdi), %rax
-	mov	-17(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(9bytes):
-	mov	-9(%rdi), %rax
-	mov	-9(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(13bytes):
-	mov	-13(%rdi), %rax
-	mov	-13(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(5bytes):
-	mov	-5(%rdi), %eax
-	mov	-5(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(66bytes):
-	movdqu	-66(%rdi), %xmm1
-	movdqu	-66(%rsi), %xmm2
-	mov	$-66, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(50bytes):
-	movdqu	-50(%rdi), %xmm1
-	movdqu	-50(%rsi), %xmm2
-	mov	$-50, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(34bytes):
-	movdqu	-34(%rdi), %xmm1
-	movdqu	-34(%rsi), %xmm2
-	mov	$-34, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(18bytes):
-	mov	-18(%rdi), %rax
-	mov	-18(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(10bytes):
-	mov	-10(%rdi), %rax
-	mov	-10(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(14bytes):
-	mov	-14(%rdi), %rax
-	mov	-14(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(6bytes):
-	mov	-6(%rdi), %eax
-	mov	-6(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-L(2bytes):
-	movzwl	-2(%rsi), %ecx
-	movzwl	-2(%rdi), %eax
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(67bytes):
-	movdqu	-67(%rdi), %xmm2
-	movdqu	-67(%rsi), %xmm1
-	mov	$-67, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(51bytes):
-	movdqu	-51(%rdi), %xmm2
-	movdqu	-51(%rsi), %xmm1
-	mov	$-51, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(35bytes):
-	movdqu	-35(%rsi), %xmm1
-	movdqu	-35(%rdi), %xmm2
-	mov	$-35, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(19bytes):
-	mov	-19(%rdi), %rax
-	mov	-19(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-L(11bytes):
-	mov	-11(%rdi), %rax
-	mov	-11(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(15bytes):
-	mov	-15(%rdi), %rax
-	mov	-15(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(7bytes):
-	mov	-7(%rdi), %eax
-	mov	-7(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(3bytes):
-	movzwl	-3(%rdi), %eax
-	movzwl	-3(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin2bytes)
-L(1bytes):
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-# endif
-
-	.p2align 4
-L(68bytes):
-	movdqu	-68(%rdi), %xmm2
-	movdqu	-68(%rsi), %xmm1
-	mov	$-68, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(52bytes):
-	movdqu	-52(%rdi), %xmm2
-	movdqu	-52(%rsi), %xmm1
-	mov	$-52, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(36bytes):
-	movdqu	-36(%rdi), %xmm2
-	movdqu	-36(%rsi), %xmm1
-	mov	$-36, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(20bytes):
-	movdqu	-20(%rdi), %xmm2
-	movdqu	-20(%rsi), %xmm1
-	mov	$-20, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-4(%rsi), %ecx
-
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(69bytes):
-	movdqu	-69(%rsi), %xmm1
-	movdqu	-69(%rdi), %xmm2
-	mov	$-69, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(53bytes):
-	movdqu	-53(%rsi), %xmm1
-	movdqu	-53(%rdi), %xmm2
-	mov	$-53, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(37bytes):
-	movdqu	-37(%rsi), %xmm1
-	movdqu	-37(%rdi), %xmm2
-	mov	$-37, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(21bytes):
-	movdqu	-21(%rsi), %xmm1
-	movdqu	-21(%rdi), %xmm2
-	mov	$-21, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(70bytes):
-	movdqu	-70(%rsi), %xmm1
-	movdqu	-70(%rdi), %xmm2
-	mov	$-70, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(54bytes):
-	movdqu	-54(%rsi), %xmm1
-	movdqu	-54(%rdi), %xmm2
-	mov	$-54, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(38bytes):
-	movdqu	-38(%rsi), %xmm1
-	movdqu	-38(%rdi), %xmm2
-	mov	$-38, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(22bytes):
-	movdqu	-22(%rsi), %xmm1
-	movdqu	-22(%rdi), %xmm2
-	mov	$-22, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(71bytes):
-	movdqu	-71(%rsi), %xmm1
-	movdqu	-71(%rdi), %xmm2
-	mov	$-71, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(55bytes):
-	movdqu	-55(%rdi), %xmm2
-	movdqu	-55(%rsi), %xmm1
-	mov	$-55, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(39bytes):
-	movdqu	-39(%rdi), %xmm2
-	movdqu	-39(%rsi), %xmm1
-	mov	$-39, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(23bytes):
-	movdqu	-23(%rdi), %xmm2
-	movdqu	-23(%rsi), %xmm1
-	mov	$-23, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(72bytes):
-	movdqu	-72(%rsi), %xmm1
-	movdqu	-72(%rdi), %xmm2
-	mov	$-72, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(56bytes):
-	movdqu	-56(%rdi), %xmm2
-	movdqu	-56(%rsi), %xmm1
-	mov	$-56, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(40bytes):
-	movdqu	-40(%rdi), %xmm2
-	movdqu	-40(%rsi), %xmm1
-	mov	$-40, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(24bytes):
-	movdqu	-24(%rdi), %xmm2
-	movdqu	-24(%rsi), %xmm1
-	mov	$-24, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-8(%rsi), %rcx
-	mov	-8(%rdi), %rax
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(73bytes):
-	movdqu	-73(%rsi), %xmm1
-	movdqu	-73(%rdi), %xmm2
-	mov	$-73, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(57bytes):
-	movdqu	-57(%rdi), %xmm2
-	movdqu	-57(%rsi), %xmm1
-	mov	$-57, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(41bytes):
-	movdqu	-41(%rdi), %xmm2
-	movdqu	-41(%rsi), %xmm1
-	mov	$-41, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(25bytes):
-	movdqu	-25(%rdi), %xmm2
-	movdqu	-25(%rsi), %xmm1
-	mov	$-25, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-9(%rdi), %rax
-	mov	-9(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzbl	-1(%rdi), %eax
-	movzbl	-1(%rsi), %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(74bytes):
-	movdqu	-74(%rsi), %xmm1
-	movdqu	-74(%rdi), %xmm2
-	mov	$-74, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(58bytes):
-	movdqu	-58(%rdi), %xmm2
-	movdqu	-58(%rsi), %xmm1
-	mov	$-58, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(42bytes):
-	movdqu	-42(%rdi), %xmm2
-	movdqu	-42(%rsi), %xmm1
-	mov	$-42, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(26bytes):
-	movdqu	-26(%rdi), %xmm2
-	movdqu	-26(%rsi), %xmm1
-	mov	$-26, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-10(%rdi), %rax
-	mov	-10(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	jmp	L(diffin2bytes)
-
-	.p2align 4
-L(75bytes):
-	movdqu	-75(%rsi), %xmm1
-	movdqu	-75(%rdi), %xmm2
-	mov	$-75, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(59bytes):
-	movdqu	-59(%rdi), %xmm2
-	movdqu	-59(%rsi), %xmm1
-	mov	$-59, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(43bytes):
-	movdqu	-43(%rdi), %xmm2
-	movdqu	-43(%rsi), %xmm1
-	mov	$-43, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(27bytes):
-	movdqu	-27(%rdi), %xmm2
-	movdqu	-27(%rsi), %xmm1
-	mov	$-27, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-11(%rdi), %rax
-	mov	-11(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rdi), %eax
-	mov	-4(%rsi), %ecx
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(76bytes):
-	movdqu	-76(%rsi), %xmm1
-	movdqu	-76(%rdi), %xmm2
-	mov	$-76, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(60bytes):
-	movdqu	-60(%rdi), %xmm2
-	movdqu	-60(%rsi), %xmm1
-	mov	$-60, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(44bytes):
-	movdqu	-44(%rdi), %xmm2
-	movdqu	-44(%rsi), %xmm1
-	mov	$-44, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(28bytes):
-	movdqu	-28(%rdi), %xmm2
-	movdqu	-28(%rsi), %xmm1
-	mov	$-28, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-12(%rdi), %rax
-	mov	-12(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-4(%rsi), %ecx
-# ifndef USE_AS_WMEMCMP
-	mov	-4(%rdi), %eax
-	cmp	%eax, %ecx
-# else
-	cmp	-4(%rdi), %ecx
-# endif
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-
-# ifndef USE_AS_WMEMCMP
-/* unreal cases for wmemcmp */
-	.p2align 4
-L(77bytes):
-	movdqu	-77(%rsi), %xmm1
-	movdqu	-77(%rdi), %xmm2
-	mov	$-77, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(61bytes):
-	movdqu	-61(%rdi), %xmm2
-	movdqu	-61(%rsi), %xmm1
-	mov	$-61, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(45bytes):
-	movdqu	-45(%rdi), %xmm2
-	movdqu	-45(%rsi), %xmm1
-	mov	$-45, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(29bytes):
-	movdqu	-29(%rdi), %xmm2
-	movdqu	-29(%rsi), %xmm1
-	mov	$-29, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-13(%rdi), %rax
-	mov	-13(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(78bytes):
-	movdqu	-78(%rsi), %xmm1
-	movdqu	-78(%rdi), %xmm2
-	mov	$-78, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(62bytes):
-	movdqu	-62(%rdi), %xmm2
-	movdqu	-62(%rsi), %xmm1
-	mov	$-62, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(46bytes):
-	movdqu	-46(%rdi), %xmm2
-	movdqu	-46(%rsi), %xmm1
-	mov	$-46, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(30bytes):
-	movdqu	-30(%rdi), %xmm2
-	movdqu	-30(%rsi), %xmm1
-	mov	$-30, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-14(%rdi), %rax
-	mov	-14(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(79bytes):
-	movdqu	-79(%rsi), %xmm1
-	movdqu	-79(%rdi), %xmm2
-	mov	$-79, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(63bytes):
-	movdqu	-63(%rdi), %xmm2
-	movdqu	-63(%rsi), %xmm1
-	mov	$-63, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(47bytes):
-	movdqu	-47(%rdi), %xmm2
-	movdqu	-47(%rsi), %xmm1
-	mov	$-47, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(31bytes):
-	movdqu	-31(%rdi), %xmm2
-	movdqu	-31(%rsi), %xmm1
-	mov	$-31, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-	mov	-15(%rdi), %rax
-	mov	-15(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-# endif
-	.p2align 4
-L(64bytes):
-	movdqu	-64(%rdi), %xmm2
-	movdqu	-64(%rsi), %xmm1
-	mov	$-64, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(48bytes):
-	movdqu	-48(%rdi), %xmm2
-	movdqu	-48(%rsi), %xmm1
-	mov	$-48, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-L(32bytes):
-	movdqu	-32(%rdi), %xmm2
-	movdqu	-32(%rsi), %xmm1
-	mov	$-32, %dl
-	pxor	%xmm1, %xmm2
-	ptest	%xmm2, %xmm0
-	jnc	L(less16bytes)
-
-	mov	-16(%rdi), %rax
-	mov	-16(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-
-	mov	-8(%rdi), %rax
-	mov	-8(%rsi), %rcx
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	xor	%eax, %eax
-	ret
-
-/*
- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
- */
-	.p2align 3
-L(less16bytes):
-	movsbq	%dl, %rdx
-	mov	(%rsi, %rdx), %rcx
-	mov	(%rdi, %rdx), %rax
-	cmp	%rax, %rcx
-	jne	L(diffin8bytes)
-	mov	8(%rsi, %rdx), %rcx
-	mov	8(%rdi, %rdx), %rax
-L(diffin8bytes):
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	shr	$32, %rcx
-	shr	$32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
-	cmp	%eax, %ecx
-	jne	L(diffin4bytes)
-	xor	%eax, %eax
-	ret
-# endif
-
-L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
-	cmp	%cx, %ax
-	jne	L(diffin2bytes)
-	shr	$16, %ecx
-	shr	$16, %eax
-L(diffin2bytes):
-	cmp	%cl, %al
-	jne	L(end)
-	and	$0xffff, %eax
-	and	$0xffff, %ecx
-	sub	%ecx, %eax
-	ret
-
-	.p2align 4
-L(end):
-	and	$0xff, %eax
-	and	$0xff, %ecx
-	sub	%ecx, %eax
-	ret
-# else
-
-/* for wmemcmp */
-	mov	$1, %eax
-	jl	L(nequal_bigger)
-	neg	%eax
-	ret
-
-	.p2align 4
-L(nequal_bigger):
-	ret
-
-L(unreal_case):
-	xor	%eax, %eax
-	ret
-# endif
-
-END (MEMCMP)
-
-	.section .rodata.sse4.1,"a",@progbits
-	.p2align 3
-# ifndef USE_AS_WMEMCMP
-L(table_64bytes):
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
-	.int	JMPTBL (L(1bytes), L(table_64bytes))
-	.int	JMPTBL (L(2bytes), L(table_64bytes))
-	.int	JMPTBL (L(3bytes), L(table_64bytes))
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
-	.int	JMPTBL (L(5bytes), L(table_64bytes))
-	.int	JMPTBL (L(6bytes), L(table_64bytes))
-	.int	JMPTBL (L(7bytes), L(table_64bytes))
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
-	.int	JMPTBL (L(9bytes), L(table_64bytes))
-	.int	JMPTBL (L(10bytes), L(table_64bytes))
-	.int	JMPTBL (L(11bytes), L(table_64bytes))
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
-	.int	JMPTBL (L(13bytes), L(table_64bytes))
-	.int	JMPTBL (L(14bytes), L(table_64bytes))
-	.int	JMPTBL (L(15bytes), L(table_64bytes))
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
-	.int	JMPTBL (L(17bytes), L(table_64bytes))
-	.int	JMPTBL (L(18bytes), L(table_64bytes))
-	.int	JMPTBL (L(19bytes), L(table_64bytes))
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
-	.int	JMPTBL (L(21bytes), L(table_64bytes))
-	.int	JMPTBL (L(22bytes), L(table_64bytes))
-	.int	JMPTBL (L(23bytes), L(table_64bytes))
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
-	.int	JMPTBL (L(25bytes), L(table_64bytes))
-	.int	JMPTBL (L(26bytes), L(table_64bytes))
-	.int	JMPTBL (L(27bytes), L(table_64bytes))
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
-	.int	JMPTBL (L(29bytes), L(table_64bytes))
-	.int	JMPTBL (L(30bytes), L(table_64bytes))
-	.int	JMPTBL (L(31bytes), L(table_64bytes))
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
-	.int	JMPTBL (L(33bytes), L(table_64bytes))
-	.int	JMPTBL (L(34bytes), L(table_64bytes))
-	.int	JMPTBL (L(35bytes), L(table_64bytes))
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
-	.int	JMPTBL (L(37bytes), L(table_64bytes))
-	.int	JMPTBL (L(38bytes), L(table_64bytes))
-	.int	JMPTBL (L(39bytes), L(table_64bytes))
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
-	.int	JMPTBL (L(41bytes), L(table_64bytes))
-	.int	JMPTBL (L(42bytes), L(table_64bytes))
-	.int	JMPTBL (L(43bytes), L(table_64bytes))
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
-	.int	JMPTBL (L(45bytes), L(table_64bytes))
-	.int	JMPTBL (L(46bytes), L(table_64bytes))
-	.int	JMPTBL (L(47bytes), L(table_64bytes))
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
-	.int	JMPTBL (L(49bytes), L(table_64bytes))
-	.int	JMPTBL (L(50bytes), L(table_64bytes))
-	.int	JMPTBL (L(51bytes), L(table_64bytes))
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
-	.int	JMPTBL (L(53bytes), L(table_64bytes))
-	.int	JMPTBL (L(54bytes), L(table_64bytes))
-	.int	JMPTBL (L(55bytes), L(table_64bytes))
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
-	.int	JMPTBL (L(57bytes), L(table_64bytes))
-	.int	JMPTBL (L(58bytes), L(table_64bytes))
-	.int	JMPTBL (L(59bytes), L(table_64bytes))
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
-	.int	JMPTBL (L(61bytes), L(table_64bytes))
-	.int	JMPTBL (L(62bytes), L(table_64bytes))
-	.int	JMPTBL (L(63bytes), L(table_64bytes))
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.int	JMPTBL (L(65bytes), L(table_64bytes))
-	.int	JMPTBL (L(66bytes), L(table_64bytes))
-	.int	JMPTBL (L(67bytes), L(table_64bytes))
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
-	.int	JMPTBL (L(69bytes), L(table_64bytes))
-	.int	JMPTBL (L(70bytes), L(table_64bytes))
-	.int	JMPTBL (L(71bytes), L(table_64bytes))
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
-	.int	JMPTBL (L(73bytes), L(table_64bytes))
-	.int	JMPTBL (L(74bytes), L(table_64bytes))
-	.int	JMPTBL (L(75bytes), L(table_64bytes))
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
-	.int	JMPTBL (L(77bytes), L(table_64bytes))
-	.int	JMPTBL (L(78bytes), L(table_64bytes))
-	.int	JMPTBL (L(79bytes), L(table_64bytes))
-# else
-L(table_64bytes):
-	.int	JMPTBL (L(0bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(4bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(8bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(12bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(16bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(20bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(24bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(28bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(32bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(36bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(40bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(44bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(48bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(52bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(56bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(60bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(68bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(72bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(76bytes), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-	.int	JMPTBL (L(unreal_case), L(table_64bytes))
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
new file mode 100644
index 000000000..67a55f0c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -0,0 +1,17 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define VEC(i)		ymm##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
+# define MOV_SIZE	4
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+# define VZEROUPPER_RETURN jmp	 L(return)
+
+# define SECTION(p)		p##.avx.rtm
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index e195e93f1..975ae6c05 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -4,7 +4,7 @@
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
-
+# define MOV_SIZE	4
 # define SECTION(p)		p##.avx
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515cf..0fa712683 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
 #if IS_IN (libc)
 # define VEC_SIZE	64
-# define VEC(i)		zmm##i
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		zmm16
+# define VEC1		zmm17
+# define VEC2		zmm18
+# define VEC3		zmm19
+# define VEC4		zmm20
+# define VEC5		zmm21
+# define VEC6		zmm22
+# define VEC7		zmm23
+# define VEC8		zmm24
+# define VEC9		zmm25
+# define VEC10		zmm26
+# define VEC11		zmm27
+# define VEC12		zmm28
+# define VEC13		zmm29
+# define VEC14		zmm30
+# define VEC15		zmm31
+# define VEC(i)		VEC##i
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
-
-# define SECTION(p)		p##.avx512
+# define VZEROUPPER
+# define MOV_SIZE	6
+# define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 000000000..88715441f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		ymm16
+# define VEC1		ymm17
+# define VEC2		ymm18
+# define VEC3		ymm19
+# define VEC4		ymm20
+# define VEC5		ymm21
+# define VEC6		ymm22
+# define VEC7		ymm23
+# define VEC8		ymm24
+# define VEC9		ymm25
+# define VEC10		ymm26
+# define VEC11		ymm27
+# define VEC12		ymm28
+# define VEC13		ymm29
+# define VEC14		ymm30
+# define VEC15		ymm31
+# define VEC(i)		VEC##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+# define MOV_SIZE	6
+# define SECTION(p)		p##.evex
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 0980c9537..7b27cbdda 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -30,9 +30,21 @@
       load and aligned store.  Load the last 4 * VEC and first VEC
       before the loop and store them after the loop to support
       overlapping addresses.
-   6. If size >= __x86_shared_non_temporal_threshold and there is no
+   6. On machines with ERMS feature, if size greater than equal or to
+      __x86_rep_movsb_threshold and less than
+      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
+   7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -48,6 +60,14 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -56,6 +76,53 @@
 # endif
 #endif
 
+/* Whether to align before movsb. Ultimately we want 64 byte
+   align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
+#define ALIGN_MOVSB	(VEC_SIZE > 16)
+/* Number of bytes to align movsb to.  */
+#define MOVSB_ALIGN_TO	64
+
+#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
+#define LARGE_MOV_SIZE	(MOV_SIZE > 4)
+
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
+# error MOV_SIZE Unknown
+#endif
+
+#if LARGE_MOV_SIZE
+# define SMALL_SIZE_OFFSET	(4)
+#else
+# define SMALL_SIZE_OFFSET	(0)
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -95,6 +162,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -129,24 +218,21 @@ L(start):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(last_2x_vec):
-#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
-	VZEROUPPER
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(nop):
+#if !(defined USE_MULTIARCH && IS_IN (libc))
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+	VZEROUPPER_RETURN
 #endif
-	ret
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned))
-
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 	cmp	%RDX_LP, %RCX_LP
@@ -218,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
 	movq	%rdi, %rax
 L(start_erms):
 # ifdef __ILP32__
@@ -227,338 +313,614 @@ L(start_erms):
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	/* Load regardless.  */
+	VMOVU	(%rsi), %VEC(0)
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
-L(last_2x_vec):
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
+	 */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
 L(return):
-	VZEROUPPER
+# if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+# else
 	ret
-
-L(movsb):
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	jae	L(more_8x_vec)
-	cmpq	%rsi, %rdi
-	jb	1f
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %r9
-	cmpq	%r9, %rdi
-	/* Avoid slow backward REP MOVSB.  */
-	jb	L(more_8x_vec_backward)
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	movq	%rdi, %rcx
-	subq	%rsi, %rcx
-	jmp	2f
-# endif
-1:
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
-	movq	%rsi, %rcx
-	subq	%rdi, %rcx
-2:
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
-   is N*4GB + [1..63] with N >= 0.  */
-	cmpl	$63, %ecx
-	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
 # endif
-	mov	%RDX_LP, %RCX_LP
-	rep movsb
-L(nop):
+#endif
+
+#if LARGE_MOV_SIZE
+	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
+	   ENTRY block and L(less_vec).  */
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	(%rsi), %ecx
+	movl	(%rsi, %rdx), %esi
+	movl	%ecx, (%rdi)
+	movl	%esi, (%rdi, %rdx)
 	ret
 #endif
 
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 # error Unsupported VEC_SIZE!
 #endif
 #if VEC_SIZE > 32
-	cmpb	$32, %dl
+	cmpl	$32, %edx
 	jae	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
-	cmpb	$16, %dl
+	cmpl	$16, %edx
 	jae	L(between_16_31)
 #endif
-	cmpb	$8, %dl
+	cmpl	$8, %edx
 	jae	L(between_8_15)
-	cmpb	$4, %dl
+#if SMALL_MOV_SIZE
+	cmpl	$4, %edx
+#else
+	subq	$4, %rdx
+#endif
 	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movzbl	(%rsi), %ecx
+	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
+	jl	L(copy_0)
+	movb	(%rsi), %cl
+	je	L(copy_1)
+	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
+	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
+L(copy_1):
 	movb	%cl, (%rdi)
-1:
+L(copy_0):
 	ret
-#if VEC_SIZE > 32
-L(between_32_63):
-	/* From 32 to 63.  No branch when size == 32.  */
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	-32(%rsi,%rdx), %ymm1
-	vmovdqu	%ymm0, (%rdi)
-	vmovdqu	%ymm1, -32(%rdi,%rdx)
-	VZEROUPPER
+
+#if SMALL_MOV_SIZE
+	.p2align 4,, 8
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi, %rdx)
+	movl	%esi, (%rdi)
 	ret
 #endif
+
 #if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
+	.p2align 4,, 8
 L(between_16_31):
 	vmovdqu	(%rsi), %xmm0
-	vmovdqu	-16(%rsi,%rdx), %xmm1
+	vmovdqu	-16(%rsi, %rdx), %xmm1
 	vmovdqu	%xmm0, (%rdi)
-	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	vmovdqu	%xmm1, -16(%rdi, %rdx)
+	/* No ymm registers have been touched.  */
 	ret
 #endif
+
+#if VEC_SIZE > 32
+	.p2align 4,, 10
+L(between_32_63):
+	/* From 32 to 63.  No branch when size == 32.  */
+	VMOVU	(%rsi), %YMM0
+	VMOVU	-32(%rsi, %rdx), %YMM1
+	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VZEROUPPER_RETURN
+#endif
+
+	.p2align 4,, 10
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	-8(%rsi,%rdx), %rcx
+	movq	-8(%rsi, %rdx), %rcx
 	movq	(%rsi), %rsi
-	movq	%rcx, -8(%rdi,%rdx)
 	movq	%rsi, (%rdi)
-	ret
-L(between_4_7):
-	/* From 4 to 7.  No branch when size == 4.  */
-	movl	-4(%rsi,%rdx), %ecx
-	movl	(%rsi), %esi
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%esi, (%rdi)
-	ret
-L(between_2_3):
-	/* From 2 to 3.  No branch when size == 2.  */
-	movzwl	-2(%rsi,%rdx), %ecx
-	movzwl	(%rsi), %esi
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%si, (%rdi)
+	movq	%rcx, -8(%rdi, %rdx)
 	ret
 
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */
+
+	/* VEC(0) and VEC(1) have already been loaded.  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
 #if defined USE_MULTIARCH && IS_IN (libc)
 L(movsb_more_2x_vec):
 	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
 	ja	L(movsb)
 #endif
 L(more_2x_vec):
-	/* More than 2 * VEC and there may be overlap between destination
-	   and source.  */
+	/* More than 2 * VEC and there may be overlap between
+	   destination and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %VEC(0)
+	/* Load VEC(1) regardless. VEC(0) has already been loaded.  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), VEC_SIZE(%rdi)
 	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
-	VZEROUPPER
-	ret
-L(last_4x_vec):
-	/* Copy from 2 * VEC to 4 * VEC. */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VZEROUPPER
-	ret
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 4
 L(more_8x_vec):
-	cmpq	%rsi, %rdi
-	ja	L(more_8x_vec_backward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	/* Load the first VEC and last 4 * VEC to support overlapping
-	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
-	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
-	/* Save start and stop of the destination buffer.  */
-	movq	%rdi, %r11
-	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
-	/* Align destination for aligned stores in the loop.  Compute
-	   how much destination is misaligned.  */
-	movq	%rdi, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %r8
-	/* Adjust source.  */
-	subq	%r8, %rsi
-	/* Adjust destination which should be aligned now.  */
-	subq	%r8, %rdi
-	/* Adjust length.  */
-	addq	%r8, %rdx
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+	/* Check if non-temporal move candidate.  */
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
 	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
+	ja	L(large_memcpy_2x)
 #endif
+	/* To reach this point there cannot be overlap and dst > src. So
+	   check for overlap and src > dst in which case correctness
+	   requires forward copy. Otherwise decide between backward/forward
+	   copy depending on address aliasing.  */
+
+	/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
+	   but less than __x86_shared_non_temporal_threshold.  */
+L(more_8x_vec_check):
+	/* rcx contains dst - src. Add back length (rdx).  */
+	leaq	(%rcx, %rdx), %r8
+	/* If r8 has different sign than rcx then there is overlap so we
+	   must do forward copy.  */
+	xorq	%rcx, %r8
+	/* Isolate just sign bit of r8.  */
+	shrq	$63, %r8
+	/* Get 4k difference dst - src.  */
+	andl	$(PAGE_SIZE - 256), %ecx
+	/* If r8 is non-zero must do foward for correctness. Otherwise
+	   if ecx is non-zero there is 4k False Alaising so do backward
+	   copy.  */
+	addl	%r8d, %ecx
+	jz	L(more_8x_vec_backward)
+
+	/* if rdx is greater than __x86_shared_non_temporal_threshold
+	   but there is overlap, or from short distance movsb.  */
+L(more_8x_vec_forward):
+	/* Load first and last 4 * VEC to support overlapping addresses.
+	 */
+
+	/* First vec was already loaded into VEC(0).  */
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	/* Save begining of dst.  */
+	movq	%rdi, %rcx
+	/* Align dst to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rcx, %rsi
+	/* Finish aligning dst.  */
+	incq	%rdi
+	/* Restore src adjusted with new value for aligned dst.  */
+	addq	%rdi, %rsi
+	/* Store end of buffer minus tail in rdx.  */
+	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
+
+	/* Dont use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%rdi)
-	VMOVA	%VEC(1), VEC_SIZE(%rdi)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
-	cmpq	$(VEC_SIZE * 4), %rdx
+	VMOVU	(%rsi), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	subq	$-(VEC_SIZE * 4), %rsi
+	VMOVA	%VEC(1), (%rdi)
+	VMOVA	%VEC(2), VEC_SIZE(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdi, %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
+	VMOVU	%VEC(7), VEC_SIZE(%rdx)
+	VMOVU	%VEC(8), (%rdx)
 	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
-
+	VMOVU	%VEC(0), (%rcx)
+	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
+	 */
+L(nop_backward):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(more_8x_vec_backward_check_nop):
+	/* rcx contains dst - src. Test for dst == src to skip all of
+	   memmove.  */
+	testq	%rcx, %rcx
+	jz	L(nop_backward)
 L(more_8x_vec_backward):
 	/* Load the first 4 * VEC and last VEC to support overlapping
 	   addresses.  */
-	VMOVU	(%rsi), %VEC(4)
+
+	/* First vec was also loaded into VEC(0).  */
 	VMOVU	VEC_SIZE(%rsi), %VEC(5)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	/* Begining of region for 4x backward copy stored in rcx.  */
+	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
-	/* Save stop of the destination buffer.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %r11
-	/* Align destination end for aligned stores in the loop.  Compute
-	   how much destination end is misaligned.  */
-	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	movq	%r11, %r9
-	movq	%r11, %r8
-	andq	$(VEC_SIZE - 1), %r8
-	/* Adjust source.  */
-	subq	%r8, %rcx
-	/* Adjust the end of destination which should be aligned now.  */
-	subq	%r8, %r9
-	/* Adjust length.  */
-	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Align dst.  */
+	andq	$-(VEC_SIZE), %rcx
+	/* Restore src.  */
+	addq	%rcx, %rsi
+
+	/* Don't use multi-byte nop to align.  */
+	.p2align 4,, 11
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
-	VMOVA	%VEC(0), (%r9)
-	VMOVA	%VEC(1), -VEC_SIZE(%r9)
-	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec_backward)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
+	addq	$(VEC_SIZE * -4), %rsi
+	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
+	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
+	addq	$(VEC_SIZE * -4), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
+	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
+	VZEROUPPER_RETURN
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	/* L(skip_short_movsb_check) is only used with ERMS. Not for
+	   FSRM.  */
+	.p2align 5,, 16
+# if ALIGN_MOVSB
+L(skip_short_movsb_check):
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* If CPU does not have FSRM two options for aligning. Align src
+	   if dst and src 4k alias. Otherwise align dst.  */
+	testl	$(PAGE_SIZE - 512), %ecx
+	jnz	L(movsb_align_dst)
+	/* Fall through. dst and src 4k alias. It's better to align src
+	   here because the bottleneck will be loads dues to the false
+	   dependency on dst.  */
+
+	/* rcx already has dst - src.  */
+	movq	%rcx, %r9
+	/* Add src to len. Subtract back after src aligned. -1 because
+	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
+	leaq	-1(%rsi, %rdx), %rcx
+	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
+	orq	$(MOVSB_ALIGN_TO - 1), %rsi
+	/* Restore dst and len adjusted with new values for aligned dst.
+	 */
+	leaq	1(%rsi, %r9), %rdi
+	subq	%rsi, %rcx
+	/* Finish aligning src.  */
+	incq	%rsi
+
+	rep	movsb
+
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4,, 12
+L(movsb):
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	/* Go to backwards temporal copy if overlap no matter what as
+	   backward REP MOVSB is slow and we don't want to use NT stores if
+	   there is overlap.  */
+	cmpq	%rdx, %rcx
+	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
+	jb	L(more_8x_vec_backward_check_nop)
+# if ALIGN_MOVSB
+	/* Save dest for storing aligning VECs later.  */
+	movq	%rdi, %r8
+# endif
+	/* If above __x86_rep_movsb_stop_threshold most likely is
+	   candidate for NT moves aswell.  */
+	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
+	jae	L(large_memcpy_2x_check)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
+	/* Only avoid short movsb if CPU has FSRM.  */
+	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	L(skip_short_movsb_check)
+#  if AVOID_SHORT_DISTANCE_REP_MOVSB
+	/* Avoid "rep movsb" if RCX, the distance between source and
+	   destination, is N*4GB + [1..63] with N >= 0.  */
+
+	/* ecx contains dst - src. Early check for backward copy
+	   conditions means only case of slow movsb with src = dst + [0,
+	   63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
+	   for that case.  */
+	cmpl	$-64, %ecx
+	ja	L(more_8x_vec_forward)
+#  endif
+# endif
+# if ALIGN_MOVSB
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  endif
+#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
+#   error Unsupported MOVSB_ALIGN_TO
+#  endif
+	/* Fall through means cpu has FSRM. In that case exclusively
+	   align destination.  */
+L(movsb_align_dst):
+	/* Subtract dst from src. Add back after dst aligned.  */
+	subq	%rdi, %rsi
+	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
+	addq	$(MOVSB_ALIGN_TO - 1), %rdi
+	/* Add dst to len. Subtract back after dst aligned.  */
+	leaq	(%r8, %rdx), %rcx
+	/* Finish aligning dst.  */
+	andq	$-(MOVSB_ALIGN_TO), %rdi
+	/* Restore src and len adjusted with new values for aligned dst.
+	 */
+	addq	%rdi, %rsi
+	subq	%rdi, %rcx
+
+	rep	movsb
+
+	/* Store VECs loaded for aligning.  */
+	VMOVU	%VEC(0), (%r8)
+#  if MOVSB_ALIGN_TO > VEC_SIZE
+	VMOVU	%VEC(1), VEC_SIZE(%r8)
+#  endif
+	VZEROUPPER_RETURN
+# else	/* !ALIGN_MOVSB.  */
+L(skip_short_movsb_check):
+	mov	%RDX_LP, %RCX_LP
+	rep	movsb
 	ret
+# endif
+#endif
 
+	.p2align 4,, 10
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
+L(large_memcpy_2x_check):
+	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
+	jb	L(more_8x_vec_check)
+L(large_memcpy_2x):
+	/* To reach this point it is impossible for dst > src and
+	   overlap. Remaining to check is src > dst and overlap. rcx
+	   already contains dst - src. Negate rcx to get src - dst. If
+	   length > rcx then there is overlap and forward copy is best.  */
+	negq	%rcx
+	cmpq	%rcx, %rdx
+	ja	L(more_8x_vec_forward)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+
+	/* First vec was also loaded into VEC(0).  */
+# if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+#  if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+#  endif
+# endif
+	VMOVU	%VEC(0), (%rdi)
+# if VEC_SIZE < 64
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+#  if VEC_SIZE < 32
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+#  endif
+# endif
+
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they
+	   do the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+
+	/* ecx contains -(dst - src). not ecx will return dst - src - 1
+	   which works for testing aliasing.  */
+	notl	%ecx
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
-	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
-	VZEROUPPER
-	ret
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+L(large_memcpy_2x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more
+	   time for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
-	VZEROUPPER
-	ret
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4  * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
new file mode 100644
index 000000000..cea2d2a72
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "memrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index eddede45b..ac7370cb0 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -20,14 +20,22 @@
 
 # include <sysdep.h>
 
+# ifndef MEMRCHR
+#  define MEMRCHR	__memrchr_avx2
+# endif
+
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
 
-	.section .text.avx,"ax",@progbits
-ENTRY (__memrchr_avx2)
+	.section SECTION(.text),"ax",@progbits
+ENTRY (MEMRCHR)
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 	vpbroadcastb %xmm0, %ymm0
@@ -134,8 +142,8 @@ L(loop_4x_vec):
 	vpmovmskb %ymm1, %eax
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(last_4x_vec_or_less):
@@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
 	addq	%rax, %rdx
 	jl	L(zero)
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_2x_vec):
@@ -191,31 +198,27 @@ L(last_2x_vec):
 	jl	L(zero)
 	addl	$(VEC_SIZE * 2), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x0):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x1):
 	bsrl	%eax, %eax
 	addl	$VEC_SIZE, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x2):
 	bsrl	%eax, %eax
 	addl	$(VEC_SIZE * 2), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x3):
@@ -232,8 +235,7 @@ L(last_vec_x1_check):
 	jl	L(zero)
 	addl	$VEC_SIZE, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_x3_check):
@@ -243,12 +245,14 @@ L(last_vec_x3_check):
 	jl	L(zero)
 	addl	$(VEC_SIZE * 3), %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(zero):
-	VZEROUPPER
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+
+	.p2align 4
 L(null):
 	xorl	%eax, %eax
 	ret
@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
 
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_or_less):
@@ -315,8 +318,7 @@ L(last_vec_or_less):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
 	addq	%r8, %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(last_vec_2x_aligned):
@@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
 	bsrl	%eax, %eax
 	addq	%rdi, %rax
 	addq	%r8, %rax
-	VZEROUPPER
-	ret
-END (__memrchr_avx2)
+	VZEROUPPER_RETURN
+END (MEMRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
new file mode 100644
index 000000000..16bf8e02b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -0,0 +1,337 @@
+/* memrchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# define VMOVA		vmovdqa64
+
+# define YMMMATCH	ymm16
+
+# define VEC_SIZE 32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (__memrchr_evex)
+	/* Broadcast CHAR to YMMMATCH.  */
+	vpbroadcastb %esi, %YMMMATCH
+
+	sub	$VEC_SIZE, %RDX_LP
+	jbe	L(last_vec_or_less)
+
+	add	%RDX_LP, %RDI_LP
+
+	/* Check the last VEC_SIZE bytes.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	subq	$(VEC_SIZE * 4), %rdi
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(aligned_more)
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rdx
+	andq	$-VEC_SIZE, %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(aligned_more):
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
+	   There are some overlaps with above if data isn't aligned
+	   to 4 * VEC_SIZE.  */
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	jz	L(loop_4x_vec)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	subq	$(VEC_SIZE * 4), %rdi
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+	kord	%k1, %k2, %k5
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+
+	kord	%k3, %k4, %k6
+	kortestd %k5, %k6
+	jz	L(loop_4x_vec)
+
+	/* There is a match.  */
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	kmovd	%k1, %eax
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_4x_vec_or_less):
+	addl	$(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+	kmovd	%k2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+	kmovd	%k3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
+	cmpl	$(VEC_SIZE * 3), %edx
+	jbe	L(zero)
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3_check)
+	cmpl	$VEC_SIZE, %edx
+	jbe	L(zero)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 2), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x0):
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1):
+	bsrl	%eax, %eax
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1_check):
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3_check):
+	bsrl	%eax, %eax
+	subq	$VEC_SIZE, %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_vec_or_less_aligned):
+	movl	%edx, %ecx
+
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+
+	movl	$1, %edx
+	/* Support rdx << 32.  */
+	salq	%cl, %rdx
+	subq	$1, %rdx
+
+	kmovd	%k1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_or_less):
+	addl	$VEC_SIZE, %edx
+
+	/* Check for zero length.  */
+	testl	%edx, %edx
+	jz	L(zero)
+
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(last_vec_or_less_aligned)
+
+	movl	%ecx, %esi
+	movl	%ecx, %r8d
+	addl	%edx, %esi
+	andq	$-VEC_SIZE, %rdi
+
+	subl	$VEC_SIZE, %esi
+	ja	L(last_vec_2x_aligned)
+
+	/* Check the last VEC.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	kmovd	%k1, %eax
+
+	/* Remove the leading and trailing bytes.  */
+	sarl	%cl, %eax
+	movl	%edx, %ecx
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	ret
+
+	.p2align 4
+L(last_vec_2x_aligned):
+	movl	%esi, %ecx
+
+	/* Check the last VEC.  */
+	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	kmovd	%k1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	/* Check the second last VEC.  */
+	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+
+	movl	%r8d, %ecx
+
+	kmovd	%k1, %eax
+
+	/* Remove the leading bytes.  Must use unsigned right shift for
+	   bsrl below.  */
+	shrl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	ret
+END (__memrchr_evex)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
new file mode 100644
index 000000000..5a5ee6f67
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -0,0 +1,11 @@
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return)
+
+#define SECTION(p) p##.avx.rtm
+#define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+
+#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 7ab3d8984..a093a2831 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,22 +1,44 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX2	1
+
 # define VEC_SIZE	32
+# define MOV_SIZE	4
+# define RET_SIZE	4
+
 # define VEC(i)		ymm##i
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+# define VMOVU     vmovdqu
+# define VMOVA     vmovdqa
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+  movq r, %rax;
+
+# define BZERO_ZERO_VEC0() \
+  vpxor %xmm0, %xmm0, %xmm0
+
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
-# define SECTION(p)		p##.avx
-# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
-# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+# ifndef SECTION
+#  define SECTION(p)		p##.avx
+# endif
+# ifndef MEMSET_SYMBOL
+#  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# endif
+# ifndef BZERO_SYMBOL
+#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
+# endif
+# ifndef WMEMSET_SYMBOL
+#  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
+# endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979ca..727c92133 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,24 +1,40 @@
 #if IS_IN (libc)
+# define USE_WITH_AVX512	1
+
 # define VEC_SIZE	64
-# define VEC(i)		zmm##i
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %xmm0; \
-  vpbroadcastq %xmm0, %zmm0
-
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %xmm0; \
-  vpbroadcastq %xmm0, %zmm0
-
-# define SECTION(p)		p##.avx512
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		zmm16
+# define VEC(i)		VEC##i
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+# define VZEROUPPER
+
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
+
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+
+# define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
-
+# define USE_LESS_VEC_MASK_STORE	1
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 000000000..5d8fa78f0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,40 @@
+#if IS_IN (libc)
+# define USE_WITH_EVEX	1
+
+# define VEC_SIZE	32
+# define MOV_SIZE	6
+# define RET_SIZE	1
+
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+
+# define VMOVU     vmovdqu64
+# define VMOVA     vmovdqa64
+
+# define VZEROUPPER
+
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
+
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+# define USE_LESS_VEC_MASK_STORE	1
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index e4e95fc19..2951f7f5f 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,6 +22,7 @@
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
 # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
@@ -30,9 +31,7 @@
 # endif
 
 # undef weak_alias
-# define weak_alias(original, alias) \
-	.weak bzero; bzero = __bzero
-
+# define weak_alias(original, alias)
 # undef strong_alias
 # define strong_alias(ignored1, ignored2)
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index faa408561..d9c577fb5 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -26,6 +26,10 @@
 
 #include <sysdep.h>
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
+#endif
+
 #ifndef MEMSET_CHK_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
@@ -34,45 +38,83 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
+#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
 # else
 #  define VZEROUPPER
 # endif
 #endif
 
 #ifndef VZEROUPPER_SHORT_RETURN
-# if VEC_SIZE > 16
-#  define VZEROUPPER_SHORT_RETURN	vzeroupper
-# else
-#  define VZEROUPPER_SHORT_RETURN	rep
-# endif
+# define VZEROUPPER_SHORT_RETURN	rep; ret
 #endif
 
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
-#ifndef SECTION
-# error SECTION is not defined!
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET	(VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET	(0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG	rcx
+# define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
+#else
+# define END_REG	rdi
+# define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
+#endif
+
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64	rcx
+# define SET_REG32	ecx
+# define SET_REG16	cx
+# define SET_REG8	cl
+#else
+# define SET_REG64	rsi
+# define SET_REG32	esi
+# define SET_REG16	si
+# define SET_REG8	sil
 #endif
 
-	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
-	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	pxor	%xmm0, %xmm0
-	jmp	L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
+#define PAGE_SIZE 4096
+
+/* Macro to calculate size of small memset block for aligning
+   purposes.  */
+#define SMALL_MEMSET_ALIGN(mov_sz,	ret_sz)	(2 * (mov_sz) + (ret_sz) + 1)
+
+
+#ifndef SECTION
+# error SECTION is not defined!
 #endif
 
+	.section SECTION(.text), "ax", @progbits
 #if IS_IN (libc)
 # if defined SHARED
 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
@@ -83,11 +125,40 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_from_wmemset)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
+#if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+#ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+#ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+#if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned))
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmp	%RDX_LP, %RCX_LP
@@ -96,21 +167,21 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
-L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
@@ -133,20 +204,43 @@ ENTRY (__memset_erms)
 ENTRY (MEMSET_SYMBOL (__memset, erms))
 # endif
 L(stosb):
-	/* Issue vzeroupper before rep stosb.  */
-	VZEROUPPER
 	mov	%RDX_LP, %RCX_LP
 	movzbl	%sil, %eax
 	mov	%RDI_LP, %RDX_LP
 	rep stosb
 	mov	%RDX_LP, %RAX_LP
-	ret
+	VZEROUPPER_RETURN
 # if VEC_SIZE == 16
 END (__memset_erms)
 # else
 END (MEMSET_SYMBOL (__memset, erms))
 # endif
 
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
+# if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+# ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+# ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+# if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
+
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -154,119 +248,277 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
-	VZEROUPPER
-	ret
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+#endif
 
-L(stosb_more_2x_vec):
-	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
-	ja	L(stosb)
+	.p2align 4,, 4
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+#else
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
 #endif
-L(more_2x_vec):
-	cmpq  $(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-L(return):
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
-L(loop_start):
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	VMOVU	%VEC(0), (%rdi)
-	andq	$-(VEC_SIZE * 4), %rcx
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-L(loop):
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	VZEROUPPER_SHORT_RETURN
-	ret
+	/* If have AVX512 mask instructions put L(less_vec) close to
+	   entry as it doesn't take much space and is likely a hot target.
+	 */
+#ifdef USE_LESS_VEC_MASK_STORE
+	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
+L(less_vec_from_wmemset):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer
+	   serious performance degradation when it has to fault supress.
+	 */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	/* This is generally considered a cold target.  */
+	ja	L(cross_page)
 # if VEC_SIZE > 32
-	cmpb	$32, %dl
-	jae	L(between_32_63)
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
 # endif
-# if VEC_SIZE > 16
-	cmpb	$16, %dl
-	jae	L(between_16_31)
+	vmovdqu8 %VEC(0), (%rax){%k1}
+	VZEROUPPER_RETURN
+
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* Include L(stosb_local) here if including L(less_vec) between
+	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+	   L(stosb_more_2x_vec) target.  */
+	.p2align 4,, 10
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
 # endif
-	MOVQ	%xmm0, %rcx
-	cmpb	$8, %dl
-	jae	L(between_8_15)
-	cmpb	$4, %dl
-	jae	L(between_4_7)
-	cmpb	$1, %dl
-	ja	L(between_2_3)
-	jb	1f
-	movb	%cl, (%rdi)
-1:
-	VZEROUPPER
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+	.p2align 4
+L(stosb_more_2x_vec):
+	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+	ja	L(stosb_local)
+#endif
+	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+	   and (4x, 8x] jump to target.  */
+L(more_2x_vec):
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
+
+
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
+	addq	%rdx, %END_REG
+#endif
+
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+	   extra offset to addresses in loop. Used for AVX512 to save space
+	   as no way to get (VEC_SIZE * 4) in imm8.  */
+# if LOOP_4X_OFFSET == 0
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+# endif
+	/* Avoid imm32 compare here to save code size.  */
+	cmpq	%rdi, %rcx
+#else
+	addq	$-(VEC_SIZE * 4), %END_REG
+	cmpq	$(VEC_SIZE * 8), %rdx
+#endif
+	jbe	L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+	/* Set LOOP_REG (rdx).  */
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+	/* Align dst for loop.  */
+	andq	$(VEC_SIZE * -2), %LOOP_REG
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+	VMOVA	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+	cmpq	%END_REG, %LOOP_REG
+	jb	L(loop)
+	.p2align 4,, MOV_SIZE
+L(last_4x_vec):
+	VMOVU	%VEC(0), LOOP_4X_OFFSET(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
 	ret
-# if VEC_SIZE > 32
+#endif
+
+	.p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding.  */
+L(stosb_local):
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
+	mov	%RDX_LP, %RAX_LP
+	VZEROUPPER_RETURN
+# endif
+	/* Define L(less_vec) only if not otherwise defined.  */
+	.p2align 4
+L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
+#if VEC_SIZE > 16
+L(less_vec_no_vdup):
+#endif
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+	cmpl	$32, %edx
+	jge	L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+	cmpl	$16, %edx
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %SET_REG64
+#endif
+#if VEC_SIZE <= 16
+L(less_vec_no_vdup):
+#endif
+	cmpl	$8, %edx
+	jge	L(between_8_15)
+	cmpl	$4, %edx
+	jge	L(between_4_7)
+	cmpl	$1, %edx
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%SET_REG8, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
+
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
+#if VEC_SIZE > 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	vmovdqu	%ymm0, -32(%rdi,%rdx)
-	vmovdqu	%ymm0, (%rdi)
-	VZEROUPPER
-	ret
-# endif
-# if VEC_SIZE > 16
-	/* From 16 to 31.  No branch when size == 16.  */
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VZEROUPPER_RETURN
+#endif
+
+#if VEC_SIZE >= 32
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
-	vmovdqu	%xmm0, -16(%rdi,%rdx)
-	vmovdqu	%xmm0, (%rdi)
-	VZEROUPPER
+	/* From 16 to 31.  No branch when size == 16.  */
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
 	ret
-# endif
-	/* From 8 to 15.  No branch when size == 8.  */
+#endif
+
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rcx, (%rdi)
-	VZEROUPPER
+	/* From 8 to 15.  No branch when size == 8.  */
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%SET_REG64, (%LESS_VEC_REG)
+	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
+#endif
 	ret
+
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%ecx, (%rdi)
-	VZEROUPPER
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%SET_REG32, (%LESS_VEC_REG)
+	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
+#endif
 	ret
+
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%cx, (%rdi)
-	VZEROUPPER
+#ifdef USE_XMM_LESS_VEC
+	movb	%SET_REG8, (%rdi)
+	movb	%SET_REG8, 1(%rdi)
+	movb	%SET_REG8, -1(%rdi, %rdx)
+#else
+	movw	%SET_REG16, (%LESS_VEC_REG)
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+#endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
new file mode 100644
index 000000000..acc5f6e2f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2_rtm
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
new file mode 100644
index 000000000..deda1ca39
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
new file mode 100644
index 000000000..ec942b77b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_evex
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
index 8ef246840..813387540 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -26,7 +26,7 @@
 # undef __rawmemchr
 
 # define SYMBOL_NAME rawmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
 		       IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
new file mode 100644
index 000000000..2b9c07a59
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
new file mode 100644
index 000000000..7c6f26cd9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
new file mode 100644
index 000000000..60a2ccfe5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
new file mode 100644
index 000000000..1570014d1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
deleted file mode 100644
index 647aa0571..000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strcasecmp_l optimized with AVX.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define STRCMP_SSE42 __strcasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 000000000..09957fc3c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 000000000..e2762f2a2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 000000000..58642db74
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
new file mode 100644
index 000000000..637fb557c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCAT
+# define STRCAT __strcat_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index 41de8b2b6..4356fa733 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -30,7 +30,11 @@
 /* Number of bytes in a vector register */
 # define VEC_SIZE	32
 
-	.section .text.avx,"ax",@progbits
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCAT)
 	mov	%rdi, %r9
 # ifdef USE_AS_STRNCAT
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
new file mode 100644
index 000000000..97c3d85b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -0,0 +1,283 @@
+/* strcat with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_evex
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+/* zero register */
+# define XMMZERO	xmm16
+# define YMMZERO	ymm16
+# define YMM0		ymm17
+# define YMM1		ymm18
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE	32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+	cmp	$(VEC_SIZE * 3), %ecx
+	ja	L(fourth_vector_boundary)
+	vpcmpb	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_first_vector)
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	jmp	L(align_vec_size_start)
+L(fourth_vector_boundary):
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	vpcmpb	$0, (%rax), %YMMZERO, %k0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	kmovd	%k0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align_vec_size_start):
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 4), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	kmovd	%k4, %edx
+	add	$(VEC_SIZE * 4), %rax
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 4), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 5), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	add	$VEC_SIZE, %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	add	$VEC_SIZE, %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
+	add	$VEC_SIZE, %rax
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$VEC_SIZE, %rax
+
+	.p2align 4
+L(align_four_vec_loop):
+	VMOVA	(%rax), %YMM0
+	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
+	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
+	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
+	vpminub	%YMM0, %YMM1, %YMM0
+	/* If K0 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM0, %YMMZERO, %k0
+	add	$(VEC_SIZE * 4), %rax
+	ktestd	%k0, %k0
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+	sub	$(VEC_SIZE * 5), %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_null_on_first_vector):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_second_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$VEC_SIZE, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_third_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 2), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fourth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 3), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fifth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+
+	.p2align 4
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-evex.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
new file mode 100644
index 000000000..81f20d1d8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCHR
+# define STRCHR __strchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d0..ef4ce0f36 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
 
@@ -38,217 +40,274 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
-	.section .text.avx,"ax",@progbits
-ENTRY (STRCHR)
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
+	.section SECTION(.text),"ax",@progbits
+ENTRY_P2ALIGN (STRCHR, 5)
+	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
-	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
+	vpxor	%xmm1, %xmm1, %xmm1
+
+	/* Check if we cross page boundary with one vector load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqu	(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+L(first_vec_x1):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	incq	%rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 10
+L(first_vec_x2):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(first_vec_x3):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
-	VZEROUPPER
-	ret
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 10
+L(first_vec_x4):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
 
 	.p2align 4
 L(aligned_more):
-	addq	$VEC_SIZE, %rdi
-
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	vmovdqa	1(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
+
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
 
-	VPCMPEQ %ymm5, %ymm0, %ymm1
-	VPCMPEQ %ymm6, %ymm0, %ymm2
-	VPCMPEQ %ymm7, %ymm0, %ymm3
-	VPCMPEQ %ymm8, %ymm0, %ymm4
+	vpxor	%ymm6, %ymm0, %ymm4
+	vpxor	%ymm7, %ymm0, %ymm5
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	VPCMPEQ %ymm6, %ymm9, %ymm6
-	VPCMPEQ %ymm7, %ymm9, %ymm7
-	VPCMPEQ %ymm8, %ymm9, %ymm8
+	VPMINU	%ymm4, %ymm6, %ymm4
+	VPMINU	%ymm5, %ymm7, %ymm5
 
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
+	VPMINU	%ymm2, %ymm3, %ymm6
+	VPMINU	%ymm4, %ymm5, %ymm7
 
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
+	VPMINU	%ymm6, %ymm7, %ymm7
 
-	vpor	%ymm5, %ymm6, %ymm5
+	VPCMPEQ	%ymm7, %ymm1, %ymm7
+	vpmovmskb %ymm7, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(4x_vec_end)
+	jnz	L(last_vec_x0)
 
-	addq	$(VEC_SIZE * 4), %rdi
 
-	jmp	L(loop_4x_vec)
+	VPCMPEQ	%ymm3, %ymm1, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
 
-	.p2align 4
-L(first_vec_x0):
-	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	VPCMPEQ	%ymm4, %ymm1, %ymm4
+	vpmovmskb %ymm4, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 10
+L(last_vec_x0):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$-(VEC_SIZE * 4 - 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
-	VZEROUPPER
-	ret
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	.p2align 4
-L(first_vec_x2):
+
+	.p2align 4,, 10
+L(last_vec_x1):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
-	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
 	vpmovmskb %ymm3, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+# ifndef USE_AS_STRCHRNUL
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
-	VZEROUPPER
-	ret
+	addq	%rdx, %rax
+	VZEROUPPER_RETURN
 
 END (STRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
new file mode 100644
index 000000000..0b49e0ac5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -0,0 +1,393 @@
+/* strchr/strchrnul optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# ifdef USE_AS_WCSCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_REG	esi
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_REG	sil
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
+# endif
+
+# define XMMZERO	xmm16
+
+# define YMMZERO	ymm16
+# define YMM0		ymm17
+# define YMM1		ymm18
+# define YMM2		ymm19
+# define YMM3		ymm20
+# define YMM4		ymm21
+# define YMM5		ymm22
+# define YMM6		ymm23
+# define YMM7		ymm24
+# define YMM8		ymm25
+
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+	.section .text.evex,"ax",@progbits
+ENTRY_P2ALIGN (STRCHR, 5)
+	/* Broadcast CHAR to YMM0.	*/
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+	   null bytes.  */
+	VMOVU	(%rdi), %YMM1
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
+	jne	L(zero)
+# endif
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+	ret
+
+
+
+	.p2align 4,, 10
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+# else
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
+	.p2align 4
+L(first_vec_x1):
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+# else
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1-byte keeping keeping the block in 1x
+	   fetch block. eax guranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE. Use two alternating methods
+	   for checking VEC to balance latency and port contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
+	   encoding.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
+	vpxorq	%YMM1, %YMM0, %YMM5
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	   k register. Its possible to save either 1 or 2 instructions
+	   using cmp no equals method for either YMM1 or YMM1 and YMM3
+	   respectively but bottleneck on p5 makes it not worth it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
+	vpxorq	%YMM3, %YMM0, %YMM7
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros from either xor or end of string).
+	 */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros. Since k2 and k4 will
+	   have 0 as positions that matched with CHAR which will set
+	   zero in the corresponding destination bytes in YMM2 / YMM4.
+	 */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPTESTN	%YMM4, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+# ifdef USE_AS_WCSCHR
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	bsfl	%eax, %eax
+# else
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	bsfq	%rax, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	   */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x2):
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
+	andq	$-VEC_SIZE, %rdi
+	VMOVA	(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	/* Remove the leading bits.  */
+# ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
+	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	   bytes.  */
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+# endif
+	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	bsfl	%eax, %eax
+
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of
+	   bytes.  */
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rax), %CHAR_REG
+	je	L(cross_page_ret)
+L(zero_end):
+	xorl	%eax, %eax
+L(cross_page_ret):
+# endif
+	ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a15279..691770f33 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -29,16 +29,28 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
     return OPTIMIZE (sse2_no_bsf);
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
new file mode 100644
index 000000000..cdcf818b9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2_rtm
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
new file mode 100644
index 000000000..064fe7ca9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_evex
+#define USE_AS_STRCHRNUL 1
+#include "strchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
new file mode 100644
index 000000000..aecd30d97
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCMP
+# define STRCMP __strcmp_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 53cb7a669..a9806daad 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,41 +20,146 @@
 
 # include <sysdep.h>
 
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
 # ifndef STRCMP
 #  define STRCMP	__strcmp_avx2
 # endif
 
 # define PAGE_SIZE	4096
 
-/* VEC_SIZE = Number of bytes in a ymm register */
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
 # define VEC_SIZE	32
 
-/* Shift for dividing by (VEC_SIZE * 4).  */
-# define DIVIDE_BY_VEC_4_SHIFT	7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
+# define VMOVU	vmovdqu
+# define VMOVA	vmovdqa
 
 # ifdef USE_AS_WCSCMP
-/* Compare packed dwords.  */
+	/* Compare packed dwords.  */
 #  define VPCMPEQ	vpcmpeqd
-/* Compare packed dwords and store minimum.  */
+	/* Compare packed dwords and store minimum.  */
 #  define VPMINU	vpminud
-/* 1 dword char == 4 bytes.  */
+	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
-/* Compare packed bytes.  */
+	/* Compare packed bytes.  */
 #  define VPCMPEQ	vpcmpeqb
-/* Compare packed bytes and store minimum.  */
+	/* Compare packed bytes and store minimum.  */
 #  define VPMINU	vpminub
-/* 1 byte char == 1 byte.  */
+	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
 
+# ifdef USE_AS_STRNCMP
+#  define LOOP_REG	r9d
+#  define LOOP_REG64	r9
+
+#  define OFFSET_REG8	r9b
+#  define OFFSET_REG	r9d
+#  define OFFSET_REG64	r9
+# else
+#  define LOOP_REG	edx
+#  define LOOP_REG64	rdx
+
+#  define OFFSET_REG8	dl
+#  define OFFSET_REG	edx
+#  define OFFSET_REG64	rdx
+# endif
+
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# if defined USE_AS_STRNCMP
+#  define VEC_OFFSET	0
+# else
+#  define VEC_OFFSET	(-VEC_SIZE)
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_avx2
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_avx2
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define xmmZERO	xmm15
+# define ymmZERO	ymm15
+
+# define LCASE_MIN_ymm	%ymm10
+# define LCASE_MAX_ymm	%ymm11
+# define CASE_ADD_ymm	%ymm12
+
+# define LCASE_MIN_xmm	%xmm10
+# define LCASE_MAX_xmm	%xmm11
+# define CASE_ADD_xmm	%xmm12
+
+	/* r11 is never use elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE	%r11
+
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  define REG(x, y) x ## y
+#  define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext)			\
+	vpaddb	REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8);				\
+	vpaddb	REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9);				\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8);			\
+	vpandn	REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9);			\
+	vpaddb	REG(%ext, 8), reg1_in, reg1_out;							\
+	vpaddb	REG(%ext, 9), reg2_in, reg2_out
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_ymm(...)	TOLOWER(__VA_ARGS__, ymm)
+#  define TOLOWER_xmm(...)	TOLOWER(__VA_ARGS__, xmm)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext)			\
+	TOLOWER	(s1_reg, scratch_reg, s2_reg, s2_reg, ext);					\
+	VPCMPEQ	scratch_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext)			\
+	VMOVU	s2_mem, reg_out;											\
+	CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+#  define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+#  define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_ymm(...)
+#  define TOLOWER_xmm(...)
+
+#  define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out)			\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+#  define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+#  define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
            strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -75,788 +180,1142 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.avx,"ax",@progbits
-ENTRY (STRCMP)
+	.section SECTION(.text), "ax", @progbits
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifndef GLABEL
+#  define GLABEL(...)	__VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
+END (GLABEL(STRCASECMP))
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
-	/* Check for simple cases (0 or 1) in offset.  */
+	/* Don't overwrite LOCALE_REG (rcx) until we have pass
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
 	cmp	$1, %RDX_LP
-	je	L(char0)
-	jb	L(zero)
+	/* Signed comparison intentional. We use this branch to also
+	   test cases where length >= 2^63. These very large sizes can be
+	   handled with strcmp as there is no way for that length to
+	   actually bound the buffer.  */
+	jle	L(one_or_less)
 #  ifdef USE_AS_WCSCMP
-	/* Convert units: from wide to byte char.  */
-	shl	$2, %RDX_LP
+	movq	%rdx, %rcx
+
+	/* Multiplying length by sizeof(wchar_t) can result in overflow.
+	   Check if that is possible. All cases where overflow are possible
+	   are cases where length is large enough that it can never be a
+	   bound on valid memory so just use wcscmp.  */
+	shrq	$56, %rcx
+	jnz	OVERFLOW_STRCMP
+
+	leaq	(, %rdx, 4), %rdx
 #  endif
-	/* Register %r11 tracks the maximum offset.  */
-	mov	%RDX_LP, %R11_LP
+# endif
+	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa	L(lcase_min)(%rip), LCASE_MIN_ymm
+	vmovdqa	L(lcase_max)(%rip), LCASE_MAX_ymm
+	vmovdqa	L(case_add)(%rip), CASE_ADD_ymm
 # endif
 	movl	%edi, %eax
-	xorl	%edx, %edx
-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
-	vpxor	%xmm7, %xmm7, %xmm7
 	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
-	jg	L(cross_page)
-	/* Start comparing 4 vectors.  */
-	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	(%rsi), %ymm1, %ymm0
-	VPMINU	%ymm1, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	je	L(next_3_vectors)
-	tzcntl	%ecx, %edx
+	sall	$20, %eax
+	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+	ja	L(page_cross)
+
+L(no_page_cross):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	(%rdi), %ymm0
+	/* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp.
+	   Otherwise converts ymm0 and load from rsi to lower. ymm2 is
+	   scratch and ymm1 is the return.  */
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+	/* 1s at null CHAR.  */
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	/* 1s where s1 and s2 equal AND not null CHAR.  */
+	vpandn	%ymm1, %ymm2, %ymm1
+
+	/* All 1s -> keep going, any 0s -> return.  */
+	vpmovmskb %ymm1, %ecx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
-	   offset (%r11).   */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(vec_0_test_len)
 # endif
+
+	/* All 1s represents all equals. incl will overflow to zero in
+	   all equals case. Otherwise 1s will carry until position of first
+	   mismatch.  */
+	incl	%ecx
+	jz	L(more_3x_vec)
+
+	.p2align 4,, 4
+L(return_vec_0):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	je	L(return)
-L(wcscmp_return):
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret0)
 	setl	%al
 	negl	%eax
 	orl	$1, %eax
-L(return):
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
-	VZEROUPPER
-	ret
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(return_vec_size):
-	tzcntl	%ecx, %edx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
-	   the maximum offset (%r11).  */
-	addq	$VEC_SIZE, %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	.p2align 4,, 8
+L(vec_0_test_len):
+	notl	%ecx
+	bzhil	%edx, %ecx, %eax
+	jnz	L(return_vec_0)
+	/* Align if will cross fetch block.  */
+	.p2align 4,, 2
+L(ret_zero):
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 5
+L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
 #  endif
-# else
+	jb	L(ret_zero)
+	/* 'nbe' covers the case where length is negative (large
+	   unsigned).  */
+	jnbe	OVERFLOW_STRCMP
 #  ifdef USE_AS_WCSCMP
+	movl	(%rdi), %edx
 	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(%rsi), %edx
+	je	L(ret1)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
-# endif
-	VZEROUPPER
+L(ret1):
 	ret
+# endif
 
-	.p2align 4
-L(return_2_vec_size):
-	tzcntl	%ecx, %edx
+	.p2align 4,, 10
+L(return_vec_1):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 2), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
+	   overflow.  */
+	addq	$-VEC_SIZE, %rdx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_SIZE(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	je	L(ret2)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 # endif
-	VZEROUPPER
-	ret
+L(ret2):
+	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_3_vec_size):
-	tzcntl	%ecx, %edx
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
-	   after the maximum offset (%r11).  */
-	addq	$(VEC_SIZE * 3), %rdx
-	cmpq	%r11, %rdx
-	jae	L(zero)
-#  ifdef USE_AS_WCSCMP
+L(return_vec_3):
+	salq	$32, %rcx
+# endif
+
+L(return_vec_2):
+# ifndef USE_AS_STRNCMP
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	je	L(ret3)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 # else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret3):
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	je	L(ret4)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
 #  else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
 #  endif
+L(ret4):
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
-	.p2align 4
-L(next_3_vectors):
-	vmovdqu	VEC_SIZE(%rdi), %ymm6
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
-	VPMINU	%ymm6, %ymm3, %ymm3
-	VPCMPEQ	%ymm7, %ymm3, %ymm3
-	vpmovmskb %ymm3, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_vec_size)
-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
-	VPMINU	%ymm5, %ymm2, %ymm2
-	VPCMPEQ	%ymm4, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm2, %ymm2
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_2_vec_size)
-	VPMINU	%ymm4, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(return_3_vec_size)
-L(main_loop_header):
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	movl	$PAGE_SIZE, %ecx
-	/* Align load via RAX.  */
-	andq	$-(VEC_SIZE * 4), %rdx
-	subq	%rdi, %rdx
-	leaq	(%rdi, %rdx), %rax
+	.p2align 4,, 10
+L(more_3x_vec):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	VEC_SIZE(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_1)
+
 # ifdef USE_AS_STRNCMP
-	/* Starting from this point, the maximum offset, or simply the
-	   'offset', DECREASES by the same amount when base pointers are
-	   moved forward.  Return 0 when:
-	     1) On match: offset <= the matched vector index.
-	     2) On mistmach, offset is before the mistmatched index.
+	subq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_2)
+
+	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	/* any non-zero positive value that doesn't inference with 0x1.
 	 */
-	subq	%rdx, %r11
-	jbe	L(zero)
-# endif
-	addq	%rsi, %rdx
-	movq	%rdx, %rsi
-	andl	$(PAGE_SIZE - 1), %esi
-	/* Number of bytes before page crossing.  */
-	subq	%rsi, %rcx
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
-	movl	%ecx, %esi
-	jmp	L(loop_start)
+	movl	$2, %r8d
+
+# else
+	xorl	%r8d, %r8d
+# endif
+
+	/* The prepare labels are various entry points from the page
+	   cross logic.  */
+L(prepare_loop):
 
-	.p2align 4
-L(loop):
 # ifdef USE_AS_STRNCMP
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
-	   the maximum offset (%r11) by the same amount.  */
-	subq	$(VEC_SIZE * 4), %r11
-	jbe	L(zero)
-# endif
-	addq	$(VEC_SIZE * 4), %rax
-	addq	$(VEC_SIZE * 4), %rdx
-L(loop_start):
-	testl	%esi, %esi
-	leal	-1(%esi), %esi
-	je	L(loop_cross_page)
-L(back_to_loop):
-	/* Main loop, comparing 4 vectors are a time.  */
-	vmovdqa	(%rax), %ymm0
-	vmovdqa	VEC_SIZE(%rax), %ymm3
-	VPCMPEQ	(%rdx), %ymm0, %ymm4
-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
-	VPMINU	%ymm0, %ymm4, %ymm4
-	VPMINU	%ymm3, %ymm1, %ymm1
-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
-	VPMINU	%ymm1, %ymm4, %ymm0
-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
-	VPMINU	%ymm2, %ymm5, %ymm5
-	VPMINU	%ymm3, %ymm6, %ymm6
-	VPMINU	%ymm5, %ymm0, %ymm0
-	VPMINU	%ymm6, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-
-	/* Test each mask (32 bits) individually because for VEC_SIZE
-	   == 32 is not possible to OR the four masks and keep all bits
-	   in a 64-bit integer register, differing from SSE2 strcmp
-	   where ORing is possible.  */
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	je	L(loop)
-	VPCMPEQ	%ymm7, %ymm4, %ymm0
-	vpmovmskb %ymm0, %edi
-	testl	%edi, %edi
-	je	L(test_vec)
-	tzcntl	%edi, %ecx
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
+	   the loop.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
+# endif
+L(prepare_loop_no_len):
+
+	/* Align s1 and adjust s2 accordingly.  */
+	subq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 4), %rdi
+	addq	%rdi, %rsi
+
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
-# else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	subq	%rdi, %rdx
 # endif
-	VZEROUPPER
-	ret
 
+L(prepare_loop_aligned):
+	/* eax stores distance from rsi to next page cross. These cases
+	   need to be handled specially as the 4x loop could potentially
+	   read memory past the length of s1 or s2 and across a page
+	   boundary.  */
+	movl	$-(VEC_SIZE * 4), %eax
+	subl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+
+	/* Loop 4x comparisons at a time.  */
 	.p2align 4
-L(test_vec):
+L(loop):
+
+	/* End condition for strncmp.  */
 # ifdef USE_AS_STRNCMP
-	/* The first vector matched.  Return 0 if the maximum offset
-	   (%r11) <= VEC_SIZE.  */
-	cmpq	$VEC_SIZE, %r11
-	jbe	L(zero)
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(ret_zero)
 # endif
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+
+	/* Check if rsi loads will cross a page boundary.  */
+	addl	$-(VEC_SIZE * 4), %eax
+	jnb	L(page_cross_during_loop)
+
+	/* Loop entry after handling page cross during loop.  */
+L(loop_skip_page_cross_check):
+	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
+	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+
+	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
+	CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+	CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+
+	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
+	   zero.  */
+	vpand	%ymm0, %ymm1, %ymm1
+
+
+	vpand	%ymm2, %ymm3, %ymm3
+	vpand	%ymm4, %ymm5, %ymm5
+	vpand	%ymm6, %ymm7, %ymm7
+
+	VPMINU	%ymm1, %ymm3, %ymm3
+	VPMINU	%ymm5, %ymm7, %ymm7
+
+	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
+	VPMINU	%ymm3, %ymm7, %ymm7
+
+	/* If any 0 CHAR then done.  */
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
+	vpmovmskb %ymm7, %LOOP_REG
+	testl	%LOOP_REG, %LOOP_REG
+	jz	L(loop)
+
+	/* Find which VEC has the mismatch of end of string.  */
+	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
 	vpmovmskb %ymm1, %ecx
 	testl	%ecx, %ecx
-	je	L(test_2_vec)
-	tzcntl	%ecx, %edi
+	jnz	L(return_vec_0_end)
+
+
+	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
+	vpmovmskb %ymm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_1_end)
+
+L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
-	addq	$VEC_SIZE, %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	subq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
+	vpmovmskb %ymm5, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_2_end)
+
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
+	   must entirely be from VEC 3 which is fully represented by
+	   LOOP_REG.  */
+	tzcntl	%LOOP_REG, %LOOP_REG
+
+# ifdef USE_AS_STRNCMP
+	subl	$-(VEC_SIZE), %LOOP_REG
+	cmpq	%LOOP_REG64, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
 	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	je	L(ret5)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
-	VZEROUPPER
-	ret
+L(ret5):
+	VZEROUPPER_RETURN
 
-	.p2align 4
-L(test_2_vec):
 # ifdef USE_AS_STRNCMP
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 2 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 2), %r11
-	jbe	L(zero)
+	.p2align 4,, 2
+L(ret_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
 # endif
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
-	vpmovmskb %ymm5, %ecx
-	testl	%ecx, %ecx
-	je	L(test_3_vec)
-	tzcntl	%ecx, %edi
+
+
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
+	   they use the value of `r8` to negate the return value. This is
+	   because the page cross logic can swap `rdi` and `rsi`.  */
+	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rdi
-	cmpq	%rdi, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+L(return_vec_1_end):
+	salq	$32, %rcx
+# endif
+L(return_vec_0_end):
+# ifndef USE_AS_STRNCMP
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rdi), %ecx
-	cmpl	(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rdi), %eax
-	movzbl	(%rdx, %rdi), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret6)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret6):
+	VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_1_end):
+	tzcntl	%ecx, %ecx
 #  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	movl	VEC_SIZE(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
-	jne	L(wcscmp_return)
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	je	L(ret7)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 #  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
-	subl	%edx, %eax
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 #  endif
+L(ret7):
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
-	.p2align 4
-L(test_3_vec):
+	.p2align 4,, 10
+L(return_vec_2_end):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
-	   (%r11) <= 3 * VEC_SIZE.  */
-	cmpq	$(VEC_SIZE * 3), %r11
-	jbe	L(zero)
-# endif
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
-	vpmovmskb %ymm6, %esi
-	tzcntl	%esi, %ecx
-# ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 3), %rcx
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_page_cross)
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %esi
-	cmpl	(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	je	L(ret11)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
-	VZEROUPPER
-	ret
+L(ret11):
+	VZEROUPPER_RETURN
+
+
+	/* Page cross in rsi in next 4x VEC.  */
+
+	/* TODO: Improve logic here.  */
+	.p2align 4,, 10
+L(page_cross_during_loop):
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+
+	/* Optimistically rsi and rdi and both aligned inwhich case we
+	   don't need any logic here.  */
+	cmpl	$-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to loop and we will
+	   never hit page cross case again.  */
+	je	L(loop_skip_page_cross_check)
+
+	/* Check if we can safely load a VEC.  */
+	cmpl	$-(VEC_SIZE * 3), %eax
+	jle	L(less_1x_vec_till_page_cross)
+
+	VMOVA	(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_0_end)
+
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
+	cmpl	$-(VEC_SIZE * 2), %eax
+	jg	L(more_2x_vec_till_page_cross)
+
+	.p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+	subl	$-(VEC_SIZE * 4), %eax
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
+	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+
+	/* Mask of potentially valid bits. The lower bits can be out of
+	   range comparisons (but safe regarding page crosses).  */
+	movl	$-1, %r10d
+	shlxl	%esi, %r10d, %r10d
+	notl	%ecx
 
-	.p2align 4
-L(loop_cross_page):
-	xorl	%r10d, %r10d
-	movq	%rdx, %rcx
-	/* Align load via RDX.  We load the extra ECX bytes which should
-	   be ignored.  */
-	andl	$((VEC_SIZE * 4) - 1), %ecx
-	/* R10 is -RCX.  */
-	subq	%rcx, %r10
-
-	/* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-#  error (VEC_SIZE * 2) != 64
-# endif
-
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
-	cmpl	$(VEC_SIZE * 2), %ecx
-	jge	L(loop_cross_page_2_vec)
-
-	vmovdqu	(%rax, %r10), %ymm2
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
-	VPMINU	%ymm2, %ymm0, %ymm0
-	VPMINU	%ymm3, %ymm1, %ymm1
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
-
-	vpmovmskb %ymm0, %edi
-	vpmovmskb %ymm1, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
-
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
-	shrq	%cl, %rdi
-
-	testq	%rdi, %rdi
-	je	L(loop_cross_page_2_vec)
-	tzcntq	%rdi, %rcx
 # ifdef USE_AS_STRNCMP
-	cmpq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	cmpq	%rax, %rdx
+	jbe	L(return_page_cross_end_check)
+# endif
+	movl	%eax, %OFFSET_REG
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+	andl	%r10d, %ecx
+	jz	L(loop_skip_page_cross_check)
+
+	.p2align 4,, 3
+L(return_page_cross_end):
+	tzcntl	%ecx, %ecx
+
+# ifdef USE_AS_STRNCMP
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
+L(return_page_cross_cmp_mem):
+# else
+	addl	%OFFSET_REG, %ecx
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret8)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret8):
+	VZEROUPPER_RETURN
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+	tzcntl	%ecx, %ecx
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
+	cmpl	%ecx, %edx
+	ja	L(return_page_cross_cmp_mem)
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
-	.p2align 4
-L(loop_cross_page_2_vec):
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
-	VPMINU	%ymm2, %ymm5, %ymm5
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
-	VPMINU	%ymm3, %ymm6, %ymm6
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
-
-	vpmovmskb %ymm5, %edi
-	vpmovmskb %ymm6, %esi
-
-	salq	$32, %rsi
-	xorq	%rsi, %rdi
 
-	xorl	%r8d, %r8d
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
-	subl	$(VEC_SIZE * 2), %ecx
-	jle	1f
-	/* Skip ECX bytes.  */
-	shrq	%cl, %rdi
-	/* R8 has number of bytes skipped.  */
-	movl	%ecx, %r8d
-1:
-	/* Before jumping back to the loop, set ESI to the number of
-	   VEC_SIZE * 4 blocks before page crossing.  */
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
-
-	testq	%rdi, %rdi
+	.p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If more 2x vec till cross we will complete a full loop
+	   iteration here.  */
+
+	VMOVU	VEC_SIZE(%rdi), %ymm0
+	CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_1_end)
+
 # ifdef USE_AS_STRNCMP
-	/* At this point, if %rdi value is 0, it already tested
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
-	   checks whether strncmp maximum offset reached or not.  */
-	je	L(string_nbyte_offset_check)
-# else
-	je	L(back_to_loop)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
 # endif
-	tzcntq	%rdi, %rcx
-	addq	%r10, %rcx
-	/* Adjust for number of bytes skipped.  */
-	addq	%r8, %rcx
+
+	subl	$-(VEC_SIZE * 4), %eax
+
+	/* Safe to include comparisons from lower bytes.  */
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_page_cross_0)
+
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
+	CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+	jnz	L(return_vec_page_cross_1)
+
 # ifdef USE_AS_STRNCMP
-	addq	$(VEC_SIZE * 2), %rcx
-	subq	%rcx, %r11
-	jbe	L(zero)
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
+	/* Must check length here as length might proclude reading next
+	   page.  */
+	cmpq	%rax, %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
+# endif
+
+	/* Finish the loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
+
+	CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+	CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+	vpand	%ymm4, %ymm5, %ymm5
+	vpand	%ymm6, %ymm7, %ymm7
+	VPMINU	%ymm5, %ymm7, %ymm7
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
+	vpmovmskb %ymm7, %LOOP_REG
+	testl	%LOOP_REG, %LOOP_REG
+	jnz	L(return_vec_2_3_end)
+
+	/* Best for code size to include ucond-jmp here. Would be faster
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
+	   as fall-through and have jump back to loop on mismatch
+	   comparison.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
 	xorl	%eax, %eax
-	movl	(%rsi, %rcx), %edi
-	cmpl	(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rax, %rcx), %eax
-	movzbl	(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	VZEROUPPER_RETURN
 # else
-#  ifdef USE_AS_WCSCMP
-	movq	%rax, %rsi
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
-	subl	%edx, %eax
-#  endif
+	jmp	L(loop_skip_page_cross_check)
 # endif
-	VZEROUPPER
-	ret
 
+
+	.p2align 4,, 10
+L(return_vec_page_cross_0):
+	addl	$-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_STRNCMP
-L(string_nbyte_offset_check):
-	leaq	(VEC_SIZE * 4)(%r10), %r10
-	cmpq	%r10, %r11
-	jbe	L(zero)
-	jmp	L(back_to_loop)
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
+# else
+	addl	%eax, %ecx
 # endif
 
-	.p2align 4
-L(cross_page_loop):
-	/* Check one byte/dword at a time.  */
 # ifdef USE_AS_WCSCMP
-	cmpl	%ecx, %eax
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret9)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
 # else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
-	jne	L(different)
-	addl	$SIZE_OF_CHAR, %edx
-	cmpl	$(VEC_SIZE * 4), %edx
-	je	L(main_loop_header)
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
+L(ret9):
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+	/* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where stop condition is guranteed to be
+	   reachable by just reading memory.  */
+	testl	$((VEC_SIZE - 1) << 20), %eax
+	jz	L(no_page_cross)
 # endif
+
+	movl	%edi, %eax
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %eax
+	andl	$(PAGE_SIZE - 1), %ecx
+
+	xorl	%OFFSET_REG, %OFFSET_REG
+
+	/* Check which is closer to page cross, s1 or s2.  */
+	cmpl	%eax, %ecx
+	jg	L(page_cross_s2)
+
+	/* The previous page cross check has false positives. Check for
+	   true positive as page cross logic is very expensive.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+	jbe	L(no_page_cross)
+
+	/* Set r8 to not interfere with normal return value (rdi and rsi
+	   did not swap).  */
 # ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
+	xorl	%r8d, %r8d
 # endif
-	/* Check null char.  */
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
-	   comparisons.  */
-	subl	%ecx, %eax
-# ifndef USE_AS_WCSCMP
-L(different):
+
+	/* Check if less than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jg	L(less_1x_vec_till_page)
+
+	/* If more than 1x VEC till page cross, loop throuh safely
+	   loadable memory until within 1x VEC of page cross.  */
+
+	.p2align 4,, 10
+L(page_cross_loop):
+
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+	incl	%ecx
+
+	jnz	L(check_ret_vec_page_cross)
+	addl	$VEC_SIZE, %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross)
 # endif
-	VZEROUPPER
-	ret
+	addl	$VEC_SIZE, %eax
+	jl	L(page_cross_loop)
+
+	subl	%eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guranteed to be
+	   safe.  */
 
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
+	CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
+	vpandn	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %ecx
+
+# ifdef USE_AS_STRNCMP
+	leal	VEC_SIZE(%OFFSET_REG64), %eax
+	cmpq	%rax, %rdx
+	jbe	L(check_ret_vec_page_cross2)
+	addq	%rdi, %rdx
+# endif
+	incl	%ecx
+	jz	L(prepare_loop_no_len)
+
+	.p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
+# endif
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
-	.p2align 4
-L(different):
-	/* Use movl to avoid modifying EFLAGS.  */
-	movl	$0, %eax
+	movl	(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx), %edx
+	je	L(ret12)
 	setl	%al
 	negl	%eax
-	orl	$1, %eax
-	VZEROUPPER
-	ret
+	xorl	%r8d, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 # endif
+L(ret12):
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCMP
-	.p2align 4
-L(zero):
+	.p2align 4,, 10
+L(check_ret_vec_page_cross2):
+	incl	%ecx
+L(check_ret_vec_page_cross):
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_page_cross_cont)
+	.p2align 4,, 2
+L(ret_zero_page_cross):
 	xorl	%eax, %eax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+# endif
 
-	.p2align 4
-L(char0):
-#  ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(wcscmp_return)
-#  else
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
-#  endif
-	VZEROUPPER
-	ret
+	.p2align 4,, 4
+L(page_cross_s2):
+	/* Ensure this is a true page cross.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
+	jbe	L(no_page_cross)
+
+
+	movl	%ecx, %eax
+	movq	%rdi, %rcx
+	movq	%rsi, %rdi
+	movq	%rcx, %rsi
+
+	/* set r8 to negate return value as rdi and rsi swapped.  */
+# ifdef USE_AS_WCSCMP
+	movl	$-4, %r8d
+# else
+	movl	$-1, %r8d
 # endif
+	xorl	%OFFSET_REG, %OFFSET_REG
 
-	.p2align 4
-L(last_vector):
-	addq	%rdx, %rdi
-	addq	%rdx, %rsi
+	/* Check if more than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jle	L(page_cross_loop)
+
+	.p2align 4,, 6
+L(less_1x_vec_till_page):
+	/* Find largest load size we can use.  */
+	cmpl	$16, %eax
+	ja	L(less_16_till_page)
+
+	VMOVU	(%rdi), %xmm0
+	CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incw	%cx
+	jnz	L(check_ret_vec_page_cross)
+	movl	$16, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
-	subq	%rdx, %r11
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subl	%eax, %OFFSET_REG
+# else
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
 # endif
-	tzcntl	%ecx, %edx
+
+	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
+	CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incw	%cx
+	jnz	L(check_ret_vec_page_cross)
+
 # ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rdx), %ecx
-	cmpl	(%rsi, %rdx), %ecx
-	jne	L(wcscmp_return)
+	addl	$16, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
 # else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %edx
-	subl	%edx, %eax
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
 # endif
-	VZEROUPPER
+	jmp	L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	/* Comparing on page boundary region requires special treatment:
-	   It must done one vector at the time, starting with the wider
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
-	   (xmm) still passes the boundary, byte comparison must be done.
-	 */
-	.p2align 4
-L(cross_page):
-	/* Try one ymm vector at a time.  */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_1_vector)
-L(loop_1_vector):
-	vmovdqu	(%rdi, %rdx), %ymm1
-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
-	VPMINU	%ymm1, %ymm0, %ymm0
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
-	vpmovmskb %ymm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
 
-	addl	$VEC_SIZE, %edx
+	.p2align 4,, 10
+L(less_16_till_page):
+	/* Find largest load size we can use.  */
+	cmpl	$24, %eax
+	ja	L(less_8_till_page)
+
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incb	%cl
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$VEC_SIZE, %eax
-# ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jle	L(loop_1_vector)
-L(cross_page_1_vector):
-	/* Less than 32 bytes to check, try one xmm vector.  */
-	cmpl	$(PAGE_SIZE - 16), %eax
-	jg	L(cross_page_1_xmm)
-	vmovdqu	(%rdi, %rdx), %xmm1
-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
 
-	addl	$16, %edx
-# ifndef USE_AS_WCSCMP
-	addl	$16, %eax
+# ifdef USE_AS_STRNCMP
+	cmpq	$8, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
+	movl	$24, %OFFSET_REG
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+
+
+
+	vmovq	(%rdi, %OFFSET_REG64), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	incb	%cl
+	jnz	L(check_ret_vec_page_cross)
+
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
-	cmpl	$(PAGE_SIZE - 8), %eax
-	jg	L(cross_page_8bytes)
-	vmovq	(%rdi, %rdx), %xmm1
-	vmovq	(%rsi, %rdx), %xmm0
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	/* Only last 8 bits are valid.  */
-	andl	$0xff, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
+	addl	$8, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# endif
+	jmp	L(prepare_loop_aligned)
 
-	addl	$8, %edx
-	addl	$8, %eax
+
+	.p2align 4,, 10
+L(less_8_till_page):
+# ifdef USE_AS_WCSCMP
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addq	%rdi, %rdx
+	/* We already checked for len <= 1 so cannot hit that case here.
+	 */
 #  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop_no_len)
+	ret
 
-L(cross_page_8bytes):
-	/* Less than 8 bytes to check, try 4 byte vector.  */
-	cmpl	$(PAGE_SIZE - 4), %eax
-	jg	L(cross_page_4bytes)
-	vmovd	(%rdi, %rdx), %xmm1
-	vmovd	(%rsi, %rdx), %xmm0
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
-	VPMINU	%xmm1, %xmm0, %xmm0
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
-	vpmovmskb %xmm0, %ecx
-	/* Only last 4 bits are valid.  */
-	andl	$0xf, %ecx
-	testl	%ecx, %ecx
-	jne	L(last_vector)
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
+# else
+
+	/* Find largest load size we can use.  */
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$4, %edx
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
+	movl	$28, %OFFSET_REG
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
 
-L(cross_page_4bytes):
-# endif
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
-# endif
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
-	subl	%ecx, %eax
-	VZEROUPPER
+
+
+	vmovd	(%rdi, %OFFSET_REG64), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64), %xmm1
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
+	CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+	vpandn	%xmm1, %xmm2, %xmm1
+	vpmovmskb %ymm1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
+
+#  ifdef USE_AS_STRNCMP
+	addl	$4, %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(VEC_SIZE * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
 	ret
-END (STRCMP)
+#  endif
+
+	.p2align 4,, 10
+L(less_4_till_page):
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* end condition is reach page boundary (rdi is aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(VEC_SIZE * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+	ret
+# endif
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
new file mode 100644
index 000000000..b81b57753
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -0,0 +1,1417 @@
+/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
+
+# ifndef STRCMP
+#  define STRCMP	__strcmp_evex
+# endif
+
+# define PAGE_SIZE	4096
+
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
+# define VEC_SIZE	32
+# define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
+
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
+
+# ifdef USE_AS_WCSCMP
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
+	/* Compare packed dwords.  */
+#  define VPCMP	vpcmpd
+#  define VPMINU	vpminud
+#  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
+	/* 1 dword char == 4 bytes.  */
+#  define SIZE_OF_CHAR	4
+# else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
+#  define TESTEQ	incl
+	/* Compare packed bytes.  */
+#  define VPCMP	vpcmpb
+#  define VPMINU	vpminub
+#  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
+	/* 1 byte char == 1 byte.  */
+#  define SIZE_OF_CHAR	1
+# endif
+
+# ifdef USE_AS_STRNCMP
+#  define LOOP_REG	r9d
+#  define LOOP_REG64	r9
+
+#  define OFFSET_REG8	r9b
+#  define OFFSET_REG	r9d
+#  define OFFSET_REG64	r9
+# else
+#  define LOOP_REG	edx
+#  define LOOP_REG64	rdx
+
+#  define OFFSET_REG8	dl
+#  define OFFSET_REG	edx
+#  define OFFSET_REG64	rdx
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+#  define VEC_OFFSET	0
+# else
+#  define VEC_OFFSET	(-VEC_SIZE)
+# endif
+
+# define XMM0	xmm17
+# define XMM1	xmm18
+
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
+# define YMM0	ymm17
+# define YMM1	ymm18
+# define YMM2	ymm19
+# define YMM3	ymm20
+# define YMM4	ymm21
+# define YMM5	ymm22
+# define YMM6	ymm23
+# define YMM7	ymm24
+# define YMM8	ymm25
+# define YMM9	ymm26
+# define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
+
+/* Warning!
+           wcscmp/wcsncmp have to use SIGNED comparison for elements.
+           strcmp/strncmp have to use UNSIGNED comparison for elements.
+*/
+
+/* The main idea of the string comparison (byte or dword) using 256-bit
+   EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
+   latter can be on either packed bytes or dwords depending on
+   USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
+   matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
+   KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
+   are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
+   instructions.  Main loop (away from from page boundary) compares 4
+   vectors are a time, effectively comparing 4 x VEC_SIZE bytes (128
+   bytes) on each loop.
+
+   The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
+   is the same as strcmp, except that an a maximum offset is tracked.  If
+   the maximum offset is reached before a difference is found, zero is
+   returned.  */
+
+	.section .text.evex, "ax", @progbits
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
+# ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have pass
+	   L(one_or_less). Otherwise we might use the wrong locale in
+	   the OVERFLOW_STRCMP (strcasecmp_l).  */
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+	cmp	$1, %RDX_LP
+	/* Signed comparison intentional. We use this branch to also
+	   test cases where length >= 2^63. These very large sizes can be
+	   handled with strcmp as there is no way for that length to
+	   actually bound the buffer.  */
+	jle	L(one_or_less)
+# endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
+	movl	%edi, %eax
+	orl	%esi, %eax
+	/* Shift out the bits irrelivant to page boundary ([63:12]).  */
+	sall	$20, %eax
+	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+	ja	L(page_cross)
+
+L(no_page_cross):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	/* Each bit cleared in K1 represents a mismatch or a null CHAR
+	   in YMM0 and 32 bytes at (%rsi).  */
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_STRNCMP
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(vec_0_test_len)
+# endif
+
+	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
+	   wcscmp/wcsncmp.  */
+
+	/* All 1s represents all equals. TESTEQ will overflow to zero in
+	   all equals case. Otherwise 1s will carry until position of first
+	   mismatch.  */
+	TESTEQ	%ecx
+	jz	L(more_3x_vec)
+
+	.p2align 4,, 4
+L(return_vec_0):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret0)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret0):
+	ret
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 4
+L(vec_0_test_len):
+	notl	%ecx
+	bzhil	%edx, %ecx, %eax
+	jnz	L(return_vec_0)
+	/* Align if will cross fetch block.  */
+	.p2align 4,, 2
+L(ret_zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 5
+L(one_or_less):
+#  ifdef USE_AS_STRCASECMP_L
+	/* Set locale argument for strcasecmp.  */
+	movq	%LOCALE_REG, %rdx
+#  endif
+	jb	L(ret_zero)
+	/* 'nbe' covers the case where length is negative (large
+	   unsigned).  */
+	jnbe	OVERFLOW_STRCMP
+#  ifdef USE_AS_WCSCMP
+	movl	(%rdi), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi), %edx
+	je	L(ret1)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+#  else
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+#  endif
+L(ret1):
+	ret
+# endif
+
+	.p2align 4,, 10
+L(return_vec_1):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_STRNCMP
+	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
+	   worrying about underflow.  */
+	addq	$-CHAR_PER_VEC, %rdx
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret2)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret2):
+	ret
+
+	.p2align 4,, 10
+# ifdef USE_AS_STRNCMP
+L(return_vec_3):
+#  if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %ecx
+#  else
+	salq	$CHAR_PER_VEC, %rcx
+#  endif
+# endif
+L(return_vec_2):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+# endif
+
+# ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret3)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+# endif
+L(ret3):
+	ret
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+#  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret4)
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+#  else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+#  endif
+L(ret4):
+	ret
+# endif
+
+	/* 32 byte align here ensures the main loop is ideally aligned
+	   for DSB.  */
+	.p2align 5
+L(more_3x_vec):
+	/* Safe to compare 4x vectors.  */
+	VMOVU	(VEC_SIZE)(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1)
+
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_2)
+
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero)
+# endif
+
+
+# ifdef USE_AS_WCSCMP
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
+
+# else
+	xorl	%r8d, %r8d
+# endif
+
+	/* The prepare labels are various entry points from the page
+	   cross logic.  */
+L(prepare_loop):
+
+# ifdef USE_AS_STRNCMP
+#  ifdef USE_AS_WCSCMP
+L(prepare_loop_no_len):
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	shrl	$2, %ecx
+	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
+#  else
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
+	   the loop.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
+L(prepare_loop_no_len):
+#  endif
+# else
+L(prepare_loop_no_len):
+# endif
+
+	/* Align s1 and adjust s2 accordingly.  */
+	subq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 4), %rdi
+L(prepare_loop_readj):
+	addq	%rdi, %rsi
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
+	subq	%rdi, %rdx
+# endif
+
+L(prepare_loop_aligned):
+	/* eax stores distance from rsi to next page cross. These cases
+	   need to be handled specially as the 4x loop could potentially
+	   read memory past the length of s1 or s2 and across a page
+	   boundary.  */
+	movl	$-(VEC_SIZE * 4), %eax
+	subl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+
+
+	/* Loop 4x comparisons at a time.  */
+	.p2align 4
+L(loop):
+
+	/* End condition for strncmp.  */
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_zero)
+# endif
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+
+	/* Check if rsi loads will cross a page boundary.  */
+	addl	$-(VEC_SIZE * 4), %eax
+	jnb	L(page_cross_during_loop)
+
+	/* Loop entry after handling page cross during loop.  */
+L(loop_skip_page_cross_check):
+	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
+	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+
+	VPMINU	%YMM0, %YMM2, %YMM8
+	VPMINU	%YMM4, %YMM6, %YMM9
+
+	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
+	VPMINU	%YMM8, %YMM9, %YMM9
+
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
+	VPTESTM	%YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
+	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
+	   oring with YMM1. Result is stored in YMM6.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
+	/* Or together YMM3, YMM5, and YMM6.  */
+	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+
+
+	/* A non-zero CHAR in YMM6 represents a mismatch.  */
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	kmovd	%k0, %LOOP_REG
+
+	TESTEQ	%LOOP_REG
+	jz	L(loop)
+
+
+	/* Find which VEC has the mismatch of end of string.  */
+	VPTESTM	%YMM0, %YMM0, %k1
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
+	kmovd	%k0, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_0_end)
+
+	VPTESTM	%YMM2, %YMM2, %k1
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
+	kmovd	%k0, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1_end)
+
+
+	/* Handle VEC 2 and 3 without branches.  */
+L(return_vec_2_3_end):
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+	VPTESTM	%YMM4, %YMM4, %k1
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
+	kmovd	%k0, %ecx
+	TESTEQ	%ecx
+# if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %LOOP_REG
+	orl	%ecx, %LOOP_REG
+# else
+	salq	$CHAR_PER_VEC, %LOOP_REG64
+	orq	%rcx, %LOOP_REG64
+# endif
+L(return_vec_3_end):
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
+	   must entirely be from VEC 3 which is fully represented by
+	   LOOP_REG.  */
+# if CHAR_PER_VEC <= 16
+	tzcntl	%LOOP_REG, %LOOP_REG
+# else
+	tzcntq	%LOOP_REG64, %LOOP_REG64
+# endif
+# ifdef USE_AS_STRNCMP
+	cmpq	%LOOP_REG64, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	je	L(ret5)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret5):
+	ret
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
+	   they use the value of `r8` to negate the return value. This is
+	   because the page cross logic can swap `rdi` and `rsi`.  */
+	.p2align 4,, 10
+# ifdef USE_AS_STRNCMP
+L(return_vec_1_end):
+#  if CHAR_PER_VEC <= 16
+	sall	$CHAR_PER_VEC, %ecx
+#  else
+	salq	$CHAR_PER_VEC, %rcx
+#  endif
+# endif
+L(return_vec_0_end):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+	tzcntl	%ecx, %ecx
+# else
+	tzcntq	%rcx, %rcx
+# endif
+
+# ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret6)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   flip is `rdi` and `rsi` where swapped.  */
+	xorl	%r8d, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` where swapped in page cross
+	   logic. Subtract `r8d` after xor for zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret6):
+	ret
+
+# ifndef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_vec_1_end):
+	tzcntl	%ecx, %ecx
+#  ifdef USE_AS_WCSCMP
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret7)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+#  else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+#  endif
+L(ret7):
+	ret
+# endif
+
+
+	/* Page cross in rsi in next 4x VEC.  */
+
+	/* TODO: Improve logic here.  */
+	.p2align 4,, 10
+L(page_cross_during_loop):
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
+
+	/* Optimistically rsi and rdi and both aligned in which case we
+	   don't need any logic here.  */
+	cmpl	$-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to loop and we will
+	   never hit page cross case again.  */
+	je	L(loop_skip_page_cross_check)
+
+	/* Check if we can safely load a VEC.  */
+	cmpl	$-(VEC_SIZE * 3), %eax
+	jle	L(less_1x_vec_till_page_cross)
+
+	VMOVA	(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_0_end)
+
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
+	cmpl	$-(VEC_SIZE * 2), %eax
+	jg	L(more_2x_vec_till_page_cross)
+
+	.p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+	subl	$-(VEC_SIZE * 4), %eax
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
+	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	/* Mask of potentially valid bits. The lower bits can be out of
+	   range comparisons (but safe regarding page crosses).  */
+
+# ifdef USE_AS_WCSCMP
+	movl	$-1, %r10d
+	movl	%esi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	shrl	$2, %ecx
+	shlxl	%ecx, %r10d, %ecx
+	movzbl	%cl, %r10d
+# else
+	movl	$-1, %ecx
+	shlxl	%esi, %ecx, %r10d
+# endif
+
+	kmovd	%k1, %ecx
+	notl	%ecx
+
+
+# ifdef USE_AS_STRNCMP
+#  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
+	movl	%eax, %r11d
+	shrl	$2, %r11d
+	cmpq	%r11, %rdx
+#  else
+	cmpq	%rax, %rdx
+#  endif
+	jbe	L(return_page_cross_end_check)
+# endif
+	movl	%eax, %OFFSET_REG
+
+	/* Readjust eax before potentially returning to the loop.  */
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+	andl	%r10d, %ecx
+	jz	L(loop_skip_page_cross_check)
+
+	.p2align 4,, 3
+L(return_page_cross_end):
+	tzcntl	%ecx, %ecx
+
+# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
+L(return_page_cross_cmp_mem):
+# else
+	addl	%OFFSET_REG, %ecx
+# endif
+# ifdef USE_AS_WCSCMP
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret8)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret8):
+	ret
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(return_page_cross_end_check):
+	andl	%r10d, %ecx
+	tzcntl	%ecx, %ecx
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+#  ifdef USE_AS_WCSCMP
+	sall	$2, %edx
+#  endif
+	cmpl	%ecx, %edx
+	ja	L(return_page_cross_cmp_mem)
+	xorl	%eax, %eax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If more 2x vec till cross we will complete a full loop
+	   iteration here.  */
+
+	VMOVA	VEC_SIZE(%rdi), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_1_end)
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(ret_zero_in_loop_page_cross)
+# endif
+
+	subl	$-(VEC_SIZE * 4), %eax
+
+	/* Safe to include comparisons from lower bytes.  */
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_page_cross_0)
+
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(return_vec_page_cross_1)
+
+# ifdef USE_AS_STRNCMP
+	/* Must check length here as length might proclude reading next
+	   page.  */
+#  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
+	movl	%eax, %r11d
+	shrl	$2, %r11d
+	cmpq	%r11, %rdx
+#  else
+	cmpq	%rax, %rdx
+#  endif
+	jbe	L(ret_zero_in_loop_page_cross)
+# endif
+
+	/* Finish the loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VPMINU	%YMM4, %YMM6, %YMM9
+	VPTESTM	%YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	kmovd	%k0, %LOOP_REG
+	TESTEQ	%LOOP_REG
+	jnz	L(return_vec_2_3_end)
+
+	/* Best for code size to include ucond-jmp here. Would be faster
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
+	   as fall-through and have jump back to loop on mismatch
+	   comparison.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
+	xorl	%eax, %eax
+	ret
+# else
+	jmp	L(loop_skip_page_cross_check)
+# endif
+
+
+	.p2align 4,, 10
+L(return_vec_page_cross_0):
+	addl	$-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+	tzcntl	%ecx, %ecx
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+#  ifdef USE_AS_STRNCMP
+#   ifdef USE_AS_WCSCMP
+	/* Must divide ecx instead of multiply rdx due to overflow.  */
+	movl	%ecx, %eax
+	shrl	$2, %eax
+	cmpq	%rax, %rdx
+#   else
+	cmpq	%rcx, %rdx
+#   endif
+	jbe	L(ret_zero_in_loop_page_cross)
+#  endif
+# else
+	addl	%eax, %ecx
+# endif
+
+# ifdef USE_AS_WCSCMP
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
+	xorl	%eax, %eax
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
+	je	L(ret9)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret9):
+	ret
+
+
+	.p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+	/* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where stop condition is guranteed to be
+	   reachable by just reading memory.  */
+	testl	$((VEC_SIZE - 1) << 20), %eax
+	jz	L(no_page_cross)
+# endif
+
+	movl	%edi, %eax
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %eax
+	andl	$(PAGE_SIZE - 1), %ecx
+
+	xorl	%OFFSET_REG, %OFFSET_REG
+
+	/* Check which is closer to page cross, s1 or s2.  */
+	cmpl	%eax, %ecx
+	jg	L(page_cross_s2)
+
+	/* The previous page cross check has false positives. Check for
+	   true positive as page cross logic is very expensive.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
+	jbe	L(no_page_cross)
+
+
+	/* Set r8 to not interfere with normal return value (rdi and rsi
+	   did not swap).  */
+# ifdef USE_AS_WCSCMP
+	/* any non-zero positive value that doesn't inference with 0x1.
+	 */
+	movl	$2, %r8d
+# else
+	xorl	%r8d, %r8d
+# endif
+
+	/* Check if less than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jg	L(less_1x_vec_till_page)
+
+
+	/* If more than 1x VEC till page cross, loop throuh safely
+	   loadable memory until within 1x VEC of page cross.  */
+	.p2align 4,, 8
+L(page_cross_loop):
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	kmovd	%k1, %ecx
+	TESTEQ	%ecx
+	jnz	L(check_ret_vec_page_cross)
+	addl	$CHAR_PER_VEC, %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross)
+# endif
+	addl	$VEC_SIZE, %eax
+	jl	L(page_cross_loop)
+
+# ifdef USE_AS_WCSCMP
+	shrl	$2, %eax
+# endif
+
+
+	subl	%eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
+	 */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+	VPTESTM	%YMM0, %YMM0, %k2
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+
+	kmovd	%k1, %ecx
+# ifdef USE_AS_STRNCMP
+	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
+	cmpq	%rax, %rdx
+	jbe	L(check_ret_vec_page_cross2)
+#  ifdef USE_AS_WCSCMP
+	addq	$-(CHAR_PER_VEC * 2), %rdx
+#  else
+	addq	%rdi, %rdx
+#  endif
+# endif
+	TESTEQ	%ecx
+	jz	L(prepare_loop_no_len)
+
+	.p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
+# endif
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
+# ifdef USE_AS_WCSCMP
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret12)
+	setl	%al
+	negl	%eax
+	xorl	%r8d, %eax
+# else
+	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
+	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+# endif
+L(ret12):
+	ret
+
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 10
+L(check_ret_vec_page_cross2):
+	TESTEQ	%ecx
+L(check_ret_vec_page_cross):
+	tzcntl	%ecx, %ecx
+	addl	%OFFSET_REG, %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_page_cross_cont)
+	.p2align 4,, 2
+L(ret_zero_page_cross):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4,, 4
+L(page_cross_s2):
+	/* Ensure this is a true page cross.  */
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
+	jbe	L(no_page_cross)
+
+
+	movl	%ecx, %eax
+	movq	%rdi, %rcx
+	movq	%rsi, %rdi
+	movq	%rcx, %rsi
+
+	/* set r8 to negate return value as rdi and rsi swapped.  */
+# ifdef USE_AS_WCSCMP
+	movl	$-4, %r8d
+# else
+	movl	$-1, %r8d
+# endif
+	xorl	%OFFSET_REG, %OFFSET_REG
+
+	/* Check if more than 1x VEC till page cross.  */
+	subl	$(VEC_SIZE * 3), %eax
+	jle	L(page_cross_loop)
+
+	.p2align 4,, 6
+L(less_1x_vec_till_page):
+# ifdef USE_AS_WCSCMP
+	shrl	$2, %eax
+# endif
+	/* Find largest load size we can use.  */
+	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	ja	L(less_16_till_page)
+
+	/* Use 16 byte comparison.  */
+	vmovdqu	(%rdi), %xmm0
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xf, %ecx
+# else
+	incw	%cx
+# endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subl	%eax, %OFFSET_REG
+# else
+	/* Explicit check for 16 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+# endif
+	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0xf, %ecx
+# else
+	incw	%cx
+# endif
+	jnz	L(check_ret_vec_page_cross)
+# ifdef USE_AS_STRNCMP
+	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# endif
+	jmp	L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(less_16_till_page):
+	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	ja	L(less_8_till_page)
+
+	/* Use 8 byte comparison.  */
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0x3, %ecx
+# else
+	incb	%cl
+# endif
+	jnz	L(check_ret_vec_page_cross)
+
+
+# ifdef USE_AS_STRNCMP
+	cmpq	$(8 / SIZE_OF_CHAR), %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+# endif
+	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
+
+	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+# ifdef USE_AS_WCSCMP
+	subl	$0x3, %ecx
+# else
+	incb	%cl
+# endif
+	jnz	L(check_ret_vec_page_cross)
+
+
+# ifdef USE_AS_STRNCMP
+	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# endif
+	jmp	L(prepare_loop_aligned)
+
+
+
+
+	.p2align 4,, 10
+L(less_8_till_page):
+# ifdef USE_AS_WCSCMP
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
+#  ifdef USE_AS_STRNCMP
+	addq	$-(CHAR_PER_VEC * 2), %rdx
+	/* We already checked for len <= 1 so cannot hit that case here.
+	 */
+#  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop)
+	ret
+
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
+# else
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
+
+#  ifdef USE_AS_STRNCMP
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+#  endif
+	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
+
+	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+	kmovd	%k1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
+	ret
+#  endif
+
+	.p2align 4,, 10
+L(less_4_till_page):
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* end condition is reach page boundary (rdi is aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+	ret
+# endif
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index bc19547b0..466c6a92a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -42,13 +42,8 @@
 # define UPDATE_STRNCMP_COUNTER
 #endif
 
-#ifdef USE_AVX
-# define SECTION	avx
-# define GLABEL(l)	l##_avx
-#else
-# define SECTION	sse4.2
-# define GLABEL(l)	l##_sse42
-#endif
+#define SECTION	sse4.2
+#define GLABEL(l)	l##_sse42
 
 #define LABEL(l)	.L##l
 
@@ -89,9 +84,8 @@ ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -100,29 +94,14 @@ ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
 
 
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
+#define arg arg
 
 STRCMP_SSE42:
 	cfi_startproc
@@ -170,27 +149,22 @@ STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -199,43 +173,26 @@ LABEL(touppermask):
 	movdqu	(%rdi), %xmm1
 	movdqu	(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-#  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
-# else
-#  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
-# endif
+# define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
+
 	TOLOWER (%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char checks */
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare first 16 bytes for equality */
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)/* If not, find different value or null char */
@@ -259,7 +216,7 @@ LABEL(crosscache):
 	xor	%r8d, %r8d
 	and	$0xf, %ecx		/* offset of rsi */
 	and	$0xf, %eax		/* offset of rdi */
-	pxor	%xmm0, D(%xmm0)		/* clear %xmm0 for null char check */
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
 	cmp	%eax, %ecx
 	je	LABEL(ashr_0)		/* rsi and rdi relative offset same */
 	ja	LABEL(bigger)
@@ -273,7 +230,7 @@ LABEL(bigger):
 	sub	%rcx, %r9
 	lea	LABEL(unaligned_table)(%rip), %r10
 	movslq	(%r10, %r9,4), %r9
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	lea	(%r10, %r9), %r10
 	_CET_NOTRACK jmp *%r10		/* jump to corresponding case */
 
@@ -286,15 +243,15 @@ LABEL(bigger):
 LABEL(ashr_0):
 
 	movdqa	(%rsi), %xmm1
-	pcmpeqb	%xmm1, D(%xmm0)		/* Any null chars? */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
-	pcmpeqb	(%rdi), D(%xmm1)	/* compare 16 bytes for equality */
+	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
 #else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm2, D(%xmm1)		/* compare 16 bytes for equality */
+	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
 #endif
-	psubb	%xmm0, D(%xmm1)		/* packed sub of comparison results*/
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -374,10 +331,10 @@ LABEL(ashr_0_exit_use):
  */
 	.p2align 4
 LABEL(ashr_1):
-	pslldq	$15, D(%xmm2)		/* shift first string to align with second */
+	pslldq	$15, %xmm2		/* shift first string to align with second */
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)		/* compare 16 bytes for equality */
-	psubb	%xmm0, D(%xmm2)		/* packed sub of comparison results*/
+	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx		/* adjust 0xffff for offset */
 	shr	%cl, %r9d		/* adjust for 16-byte offset */
@@ -405,7 +362,7 @@ LABEL(loop_ashr_1_use):
 
 LABEL(nibble_ashr_1_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -424,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use):
 	jg	LABEL(nibble_ashr_1_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $1, -16(%rdi, %rdx), D(%xmm0)
+	palignr $1, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -444,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use):
 LABEL(nibble_ashr_1_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$1, D(%xmm0)
+	psrldq	$1, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -462,10 +419,10 @@ LABEL(nibble_ashr_1_use):
  */
 	.p2align 4
 LABEL(ashr_2):
-	pslldq	$14, D(%xmm2)
+	pslldq	$14, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -493,7 +450,7 @@ LABEL(loop_ashr_2_use):
 
 LABEL(nibble_ashr_2_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -512,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use):
 	jg	LABEL(nibble_ashr_2_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $2, -16(%rdi, %rdx), D(%xmm0)
+	palignr $2, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -532,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use):
 LABEL(nibble_ashr_2_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$2, D(%xmm0)
+	psrldq	$2, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -550,10 +507,10 @@ LABEL(nibble_ashr_2_use):
  */
 	.p2align 4
 LABEL(ashr_3):
-	pslldq	$13, D(%xmm2)
+	pslldq	$13, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -581,7 +538,7 @@ LABEL(loop_ashr_3_use):
 
 LABEL(nibble_ashr_3_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -600,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use):
 	jg	LABEL(nibble_ashr_3_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $3, -16(%rdi, %rdx), D(%xmm0)
+	palignr $3, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -620,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use):
 LABEL(nibble_ashr_3_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$3, D(%xmm0)
+	psrldq	$3, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -638,10 +595,10 @@ LABEL(nibble_ashr_3_use):
  */
 	.p2align 4
 LABEL(ashr_4):
-	pslldq	$12, D(%xmm2)
+	pslldq	$12, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -670,7 +627,7 @@ LABEL(loop_ashr_4_use):
 
 LABEL(nibble_ashr_4_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -689,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use):
 	jg	LABEL(nibble_ashr_4_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $4, -16(%rdi, %rdx), D(%xmm0)
+	palignr $4, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -709,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use):
 LABEL(nibble_ashr_4_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$4, D(%xmm0)
+	psrldq	$4, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -727,10 +684,10 @@ LABEL(nibble_ashr_4_use):
  */
 	.p2align 4
 LABEL(ashr_5):
-	pslldq	$11, D(%xmm2)
+	pslldq	$11, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -759,7 +716,7 @@ LABEL(loop_ashr_5_use):
 
 LABEL(nibble_ashr_5_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -779,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use):
 
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $5, -16(%rdi, %rdx), D(%xmm0)
+	palignr $5, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -799,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use):
 LABEL(nibble_ashr_5_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$5, D(%xmm0)
+	psrldq	$5, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -817,10 +774,10 @@ LABEL(nibble_ashr_5_use):
  */
 	.p2align 4
 LABEL(ashr_6):
-	pslldq	$10, D(%xmm2)
+	pslldq	$10, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -849,7 +806,7 @@ LABEL(loop_ashr_6_use):
 
 LABEL(nibble_ashr_6_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -868,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use):
 	jg	LABEL(nibble_ashr_6_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $6, -16(%rdi, %rdx), D(%xmm0)
+	palignr $6, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -888,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use):
 LABEL(nibble_ashr_6_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$6, D(%xmm0)
+	psrldq	$6, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -906,10 +863,10 @@ LABEL(nibble_ashr_6_use):
  */
 	.p2align 4
 LABEL(ashr_7):
-	pslldq	$9, D(%xmm2)
+	pslldq	$9, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -938,7 +895,7 @@ LABEL(loop_ashr_7_use):
 
 LABEL(nibble_ashr_7_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -957,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use):
 	jg	LABEL(nibble_ashr_7_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $7, -16(%rdi, %rdx), D(%xmm0)
+	palignr $7, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri	$0x1a,(%rsi,%rdx), %xmm0
 #else
@@ -977,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use):
 LABEL(nibble_ashr_7_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$7, D(%xmm0)
+	psrldq	$7, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -995,10 +952,10 @@ LABEL(nibble_ashr_7_use):
  */
 	.p2align 4
 LABEL(ashr_8):
-	pslldq	$8, D(%xmm2)
+	pslldq	$8, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1027,7 +984,7 @@ LABEL(loop_ashr_8_use):
 
 LABEL(nibble_ashr_8_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1046,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use):
 	jg	LABEL(nibble_ashr_8_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $8, -16(%rdi, %rdx), D(%xmm0)
+	palignr $8, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1066,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use):
 LABEL(nibble_ashr_8_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$8, D(%xmm0)
+	psrldq	$8, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1084,10 +1041,10 @@ LABEL(nibble_ashr_8_use):
  */
 	.p2align 4
 LABEL(ashr_9):
-	pslldq	$7, D(%xmm2)
+	pslldq	$7, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1117,7 +1074,7 @@ LABEL(loop_ashr_9_use):
 LABEL(nibble_ashr_9_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
 
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1136,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use):
 	jg	LABEL(nibble_ashr_9_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $9, -16(%rdi, %rdx), D(%xmm0)
+	palignr $9, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1156,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use):
 LABEL(nibble_ashr_9_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$9, D(%xmm0)
+	psrldq	$9, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1174,10 +1131,10 @@ LABEL(nibble_ashr_9_use):
  */
 	.p2align 4
 LABEL(ashr_10):
-	pslldq	$6, D(%xmm2)
+	pslldq	$6, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1206,7 +1163,7 @@ LABEL(loop_ashr_10_use):
 
 LABEL(nibble_ashr_10_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1225,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use):
 	jg	LABEL(nibble_ashr_10_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $10, -16(%rdi, %rdx), D(%xmm0)
+	palignr $10, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1245,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use):
 LABEL(nibble_ashr_10_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$10, D(%xmm0)
+	psrldq	$10, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1263,10 +1220,10 @@ LABEL(nibble_ashr_10_use):
  */
 	.p2align 4
 LABEL(ashr_11):
-	pslldq	$5, D(%xmm2)
+	pslldq	$5, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1295,7 +1252,7 @@ LABEL(loop_ashr_11_use):
 
 LABEL(nibble_ashr_11_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1314,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use):
 	jg	LABEL(nibble_ashr_11_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $11, -16(%rdi, %rdx), D(%xmm0)
+	palignr $11, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1334,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use):
 LABEL(nibble_ashr_11_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$11, D(%xmm0)
+	psrldq	$11, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1352,10 +1309,10 @@ LABEL(nibble_ashr_11_use):
  */
 	.p2align 4
 LABEL(ashr_12):
-	pslldq	$4, D(%xmm2)
+	pslldq	$4, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1384,7 +1341,7 @@ LABEL(loop_ashr_12_use):
 
 LABEL(nibble_ashr_12_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1403,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use):
 	jg	LABEL(nibble_ashr_12_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $12, -16(%rdi, %rdx), D(%xmm0)
+	palignr $12, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1423,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use):
 LABEL(nibble_ashr_12_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$12, D(%xmm0)
+	psrldq	$12, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1441,10 +1398,10 @@ LABEL(nibble_ashr_12_use):
  */
 	.p2align 4
 LABEL(ashr_13):
-	pslldq	$3, D(%xmm2)
+	pslldq	$3, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1474,7 +1431,7 @@ LABEL(loop_ashr_13_use):
 
 LABEL(nibble_ashr_13_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1493,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use):
 	jg	LABEL(nibble_ashr_13_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $13, -16(%rdi, %rdx), D(%xmm0)
+	palignr $13, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1513,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use):
 LABEL(nibble_ashr_13_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$13, D(%xmm0)
+	psrldq	$13, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1531,10 +1488,10 @@ LABEL(nibble_ashr_13_use):
  */
 	.p2align 4
 LABEL(ashr_14):
-	pslldq  $2, D(%xmm2)
+	pslldq  $2, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1564,7 +1521,7 @@ LABEL(loop_ashr_14_use):
 
 LABEL(nibble_ashr_14_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1583,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use):
 	jg	LABEL(nibble_ashr_14_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $14, -16(%rdi, %rdx), D(%xmm0)
+	palignr $14, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1603,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use):
 LABEL(nibble_ashr_14_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$14, D(%xmm0)
+	psrldq	$14, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1621,10 +1578,10 @@ LABEL(nibble_ashr_14_use):
  */
 	.p2align 4
 LABEL(ashr_15):
-	pslldq	$1, D(%xmm2)
+	pslldq	$1, %xmm2
 	TOLOWER (%xmm1, %xmm2)
-	pcmpeqb	%xmm1, D(%xmm2)
-	psubb	%xmm0, D(%xmm2)
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
 	pmovmskb %xmm2, %r9d
 	shr	%cl, %edx
 	shr	%cl, %r9d
@@ -1656,7 +1613,7 @@ LABEL(loop_ashr_15_use):
 
 LABEL(nibble_ashr_15_restart_use):
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1675,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use):
 	jg	LABEL(nibble_ashr_15_use)
 
 	movdqa	(%rdi, %rdx), %xmm0
-	palignr $15, -16(%rdi, %rdx), D(%xmm0)
+	palignr $15, -16(%rdi, %rdx), %xmm0
 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpistri $0x1a, (%rsi,%rdx), %xmm0
 #else
@@ -1695,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use):
 LABEL(nibble_ashr_15_use):
 	sub	$0x1000, %r10
 	movdqa	-16(%rdi, %rdx), %xmm0
-	psrldq	$15, D(%xmm0)
+	psrldq	$15, %xmm0
 	pcmpistri      $0x3a,%xmm0, %xmm0
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	%r11, %rcx
@@ -1771,8 +1728,8 @@ LABEL(strcmp_exitz):
 	.p2align 4
 	// XXX Same as code above
 LABEL(Byte0):
-	movzx	(%rsi), %ecx
-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
 
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 6a9dca438..7c2901bf4 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -30,16 +30,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
new file mode 100644
index 000000000..c2c581ecf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRCPY
+# define STRCPY __strcpy_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index b7629eaf1..5b6506d58 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -37,6 +37,10 @@
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 /* zero register */
 #define xmmZ	xmm0
 #define ymmZ	ymm0
@@ -46,7 +50,7 @@
 
 # ifndef USE_AS_STRCAT
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCPY)
 #  ifdef USE_AS_STRNCPY
 	mov	%RDX_LP, %R8_LP
@@ -369,8 +373,8 @@ L(CopyVecSizeExit):
 	lea	1(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
 L(CopyTwoVecSize1):
@@ -553,8 +557,7 @@ L(Exit1):
 	lea	2(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit2):
@@ -569,8 +572,7 @@ L(Exit2):
 	lea	3(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit3):
@@ -584,8 +586,7 @@ L(Exit3):
 	lea	4(%rdi), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit4_7):
@@ -602,8 +603,7 @@ L(Exit4_7):
 	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit8_15):
@@ -620,8 +620,7 @@ L(Exit8_15):
 	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit16_31):
@@ -638,8 +637,7 @@ L(Exit16_31):
 	lea 1(%rdi, %rdx), %rdi
 	jnz L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Exit32_63):
@@ -656,8 +654,7 @@ L(Exit32_63):
 	lea	1(%rdi, %rdx), %rdi
 	jnz	L(StrncpyFillTailWithZero)
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNCPY
 
@@ -671,8 +668,7 @@ L(StrncpyExit1):
 #  ifdef USE_AS_STRCAT
 	movb	$0, 1(%rdi)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit2):
@@ -684,8 +680,7 @@ L(StrncpyExit2):
 #  ifdef USE_AS_STRCAT
 	movb	$0, 2(%rdi)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit3_4):
@@ -699,8 +694,7 @@ L(StrncpyExit3_4):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi, %r8)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit5_8):
@@ -714,8 +708,7 @@ L(StrncpyExit5_8):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi, %r8)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit9_16):
@@ -729,8 +722,7 @@ L(StrncpyExit9_16):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi, %r8)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit17_32):
@@ -744,8 +736,7 @@ L(StrncpyExit17_32):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi, %r8)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit33_64):
@@ -760,8 +751,7 @@ L(StrncpyExit33_64):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi, %r8)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(StrncpyExit65):
@@ -778,50 +768,43 @@ L(StrncpyExit65):
 #  ifdef USE_AS_STRCAT
 	movb	$0, 65(%rdi)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 #  ifndef USE_AS_STRCAT
 
 	.p2align 4
 L(Fill1):
 	mov	%dl, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Fill2):
 	mov	%dx, (%rdi)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Fill3_4):
 	mov	%dx, (%rdi)
 	mov     %dx, -2(%rdi, %r8)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Fill5_8):
 	mov	%edx, (%rdi)
 	mov     %edx, -4(%rdi, %r8)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Fill9_16):
 	mov	%rdx, (%rdi)
 	mov	%rdx, -8(%rdi, %r8)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(Fill17_32):
 	vmovdqu %xmmZ, (%rdi)
 	vmovdqu %xmmZ, -16(%rdi, %r8)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(CopyVecSizeUnalignedVec2):
@@ -898,8 +881,7 @@ L(Fill):
 	cmp	$1, %r8d
 	ja	L(Fill2)
 	je	L(Fill1)
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 /* end of ifndef USE_AS_STRCAT */
 #  endif
@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (VEC_SIZE * 4)(%rdi)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(UnalignedFourVecSizeLeaveCase2):
@@ -1001,16 +982,14 @@ L(StrncpyExit):
 #  ifdef USE_AS_STRCAT
 	movb	$0, (%rdi)
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
 L(ExitZero):
 #  ifndef USE_AS_STRCAT
 	mov	%rdi, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 # endif
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
new file mode 100644
index 000000000..a343a1a69
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -0,0 +1,1003 @@
+/* strcpy with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_evex
+#  endif
+
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+#  define VEC_SIZE	32
+# endif
+
+# define XMM2		xmm18
+# define XMM3		xmm19
+
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+# define YMM7		ymm23
+
+# ifndef USE_AS_STRCAT
+
+/* zero register */
+#  define XMMZERO	xmm16
+#  define YMMZERO	ymm16
+#  define YMM1		ymm17
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%RDX_LP, %R8_LP
+	test	%R8_LP, %R8_LP
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# endif
+
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	cmp	$(VEC_SIZE * 2), %ecx
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+
+	vpcmpb	$0, (%rsi), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$VEC_SIZE, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$(VEC_SIZE + 1), %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyVecSizeTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail)
+
+	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
+	kmovd	%k1, %edx
+
+# ifdef USE_AS_STRNCPY
+	add	$VEC_SIZE, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize)
+
+	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
+	VMOVU	%YMM2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(UnalignVecSizeBoth):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$VEC_SIZE, %rcx
+	VMOVA	(%rsi, %rcx), %YMM2
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 3), %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM3, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
+	vpcmpb	$0, %YMM4, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM4, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
+	VMOVU	%YMM2, (%rdi, %rcx)
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	and	$-(VEC_SIZE * 4), %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+	VMOVA	(%rsi), %YMM4
+	VMOVA	VEC_SIZE(%rsi), %YMM5
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
+	vpminub	%YMM5, %YMM4, %YMM2
+	vpminub	%YMM7, %YMM6, %YMM3
+	vpminub	%YMM2, %YMM3, %YMM2
+	/* If K7 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	kmovd	%k7, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+	add	$(VEC_SIZE * 4), %rdi
+	add	$(VEC_SIZE * 4), %rsi
+	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
+	VMOVA	(%rsi), %YMM4
+	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
+	VMOVA	VEC_SIZE(%rsi), %YMM5
+	vpminub	%YMM5, %YMM4, %YMM2
+	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
+	VMOVU	%YMM7, -VEC_SIZE(%rdi)
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
+	vpminub	%YMM7, %YMM6, %YMM3
+	vpminub	%YMM2, %YMM3, %YMM2
+	/* If K7 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	kmovd	%k7, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+	vpcmpb	$0, %YMM4, %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(CopyVecSizeUnaligned_0)
+
+	vpcmpb	$0, %YMM5, %YMMZERO, %k2
+	kmovd	%k2, %ecx
+	test	%ecx, %ecx
+	jnz	L(CopyVecSizeUnaligned_16)
+
+	vpcmpb	$0, %YMM6, %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(CopyVecSizeUnaligned_32)
+
+	vpcmpb	$0, %YMM7, %YMMZERO, %k4
+	kmovd	%k4, %ecx
+	bsf	%ecx, %edx
+	VMOVU	%YMM4, (%rdi)
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
+# endif
+	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 3), %rsi
+	add	$(VEC_SIZE * 3), %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLessTwoVecSize):
+	VMOVU	(%rsi), %YMM3
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	kmovd	%k0, %edx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$VEC_SIZE, %r8
+#  else
+	cmp	$(VEC_SIZE + 1), %r8
+#  endif
+	jbe	L(CopyVecSizeTail1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail1)
+
+	VMOVU	%YMM3, (%rdi)
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	cmp	$(VEC_SIZE * 2), %r8
+#  else
+	cmp	$((VEC_SIZE * 2) + 1), %r8
+#  endif
+	jbe	L(CopyTwoVecSize1Case2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize1)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+	jmp	L(UnalignVecSizeBoth)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+	.p2align 4
+L(CopyVecSize):
+	add	%rcx, %rdi
+# endif
+L(CopyVecSizeTail):
+	add	%rcx, %rsi
+L(CopyVecSizeTail1):
+	bsf	%edx, %edx
+L(CopyVecSizeExit):
+	cmp	$32, %edx
+	jae	L(Exit32_63)
+	cmp	$16, %edx
+	jae	L(Exit16_31)
+	cmp	$8, %edx
+	jae	L(Exit8_15)
+	cmp	$4, %edx
+	jae	L(Exit4_7)
+	cmp	$3, %edx
+	je	L(Exit3)
+	cmp	$1, %edx
+	ja	L(Exit2)
+	je	L(Exit1)
+	movb	$0, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$1, %r8
+	lea	1(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(CopyTwoVecSize1):
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$VEC_SIZE, %r8
+# endif
+	jmp	L(CopyVecSizeTail1)
+
+	.p2align 4
+L(CopyTwoVecSize):
+	bsf	%edx, %edx
+	add	%rcx, %rsi
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
+	jmp	L(CopyVecSizeExit)
+
+	.p2align 4
+L(CopyVecSizeUnaligned_0):
+	bsf	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+	VMOVU	%YMM4, (%rdi)
+	add	$((VEC_SIZE * 4) - 1), %r8
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	jmp	L(CopyVecSizeExit)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_16):
+	bsf	%ecx, %edx
+	VMOVU	%YMM4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	VEC_SIZE(%rdi, %rdx), %rax
+# endif
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	add	$((VEC_SIZE * 3) - 1), %r8
+	sub	%rdx, %r8
+	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$VEC_SIZE, %rsi
+	add	$VEC_SIZE, %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+	.p2align 4
+L(CopyVecSizeUnaligned_32):
+	bsf	%edx, %edx
+	VMOVU	%YMM4, (%rdi)
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
+# endif
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	add	$((VEC_SIZE * 2) - 1), %r8
+	sub	%rdx, %r8
+	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
+	jmp	L(StrncpyFillTailWithZero)
+# else
+	add	$(VEC_SIZE * 2), %rsi
+	add	$(VEC_SIZE * 2), %rdi
+	jmp	L(CopyVecSizeExit)
+# endif
+
+# ifdef USE_AS_STRNCPY
+#  ifndef USE_AS_STRCAT
+	.p2align 4
+L(CopyVecSizeUnalignedVec6):
+	VMOVU	%YMM6, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec5):
+	VMOVU	%YMM5, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec4):
+	VMOVU	%YMM4, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec3):
+	VMOVU	%YMM3, (%rdi, %rcx)
+	jmp	L(CopyVecSizeVecExit)
+#  endif
+
+/* Case2 */
+
+	.p2align 4
+L(CopyVecSizeCase2):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2):
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	add	$VEC_SIZE, %edx
+	sub	%ecx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+L(CopyVecSizeTailCase2):
+	add	%rcx, %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+L(CopyVecSizeTail1Case2):
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+	jmp	L(StrncpyExit)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeCase2)
+L(CopyVecSizeCase3):
+	add	$VEC_SIZE, %r8
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSizeCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyTwoVecSizeCase2)
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyVecSizeTailCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTailCase2)
+	add	%rcx, %rsi
+	jmp	L(StrncpyExit)
+
+	.p2align 4
+L(CopyTwoVecSize1Case2OrCase3):
+	add	$VEC_SIZE, %rdi
+	add	$VEC_SIZE, %rsi
+	sub	$VEC_SIZE, %r8
+L(CopyVecSizeTail1Case2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(CopyVecSizeTail1Case2)
+	jmp	L(StrncpyExit)
+# endif
+
+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
+
+	.p2align 4
+L(Exit1):
+	movzwl	(%rsi), %edx
+	mov	%dx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$2, %r8
+	lea	2(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit2):
+	movzwl	(%rsi), %ecx
+	mov	%cx, (%rdi)
+	movb	$0, 2(%rdi)
+# ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$3, %r8
+	lea	3(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit3):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+# ifdef USE_AS_STPCPY
+	lea	3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	$4, %r8
+	lea	4(%rdi), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit4_7):
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	-3(%rsi, %rdx), %ecx
+	mov	%ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit8_15):
+	mov	(%rsi), %rcx
+	mov	-7(%rsi, %rdx), %r9
+	mov	%rcx, (%rdi)
+	mov	%r9, -7(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit16_31):
+	VMOVU	(%rsi), %XMM2
+	VMOVU	-15(%rsi, %rdx), %XMM3
+	VMOVU	%XMM2, (%rdi)
+	VMOVU	%XMM3, -15(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub %rdx, %r8
+	sub $1, %r8
+	lea 1(%rdi, %rdx), %rdi
+	jnz L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+	.p2align 4
+L(Exit32_63):
+	VMOVU	(%rsi), %YMM2
+	VMOVU	-31(%rsi, %rdx), %YMM3
+	VMOVU	%YMM2, (%rdi)
+	VMOVU	%YMM3, -31(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	sub	%rdx, %r8
+	sub	$1, %r8
+	lea	1(%rdi, %rdx), %rdi
+	jnz	L(StrncpyFillTailWithZero)
+# endif
+	ret
+
+# ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit1):
+	movzbl	(%rsi), %edx
+	mov	%dl, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	1(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 1(%rdi)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit2):
+	movzwl	(%rsi), %edx
+	mov	%dx, (%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	2(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 2(%rdi)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit3_4):
+	movzwl	(%rsi), %ecx
+	movzwl	-2(%rsi, %r8), %edx
+	mov	%cx, (%rdi)
+	mov	%dx, -2(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit5_8):
+	mov	(%rsi), %ecx
+	mov	-4(%rsi, %r8), %edx
+	mov	%ecx, (%rdi)
+	mov	%edx, -4(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit9_16):
+	mov	(%rsi), %rcx
+	mov	-8(%rsi, %r8), %rdx
+	mov	%rcx, (%rdi)
+	mov	%rdx, -8(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit17_32):
+	VMOVU	(%rsi), %XMM2
+	VMOVU	-16(%rsi, %r8), %XMM3
+	VMOVU	%XMM2, (%rdi)
+	VMOVU	%XMM3, -16(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit33_64):
+	/*  0/32, 31/16 */
+	VMOVU	(%rsi), %YMM2
+	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
+	VMOVU	%YMM2, (%rdi)
+	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+#  ifdef USE_AS_STPCPY
+	lea	(%rdi, %r8), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi, %r8)
+#  endif
+	ret
+
+	.p2align 4
+L(StrncpyExit65):
+	/* 0/32, 32/32, 64/1 */
+	VMOVU	(%rsi), %YMM2
+	VMOVU	32(%rsi), %YMM3
+	mov	64(%rsi), %cl
+	VMOVU	%YMM2, (%rdi)
+	VMOVU	%YMM3, 32(%rdi)
+	mov	%cl, 64(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	65(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, 65(%rdi)
+#  endif
+	ret
+
+#  ifndef USE_AS_STRCAT
+
+	.p2align 4
+L(Fill1):
+	mov	%dl, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill2):
+	mov	%dx, (%rdi)
+	ret
+
+	.p2align 4
+L(Fill3_4):
+	mov	%dx, (%rdi)
+	mov     %dx, -2(%rdi, %r8)
+	ret
+
+	.p2align 4
+L(Fill5_8):
+	mov	%edx, (%rdi)
+	mov     %edx, -4(%rdi, %r8)
+	ret
+
+	.p2align 4
+L(Fill9_16):
+	mov	%rdx, (%rdi)
+	mov	%rdx, -8(%rdi, %r8)
+	ret
+
+	.p2align 4
+L(Fill17_32):
+	VMOVU	%XMMZERO, (%rdi)
+	VMOVU	%XMMZERO, -16(%rdi, %r8)
+	ret
+
+	.p2align 4
+L(CopyVecSizeUnalignedVec2):
+	VMOVU	%YMM2, (%rdi, %rcx)
+
+	.p2align 4
+L(CopyVecSizeVecExit):
+	bsf	%edx, %edx
+	add	$(VEC_SIZE - 1), %r8
+	add	%rcx, %rdi
+#   ifdef USE_AS_STPCPY
+	lea	(%rdi, %rdx), %rax
+#   endif
+	sub	%rdx, %r8
+	lea	1(%rdi, %rdx), %rdi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	xor	%edx, %edx
+	sub	$VEC_SIZE, %r8
+	jbe	L(StrncpyFillExit)
+
+	VMOVU	%YMMZERO, (%rdi)
+	add	$VEC_SIZE, %rdi
+
+	mov	%rdi, %rsi
+	and	$(VEC_SIZE - 1), %esi
+	sub	%rsi, %rdi
+	add	%rsi, %r8
+	sub	$(VEC_SIZE * 4), %r8
+	jb	L(StrncpyFillLessFourVecSize)
+
+L(StrncpyFillLoopVmovdqa):
+	VMOVA	%YMMZERO, (%rdi)
+	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
+	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
+	add	$(VEC_SIZE * 4), %rdi
+	sub	$(VEC_SIZE * 4), %r8
+	jae	L(StrncpyFillLoopVmovdqa)
+
+L(StrncpyFillLessFourVecSize):
+	add	$(VEC_SIZE * 2), %r8
+	jl	L(StrncpyFillLessTwoVecSize)
+	VMOVA	%YMMZERO, (%rdi)
+	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
+	add	$(VEC_SIZE * 2), %rdi
+	sub	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	VMOVA	%YMMZERO, (%rdi)
+	add	$VEC_SIZE, %rdi
+	jmp	L(Fill)
+
+	.p2align 4
+L(StrncpyFillLessTwoVecSize):
+	add	$VEC_SIZE, %r8
+	jl	L(StrncpyFillExit)
+	VMOVA	%YMMZERO, (%rdi)
+	add	$VEC_SIZE, %rdi
+	jmp	L(Fill)
+
+	.p2align 4
+L(StrncpyFillExit):
+	add	$VEC_SIZE, %r8
+L(Fill):
+	cmp	$17, %r8d
+	jae	L(Fill17_32)
+	cmp	$9, %r8d
+	jae	L(Fill9_16)
+	cmp	$5, %r8d
+	jae	L(Fill5_8)
+	cmp	$3, %r8d
+	jae	L(Fill3_4)
+	cmp	$1, %r8d
+	ja	L(Fill2)
+	je	L(Fill1)
+	ret
+
+/* end of ifndef USE_AS_STRCAT */
+#  endif
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%rdx, %rdx
+	jnz	L(UnalignedFourVecSizeLeaveCase2)
+L(UnalignedFourVecSizeLeaveCase3):
+	lea	(VEC_SIZE * 4)(%r8), %rcx
+	and	$-VEC_SIZE, %rcx
+	add	$(VEC_SIZE * 3), %r8
+	jl	L(CopyVecSizeCase3)
+	VMOVU	%YMM4, (%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	sub	$VEC_SIZE, %r8
+	jb	L(CopyVecSizeCase3)
+	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
+#  ifdef USE_AS_STPCPY
+	lea	(VEC_SIZE * 4)(%rdi), %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (VEC_SIZE * 4)(%rdi)
+#  endif
+	ret
+
+	.p2align 4
+L(UnalignedFourVecSizeLeaveCase2):
+	xor	%ecx, %ecx
+	vpcmpb	$0, %YMM4, %YMMZERO, %k1
+	kmovd	%k1, %edx
+	add	$(VEC_SIZE * 3), %r8
+	jle	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+	vpcmpb	$0, %YMM5, %YMMZERO, %k2
+	kmovd	%k2, %edx
+	VMOVU	%YMM4, (%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec5)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpb	$0, %YMM6, %YMMZERO, %k3
+	kmovd	%k3, %edx
+	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	add	$VEC_SIZE, %rcx
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+	test	%edx, %edx
+#  ifndef USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec6)
+#  else
+	jnz	L(CopyVecSize)
+#  endif
+
+	vpcmpb	$0, %YMM7, %YMMZERO, %k4
+	kmovd	%k4, %edx
+	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	lea	VEC_SIZE(%rdi, %rcx), %rdi
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	bsf	%edx, %edx
+	cmp	%r8d, %edx
+	jb	L(CopyVecSizeExit)
+L(StrncpyExit):
+	cmp	$65, %r8d
+	je	L(StrncpyExit65)
+	cmp	$33, %r8d
+	jae	L(StrncpyExit33_64)
+	cmp	$17, %r8d
+	jae	L(StrncpyExit17_32)
+	cmp	$9, %r8d
+	jae	L(StrncpyExit9_16)
+	cmp	$5, %r8d
+	jae	L(StrncpyExit5_8)
+	cmp	$3, %r8d
+	jae	L(StrncpyExit3_4)
+	cmp	$1, %r8d
+	ja	L(StrncpyExit2)
+	je	L(StrncpyExit1)
+#  ifdef USE_AS_STPCPY
+	mov	%rdi, %rax
+#  endif
+#  ifdef USE_AS_STRCAT
+	movb	$0, (%rdi)
+#  endif
+	ret
+
+	.p2align 4
+L(ExitZero):
+#  ifndef USE_AS_STRCAT
+	mov	%rdi, %rax
+#  endif
+	ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index c56ddbd22..2436b6dcd 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
     RETURN (NULL, strlen (s));
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return STRCSPN_SSE2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return STRCSPN_SSE2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return STRCSPN_SSE2 (s, a);
     }
 
-  offset = (int) ((size_t) s & 15);
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       value = __m128i_shift_right (value, offset);
 
-      int length = _mm_cmpistri (mask, value, 0x2);
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
       /* No need to check ZFlag since ZFlag is always 1.  */
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (s + length), length);
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
       if (index < 16 - offset)
 	RETURN (NULL, index);
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x2);
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
-      int zflag = _mm_cmpistrz (mask, value, 0x2);
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
       if (zflag)
diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.S
deleted file mode 100644
index 63b260a9e..000000000
--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcspn optimized with SSE2.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strcspn __strcspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcspn)
-#endif
-
-#include <sysdeps/x86_64/strcspn.S>
diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.c b/sysdeps/x86_64/multiarch/strcspn-sse2.c
new file mode 100644
index 000000000..9bd3dac82
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c
@@ -0,0 +1,28 @@
+/* strcspn optimized with SSE2.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRCSPN __strcspn_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRCSPN)
+#endif
+
+#include <string/strcspn.c>
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
new file mode 100644
index 000000000..75b4b7612
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRLEN
+# define STRLEN __strlen_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index caa615970..b282a7561 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,370 +27,531 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	.section .text.avx,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
-	/* Check for zero length.  */
+	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
 	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
 #  endif
+	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
 # endif
-	movl	%edi, %ecx
+	movl	%edi, %eax
 	movq	%rdi, %rdx
 	vpxor	%xmm0, %xmm0, %xmm0
-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifdef USE_AS_STRNLEN
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rsi
-	jbe	L(max)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$CHAR_PER_VEC, %rsi
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
 
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
 # endif
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+L(first_vec_x1):
 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+#  endif
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
-	subq	%rdx, %rax
 # ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
 # endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
-	    to void possible addition overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
-
-	/* Check the end of data.  */
-	subq	%rcx, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+#  endif
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+#  endif
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+#  endif
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
+	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+	/* Readjust length.  */
 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
-
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa (%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
-	VPMINU	%ymm5, %ymm6, %ymm5
-
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rsi
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
+	vpmovmskb %ymm5, %ecx
 
-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-# ifndef USE_AS_STRNLEN
-	jmp	L(loop_4x_vec)
-# else
-	subq	$(VEC_SIZE * 4), %rsi
-	ja	L(loop_4x_vec)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %esi
-	jle	L(last_2x_vec)
-
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	subq	%rdx, %rdi
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_return_x0)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_return_x1)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrq	$2, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
-	movq	%r8, %rax
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
 #  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
 #  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %esi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
+
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
 
-	jnz	L(first_vec_x0_check)
 	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	jb	L(max)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(first_vec_x0_check):
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
-#  endif
-	VZEROUPPER
-	ret
+# endif
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
-#  endif
-	VZEROUPPER
-	ret
+# endif
+	VZEROUPPER_RETURN
 
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
-	.p2align 4
 L(max):
 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
-# endif
-	VZEROUPPER
-	ret
+#  endif
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
-# endif
-	VZEROUPPER
-	ret
+#  endif
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
+	/* Cold case for crossing page with first load.  */
 	.p2align 4
-L(4x_vec_end):
-	VPCMPEQ	%ymm1, %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	VPCMPEQ %ymm2, %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	VPCMPEQ %ymm3, %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	VPCMPEQ %ymm4, %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
+	shrl	$2, %eax
+#  endif
+# endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
 # endif
-	VZEROUPPER
-	ret
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
new file mode 100644
index 000000000..4bf6874b8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -0,0 +1,489 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_evex
+# endif
+
+# define VMOVA		vmovdqa64
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define SHIFT_REG ecx
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define SHIFT_REG edx
+#  define CHAR_SIZE	1
+# endif
+
+# define XMMZERO	xmm16
+# define YMMZERO	ymm16
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+	mov	%RSI_LP, %R8_LP
+# endif
+	movl	%edi, %eax
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
+	/* Check if we may cross page boundary with one vector load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+# ifdef USE_AS_STRNLEN
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rsi
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	ret
+# ifdef USE_AS_STRNLEN
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	CHAR_PER_VEC(%rdi, %rax), %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
+# endif
+	ret
+
+	.p2align 5
+L(aligned_more):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-(VEC_SIZE), %rdi
+L(cross_page_continue):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+# ifdef USE_AS_STRNLEN
+	/* + CHAR_SIZE because it simplies the logic in
+	   last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	test	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
+	addq	$VEC_SIZE, %rdi
+# ifdef USE_AS_STRNLEN
+	/* Check if at last VEC_SIZE * 4 length.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+	/* Readjust length.  */
+	addq	%rcx, %rsi
+# endif
+	/* Align data to VEC_SIZE * 4.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	/* Load first VEC regardless.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rsi
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	VPCMP	$0, %YMM4, %YMMZERO, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd	%k0, %k1
+	jz	L(loop_4x_vec)
+
+	/* Check if end was in first half.  */
+	kmovd	%k0, %eax
+	subq	%rdx, %rdi
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rdi
+# endif
+	testl	%eax, %eax
+	jz	L(second_vec_return)
+
+	VPCMP	$0, %YMM1, %YMMZERO, %k2
+	kmovd	%k2, %edx
+	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
+# ifdef USE_AS_WCSLEN
+	sall	$CHAR_PER_VEC, %eax
+	orl	%edx, %eax
+	tzcntl	%eax, %eax
+# else
+	salq	$CHAR_PER_VEC, %rax
+	orq	%rdx, %rax
+	tzcntq	%rax, %rax
+# endif
+	addq	%rdi, %rax
+	ret
+
+
+# ifdef USE_AS_STRNLEN
+
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	addq	$(VEC_SIZE * 3), %rdi
+L(last_4x_vec_or_less):
+	kmovd	%k0, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(CHAR_PER_VEC * 2), %esi
+	jnz	L(last_4x_vec)
+
+	/* length may have been negative or positive by an offset of
+	   CHAR_PER_VEC * 4 depending on where this was called from. This
+	   fixes that.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
+
+	/* Check the end of data.  */
+	subl	$CHAR_PER_VEC, %esi
+	jb	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+L(max):
+	movq	%r8, %rax
+	ret
+# endif
+
+	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
+	   in the 4x VEC loop can use 2 byte encoding.  */
+	.p2align 4
+L(second_vec_return):
+	VPCMP	$0, %YMM3, %YMMZERO, %k0
+	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
+# ifdef USE_AS_WCSLEN
+	kunpckbw	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# else
+	kunpckdq	%k0, %k1, %k0
+	kmovq	%k0, %rax
+	tzcntq	%rax, %rax
+# endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	/* Check the end of data.  */
+	subl	$(CHAR_PER_VEC * 3), %esi
+	jb	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	tzcntl	%eax, %eax
+	subl	$(CHAR_PER_VEC * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
+	ret
+L(max_end):
+	movq	%r8, %rax
+	ret
+# endif
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	/* Remove the leading bytes.  */
+# ifdef USE_AS_WCSLEN
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+	   bytes.  */
+	movl	%edx, %ecx
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+# endif
+	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+# ifndef USE_AS_STRNLEN
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	ret
+# else
+	jnz	L(cross_page_less_vec)
+#  ifndef USE_AS_WCSLEN
+	movl	%edx, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+#  endif
+	movl	$CHAR_PER_VEC, %eax
+	subl	%ecx, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	ja	L(cross_page_continue)
+	movl	%esi, %eax
+	ret
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	/* Select min of length and position of first null.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+# endif
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 65769f3c2..f10741c07 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
 # define strlen __strlen_sse2
 #endif
 
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 000000000..b7657282b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,263 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+   Copyright (C) 2012-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU		pminud
+# define PCMPEQ		pcmpeqd
+# define SHIFT_RETURN	shrq $2, %rax
+#else
+# define PMINU		pminub
+# define PCMPEQ		pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+	%rdi   - s
+	%r10  (s+n) & (~(64-1))
+	%r11   s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+#define FIND_ZERO	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
+	salq	$16, %rdx;	\
+	salq	$16, %rcx;	\
+	orq	%rsi, %rdx;	\
+	orq	%r8, %rcx;	\
+	salq	$32, %rcx;	\
+	orq	%rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0.  */
+	test	%RSI_LP, %RSI_LP
+	jne	L(n_nonzero)
+	xor	%rax, %rax
+	ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so wcslen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	jnz	__wcslen_sse4_1
+	sal	$2, %RSI_LP
+# endif
+
+/* Initialize long lived registers.  */
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
+#endif
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	movq	%rdi, %rax
+	movq	%rdi, %rcx
+	andq	$4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+	cmpq	$4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	ja	L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
+# define STRNLEN_PROLOG	\
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG  andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string.  */
+#define PROLOG(lab)	\
+	movq	%rdi, %rcx;	\
+	xorq	%rax, %rcx;	\
+	STRNLEN_PROLOG;	\
+	sarq	%cl, %rdx;	\
+	test	%rdx, %rdx;	\
+	je	L(lab);	\
+	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
+	ret
+
+#ifdef AS_STRNLEN
+	andq	$-16, %rax
+	FIND_ZERO
+#else
+	/* Test first 16 bytes unaligned.  */
+	movdqu	(%rax), %xmm4
+	PCMPEQ	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
+	test	%edx, %edx
+	je 	L(next48_bytes)
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+	andq	$-16, %rax
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+#endif
+
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq	$-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1).  */
+L(strnlen_ret):
+	bts	%rsi, %rdx
+	sarq	%cl, %rdx
+	test	%rdx, %rdx
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	SHIFT_RETURN
+	ret
+#endif
+	.p2align 4
+L(loop_init):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+#ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq	$64, %rax
+	cmpq	%rax, %r10
+	je	L(exit_end)
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit_end):
+	cmp	%rax, %r11
+	je	L(first) /* Do not read when end is at page boundary.  */
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+L(first):
+	bts	%r11, %rdx
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+	.p2align 4
+L(exit):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+	.p2align 4
+L(loop):
+
+	movdqa	64(%rax), %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit64)
+
+	subq	$-128, %rax
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit0)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit64):
+	addq	$64, %rax
+L(exit0):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S
deleted file mode 100644
index f1d3fefdd..000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* strncasecmp_l optimized with AVX.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define STRCMP_SSE42 __strncasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 000000000..58c05dcfb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x)	x ## _rtm
+#define GLABEL(x)	_GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN	jmp L(return_vzeroupper)
+
+#define SECTION(p)	p##.avx.rtm
+#define OVERFLOW_STRCMP	__strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 000000000..48c0aa21f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP	__strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 000000000..8a5af3695
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
new file mode 100644
index 000000000..0dcea18db
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_avx2_rtm
+#include "strcat-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S
new file mode 100644
index 000000000..8884f0237
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_evex
+#include "strcat-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
new file mode 100644
index 000000000..68bad365b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRCMP	__strncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
index 1678bcc23..f138e9f1f 100644
--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -1,3 +1,4 @@
 #define STRCMP	__strncmp_avx2
 #define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2
 #include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
new file mode 100644
index 000000000..a1d53e8c9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
@@ -0,0 +1,3 @@
+#define STRCMP	__strncmp_evex
+#define USE_AS_STRNCMP 1
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index a565626ae..f94a42178 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -30,16 +30,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
new file mode 100644
index 000000000..79e708329
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_avx2_rtm
+#include "strcpy-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S
new file mode 100644
index 000000000..40e391f0d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
new file mode 100644
index 000000000..04f1626a5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2_rtm
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
new file mode 100644
index 000000000..722022f30
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.S
deleted file mode 100644
index c5b95d08f..000000000
--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S
+++ /dev/null
@@ -1,29 +0,0 @@
-/* strpbrk optimized with SSE2.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strcspn __strpbrk_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strpbrk)
-#endif
-
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.c b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
new file mode 100644
index 000000000..8a58f051c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c
@@ -0,0 +1,28 @@
+/* strpbrk optimized with SSE2.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRPBRK __strpbrk_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRPBRK)
+#endif
+
+#include <string/strpbrk.c>
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
new file mode 100644
index 000000000..5def14ec1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S
@@ -0,0 +1,12 @@
+#ifndef STRRCHR
+# define STRRCHR __strrchr_avx2_rtm
+#endif
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 53ea44530..b8dec737d 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,209 +27,322 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
 # endif
 
+# ifndef SECTION
+#  define SECTION(p)	p##.avx
+# endif
+
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section .text.avx,"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(return_null)
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
+
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweights loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(first_aligned_loop)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jnz	L(first_aligned_loop_return)
 
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at at time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(return_new_match)
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
+	/* This is the hot patch. We know CHAR is inbounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
+	VZEROUPPER_RETURN
+
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER
-	ret
-
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	VZEROUPPER
-	ret
-
-END (STRRCHR)
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
new file mode 100644
index 000000000..f5b6d755c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -0,0 +1,374 @@
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+#  define STRRCHR	__strrchr_evex
+# endif
+
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
+
+# ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMP	vpcmpd
+# else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMP	vpcmpb
+# endif
+
+# define XMMZERO	xmm16
+# define YMMZERO	ymm16
+# define YMMMATCH	ymm17
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
+
+
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
+	/* Broadcast CHAR to YMMMATCH.  */
+	VPBROADCAST %esi, %YMMMATCH
+
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
+
+L(page_cross_continue):
+	VMOVU	(%rdi), %YMM1
+	/* k0 has a 1 for each zero CHAR in YMM1.  */
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+	/* fallthrough: zero CHAR in first VEC.  */
+
+	/* K1 has a 1 for each search CHAR match in YMM1.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Build mask up until first zero CHAR (used to mask of
+	   potential search CHAR matches past the end of the string).
+	 */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	/* Get last match (the `andl` removed any out of bounds
+	   matches).  */
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+L(ret0):
+	ret
+
+	/* Returns for first vec x1/x2/x3 have hard coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* eax non-zero if search CHAR in range.  */
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2 then need to check for earlier
+	   matches (in YMM1).  */
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match. Test if any
+	   matches between either of them. Otherwise check YMM1.  */
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	/* Guranteed that YMM2 and YMM3 are within range so merge the
+	   two bitmasks then get last result.  */
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check YMM2 for last match first. If no match try YMM1.  */
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	 */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer incase YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
+	kmovd	%k0, %ecx
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
+
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path persay.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
+	movq	%rdi, %rsi
+	kunpck	%k2, %k3, %k4
+
+	.p2align 4
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* branch here because there is a significant advantage interms
+	   of output dependency chance in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi). `xor rdi,
+	   rax` sets pointer will all page offset bits cleared so
+	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+	   before page cross (guranteed to be safe to read). Doing this
+	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+	   a bit of code size.  */
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the begining of
+	   src (rdi).  */
+# ifdef USE_AS_WCSRCHR
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
+# endif
+	shrxl	%SHIFT_REG, %ecx, %ecx
+
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+
+	/* Found zero CHAR so need to test for search CHAR.  */
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	/* Shift out search CHAR matches that are before the begining of
+	   src (rdi).  */
+	shrxl	%SHIFT_REG, %eax, %eax
+
+	/* Check if any search CHAR match in range.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+L(ret3):
+	ret
+
+END(STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 67c30d026..a56300bc1 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
 
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index a17196296..3bcc479f1 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
     return 0;
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return __strspn_sse2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return __strspn_sse2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return __strspn_sse2 (s, a);
     }
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
 
-  offset = (int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
+      __m128i adj_value = __m128i_shift_right (value, offset);
 
-      value = __m128i_shift_right (value, offset);
-
-      int length = _mm_cmpistri (mask, value, 0x12);
+      unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
       /* No need to check CFlag since it is always 1.  */
       if (length < 16 - offset)
 	return length;
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
-      if (index < 16 - offset)
+      maskz = _mm_cmpeq_epi8 (value, zero);
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
 	return length;
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x12);
-      int cflag = _mm_cmpistrc (mask, value, 0x12);
+      unsigned int index = _mm_cmpistri (mask, value, 0x12);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
       if (cflag)
 	return (size_t) (aligned + index - s);
       aligned += 16;
diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.S
deleted file mode 100644
index e919fe492..000000000
--- a/sysdeps/x86_64/multiarch/strspn-sse2.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strspn optimized with SSE2.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strspn __strspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strspn)
-#endif
-
-#include <sysdeps/x86_64/strspn.S>
diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.c b/sysdeps/x86_64/multiarch/strspn-sse2.c
new file mode 100644
index 000000000..f5e5686db
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strspn-sse2.c
@@ -0,0 +1,28 @@
+/* strspn optimized with SSE2.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRSPN __strspn_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRSPN)
+#endif
+
+#include <string/strspn.c>
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
new file mode 100644
index 000000000..d49dbbf0b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2_rtm
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
new file mode 100644
index 000000000..7cb8f1e41
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_evex
+#define USE_AS_WCSCHR 1
+#include "strchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
new file mode 100644
index 000000000..d6ca2b806
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2_rtm
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
new file mode 100644
index 000000000..42e73e51e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_evex
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
new file mode 100644
index 000000000..35658d736
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2_rtm
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
new file mode 100644
index 000000000..bdafa83bd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 000000000..7e62621af
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen	__wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index f89bed42a..3032061d3 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
 # undef __wcslen
 
 # define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
 
 libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
 weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
new file mode 100644
index 000000000..f467582cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2_rtm
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
+#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
index 4fa1de4d3..e9ede522b 100644
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
@@ -1,5 +1,5 @@
 #define STRCMP __wcsncmp_avx2
 #define USE_AS_STRNCMP 1
 #define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP	__wcscmp_avx2
 #include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
new file mode 100644
index 000000000..8a8e31071
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_evex
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
new file mode 100644
index 000000000..7437ebee2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2_rtm
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
new file mode 100644
index 000000000..24773bb4e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb0..5fa51fe07 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
 #define AS_STRNLEN
 #define strlen	__wcsnlen_sse4_1
 
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 81b1a221f..2963fbe05 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,27 +24,7 @@
 # undef __wcsnlen
 
 # define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
-  const struct cpu_features* cpu_features = __get_cpu_features ();
-
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
-    return OPTIMIZE (sse4_1);
-
-  return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"
 
 libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
 weak_alias (__wcsnlen, wcsnlen);
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
new file mode 100644
index 000000000..9bf760833
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2_rtm
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
new file mode 100644
index 000000000..c64602f7d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_evex
+#define USE_AS_WCSRCHR 1
+#include "strrchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index a36034b40..00f69f2be 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR	__wcsrchr_sse2
 #endif
-
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
new file mode 100644
index 000000000..58ed21db0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2_rtm
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
new file mode 100644
index 000000000..a346cd35a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
new file mode 100644
index 000000000..06cd0f9f5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_evex
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
index 393d52065..74509dcdd 100644
--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -26,7 +26,7 @@
 # undef __wmemchr
 
 # define SYMBOL_NAME wmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
 
 libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
 weak_alias (__wmemchr, wmemchr)
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
new file mode 100644
index 000000000..31104d121
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe_rtm
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 000000000..4726d74aa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 824e64823..ca70b540e 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strcasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strcasecmp, strcasecmp)
@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp)
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END2 (__strncasecmp)
 # ifndef NO_NOLOCALE_ALIAS
 weak_alias (__strncasecmp, strncasecmp)
@@ -149,22 +147,22 @@ ENTRY (STRCMP)
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-.Lbelowupper:
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-.Ltopupper:
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+.Lcase_add:
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	.Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
-	movdqa	.Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
-	movdqa	.Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+	movdqa	.Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+	movdqa	.Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+	movdqa	.Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
@@ -175,22 +173,18 @@ ENTRY (STRCMP)
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm8;					\
-	movdqa	UCHIGH_reg, %xmm9;				\
-	movdqa	reg2, %xmm10;					\
-	movdqa	UCHIGH_reg, %xmm11;				\
-	pcmpgtb	UCLOW_reg, %xmm8;				\
-	pcmpgtb	reg1, %xmm9;					\
-	pcmpgtb	UCLOW_reg, %xmm10;				\
-	pcmpgtb	reg2, %xmm11;					\
-	pand	%xmm9, %xmm8;					\
-	pand	%xmm11, %xmm10;					\
-	pand	LCQWORD_reg, %xmm8;				\
-	pand	LCQWORD_reg, %xmm10;				\
-	por	%xmm8, reg1;					\
-	por	%xmm10, reg2
-	TOLOWER (%xmm1, %xmm2)
+#  define TOLOWER(reg1, reg2) \
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	movdqa	LCASE_MIN_reg, %xmm9;					\
+	paddb	reg1, %xmm8;					\
+	paddb	reg2, %xmm9;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	pandn	CASE_ADD_reg, %xmm9;					\
+	paddb	%xmm8, reg1;					\
+	paddb	%xmm9, reg2
+	TOLOWER	(%xmm1, %xmm2)
 #else
 # define TOLOWER(reg1, reg2)
 #endif
@@ -2232,8 +2226,8 @@ LABEL(strcmp_exitz):
 
 	.p2align 4
 LABEL(Byte0):
-	movzx	(%rsi), %ecx
-	movzx	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
 
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S
deleted file mode 100644
index 6035a274c..000000000
--- a/sysdeps/x86_64/strcspn.S
+++ /dev/null
@@ -1,122 +0,0 @@
-/* strcspn (str, ss) -- Return the length of the initial segment of STR
-			which contains no characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
-   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
-   Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-	.text
-ENTRY (strcspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup skipset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from skipset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 1(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 2(%rax), %cl	/* get byte from skipset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-
-	movb 3(%rax), %cl	/* get byte from skipset */
-	addq $4, %rax		/* increment skipset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in skipset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from skipset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the skipset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the skipset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(4)			/* yes => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	je L(5)			/* yes => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* yes => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	cmpb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jne L(3)		/* no => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove skipset */
-	cfi_adjust_cfa_offset(-256)
-#ifdef USE_AS_STRPBRK
-	xorl %edx,%edx
-	orb %cl, %cl		/* was last character NUL? */
-	cmovzq %rdx, %rax	/* Yes:	return NULL */
-#else
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-#endif
-	ret
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index d223ea170..8422c15cc 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
-   Copyright (C) 2012-2021 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+   Copyright (C) 2021 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
 
-#ifdef AS_WCSLEN
-# define PMINU		pminud
-# define PCMPEQ		pcmpeqd
-# define SHIFT_RETURN	shrq $2, %rax
-#else
-# define PMINU		pminub
-# define PCMPEQ		pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm3 - zero
-	%rdi   - s
-	%r10  (s+n) & (~(64-1))
-	%r11   s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
-#define FIND_ZERO	\
-	PCMPEQ	(%rax), %xmm0;	\
-	PCMPEQ	16(%rax), %xmm1;	\
-	PCMPEQ	32(%rax), %xmm2;	\
-	PCMPEQ	48(%rax), %xmm3;	\
-	pmovmskb	%xmm0, %esi;	\
-	pmovmskb	%xmm1, %edx;	\
-	pmovmskb	%xmm2, %r8d;	\
-	pmovmskb	%xmm3, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
-	test	%RSI_LP, %RSI_LP
-	jne	L(n_nonzero)
-	xor	%rax, %rax
-	ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
-	shl	$2, %RSI_LP
-# endif
-
-/* Initialize long lived registers.  */
-
-	add	%RDI_LP, %RSI_LP
-	mov	%RSI_LP, %R10_LP
-	and	$-64, %R10_LP
-	mov	%RSI_LP, %R11_LP
-#endif
-
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
-	ja	L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes.  */
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG  andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string.  */
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
-	SHIFT_RETURN;		\
-	ret
-
-#ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm4
-	PCMPEQ	%xmm0, %xmm4
-	pmovmskb	%xmm4, %edx
-	test	%edx, %edx
-	je 	L(next48_bytes)
-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
-	SHIFT_RETURN
-	ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	PCMPEQ 16(%rax), %xmm1
-	PCMPEQ 32(%rax), %xmm2
-	PCMPEQ 48(%rax), %xmm3
-	pmovmskb	%xmm1, %edx
-	pmovmskb	%xmm2, %r8d
-	pmovmskb	%xmm3, %ecx
-	salq	$16, %rdx
-	salq	$16, %rcx
-	orq	%r8, %rcx
-	salq	$32, %rcx
-	orq	%rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm1-3 are zero so we do not have to
-	   zero them.  */
-	PROLOG(loop)
-
-	.p2align 4
-L(cross_page):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
-	sarq	%cl, %rdx
-	test	%rdx, %rdx
-	je	L(loop_init)
-	bsfq	%rdx, %rax
-	SHIFT_RETURN
-	ret
-#endif
-	.p2align 4
-L(loop_init):
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-#ifdef AS_STRNLEN
-	.p2align 4
-L(loop):
-
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
-
-	movdqa	(%rax), %xmm0
-	PMINU	16(%rax), %xmm0
-	PMINU	32(%rax), %xmm0
-	PMINU	48(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-	.p2align 4
-L(exit):
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#else
-
-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
-	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm0
-	PMINU	80(%rax), %xmm0
-	PMINU	96(%rax), %xmm0
-	PMINU	112(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit64)
-
-	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm0
-	PMINU	16(%rax), %xmm0
-	PMINU	32(%rax), %xmm0
-	PMINU	48(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit0)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit64):
-	addq	$64, %rax
-L(exit0):
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#endif
-
-END(strlen)
 libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S
deleted file mode 100644
index 21888a5b9..000000000
--- a/sysdeps/x86_64/strpbrk.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define strcspn strpbrk
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index dfd09fe95..fc1598bb1 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
 
 #include <sysdep.h>
 
+#ifndef STRRCHR
+# define STRRCHR	strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
 	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page)
-	movdqu	(%rdi), %xmm0
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
 	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
-	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
-	je	L(exit)
-	bsrq	%rax, %rax
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
 	ret
 
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
 	.p2align 4
-L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
-	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
-	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	salq	$48, %rdx
-	salq	$16, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
-	orq	%rdx, %rax
-	je	L(loop_header2)
-	leaq	-1(%rax), %rcx
-	xorq	%rax, %rcx
-	andq	%rcx, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
-L(loop_header2):
-	testq	%rsi, %rsi
-	movq	%rdi, %rcx
-	je	L(no_c_found)
-L(loop_header):
-	addq	$64, %rdi
-	pxor	%xmm7, %xmm7
-	andq	$-64, %rdi
-	jmp	L(loop_entry)
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
 
 	.p2align 4
-L(loop64):
-	testq	%rdx, %rdx
-	cmovne	%rdx, %rsi
-	cmovne	%rdi, %rcx
-	addq	$64, %rdi
-L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	salq	$32, %r10
-	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
-	orq	%r9, %rdx
-	salq	$48, %r8
-	orq	%r8, %rdx
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
 	testl	%eax, %eax
-	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
-	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
-	orq	%r10, %rax
-	orq	%r9, %rax
-	salq	$48, %r8
-	orq	%r8, %rax
-	leaq	-1(%rax), %r8
-	xorq	%rax, %r8
-	andq	%r8, %rdx
-	cmovne	%rdi, %rcx
-	cmovne	%rdx, %rsi
-	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons as since we mask
+	   of zeros after first zero (instead of using the full
+	   comparison) we can't gurantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
 	.p2align 4
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
 
 	.p2align 4
-L(exit):
-	xorl	%eax, %eax
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so much be new occurence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
 	ret
 
 	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons as since we mask
+	   of zeros after first zero (instead of using the full
+	   comparison) we can't gurantee no interference between match
+	   after end of string and valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
 L(cross_page):
-	movq	%rdi, %rax
-	pxor	%xmm0, %xmm0
-	andq	$-64, %rax
-	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
-	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
-	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
-	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
-	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm0, %xmm6
-	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
-	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
-	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
-	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
-	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
-	salq	$16, %rsi
-	salq	$48, %rcx
-	orq	%r9, %rsi
-	orq	%r8, %rsi
-	orq	%rcx, %rsi
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
 	movl	%edi, %ecx
-	subl	%eax, %ecx
-	shrq	%cl, %rdx
-	shrq	%cl, %rsi
-	testq	%rdx, %rdx
-	je	L(loop_header2)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rax, %rsi
-	je	L(exit)
-	bsrq	%rsi, %rax
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
 	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
 	ret
-END (strrchr)
+END(STRRCHR)
 
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+	weak_alias (STRRCHR, rindex)
+	libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S
deleted file mode 100644
index e878f3288..000000000
--- a/sysdeps/x86_64/strspn.S
+++ /dev/null
@@ -1,115 +0,0 @@
-/* strspn (str, ss) -- Return the length of the initial segment of STR
-			which contains only characters from SS.
-   For AMD x86-64.
-   Copyright (C) 1994-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>.
-   Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>.
-   Adopted for x86-64 by Andreas Jaeger <aj@suse.de>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-	.text
-ENTRY (strspn)
-
-	movq %rdi, %rdx		/* Save SRC.  */
-
-	/* First we create a table with flags for all possible characters.
-	   For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
-	   supported by the C string functions we have 256 characters.
-	   Before inserting marks for the stop characters we clear the whole
-	   table.  */
-	movq %rdi, %r8			/* Save value.  */
-	subq $256, %rsp			/* Make space for 256 bytes.  */
-	cfi_adjust_cfa_offset(256)
-	movl $32,  %ecx			/* 32*8 bytes = 256 bytes.  */
-	movq %rsp, %rdi
-	xorl %eax, %eax			/* We store 0s.  */
-	cld
-	rep
-	stosq
-
-	movq %rsi, %rax			/* Setup stopset.  */
-
-/* For understanding the following code remember that %rcx == 0 now.
-   Although all the following instruction only modify %cl we always
-   have a correct zero-extended 64-bit value in %rcx.  */
-
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
-	testb %cl, %cl		/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 1(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 2(%rax), %cl	/* get byte from stopset */
-	testb $0xff, %cl	/* is NUL char? */
-	jz L(1)			/* yes => start compare loop */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-
-	movb 3(%rax), %cl	/* get byte from stopset */
-	addq $4, %rax		/* increment stopset pointer */
-	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
-	testb $0xff, %cl	/* is NUL char? */
-	jnz L(2)		/* no => process next dword from stopset */
-
-L(1):	leaq -4(%rdx), %rax	/* prepare loop */
-
-	/* We use a neat trick for the following loop.  Normally we would
-	   have to test for two termination conditions
-	   1. a character in the stopset was found
-	   and
-	   2. the end of the string was found
-	   But as a sign that the character is in the stopset we store its
-	   value in the table.  But the value of NUL is NUL so the loop
-	   terminates for NUL in every case.  */
-
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
-
-	movb (%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(4)			/* no => return */
-
-	movb 1(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(5)			/* no => return */
-
-	movb 2(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jz L(6)			/* no => return */
-
-	movb 3(%rax), %cl	/* get byte from string */
-	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
-	jnz L(3)		/* yes => start loop again */
-
-	incq %rax		/* adjust pointer */
-L(6):	incq %rax
-L(5):	incq %rax
-
-L(4):	addq $256, %rsp		/* remove stopset */
-	cfi_adjust_cfa_offset(-256)
-	subq %rdx, %rax		/* we have to return the number of valid
-				   characters, so compute distance to first
-				   non-valid character */
-	ret
-END (strspn)
-libc_hidden_builtin_def (strspn)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index d07b8f0aa..7bebdeb21 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -95,6 +95,28 @@ lose:									      \
 #define R14_LP	r14
 #define R15_LP	r15
 
+/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
+   to avoid RTM abort triggered by VZEROUPPER inside transactionally.  */
+#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
+	xtest;							\
+	jz	1f;						\
+	vzeroall;						\
+	ret;							\
+1:								\
+	vzeroupper;						\
+	ret
+
+/* Zero upper vector registers and return.  */
+#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
+# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+	VZEROUPPER;						\
+	ret
+#endif
+
+#ifndef VZEROUPPER_RETURN
+# define VZEROUPPER_RETURN	VZEROUPPER; ret
+#endif
+
 #else	/* __ASSEMBLER__ */
 
 /* Long and pointer size in bytes.  */
diff --git a/sysdeps/x86_64/tst-rsi-strlen.c b/sysdeps/x86_64/tst-rsi-strlen.c
new file mode 100644
index 000000000..a80c4f85c
--- /dev/null
+++ b/sysdeps/x86_64/tst-rsi-strlen.c
@@ -0,0 +1,81 @@
+/* Test strlen with 0 in the RSI register.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcslen"
+#else
+# define TEST_NAME "strlen"
+#endif /* WIDE */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+#ifdef WIDE
+# include <wchar.h>
+# define STRLEN wcslen
+# define CHAR wchar_t
+#else
+# define STRLEN strlen
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (STRLEN, 1)
+
+typedef size_t (*proto_t) (const CHAR *);
+
+typedef struct
+{
+  void (*fn) (void);
+} parameter_t;
+
+size_t
+__attribute__ ((weak, noinline, noclone))
+do_strlen (parameter_t *a, int zero, const CHAR *str)
+{
+  return CALL (a, str);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR) - 1;
+  CHAR *buf = (CHAR *) buf2;
+  buf[size] = 0;
+
+  parameter_t a;
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      a.fn = impl->fn;
+      /* NB: Pass 0 in RSI.  */
+      size_t res = do_strlen (&a, 0, buf);
+      if (res != size)
+	{
+	  error (0, 0, "Wrong result in function %s: %zu != %zu",
+		 impl->name, res, size);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/tst-rsi-wcslen.c b/sysdeps/x86_64/tst-rsi-wcslen.c
new file mode 100644
index 000000000..f45a7dfb5
--- /dev/null
+++ b/sysdeps/x86_64/tst-rsi-wcslen.c
@@ -0,0 +1,20 @@
+/* Test wcslen with 0 in the RSI register.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-rsi-strlen.c"
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 61edea1d1..ad066863a 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
 	pxor	%xmm0, %xmm0
 
 	lea	32(%rdi), %rax
-	lea	16(%rdi), %rcx
+	addq	$16, %rdi
 	and	$-16, %rax
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
 	pxor	%xmm1, %xmm1
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
 	pxor	%xmm2, %xmm2
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
 	pxor	%xmm3, %xmm3
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm0
 	pmovmskb %xmm0, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm1
 	pmovmskb %xmm1, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	pcmpeqd	(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+	addq	$16, %rax
 	test	%edx, %edx
-	lea	16(%rax), %rax
 	jnz	L(exit)
 
 	and	$-0x40, %rax
@@ -133,104 +133,100 @@ L(aligned_64_loop):
 	pminub	%xmm0, %xmm2
 	pcmpeqd	%xmm3, %xmm2
 	pmovmskb %xmm2, %edx
+	addq	$64, %rax
 	test	%edx, %edx
-	lea	64(%rax), %rax
 	jz	L(aligned_64_loop)
 
 	pcmpeqd	-64(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$48, %rdi
 	test	%edx, %edx
-	lea	48(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	%xmm1, %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	-32(%rax), %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
 	jnz	L(exit)
 
 	pcmpeqd	%xmm6, %xmm3
 	pmovmskb %xmm3, %edx
+    addq	$-16, %rdi
 	test	%edx, %edx
-	lea	-16(%rcx), %rcx
-	jnz	L(exit)
-
-	jmp	L(aligned_64_loop)
+	jz	L(aligned_64_loop)
 
 	.p2align 4
 L(exit):
-	sub	%rcx, %rax
+	sub	%rdi, %rax
 	shr	$2, %rax
 	test	%dl, %dl
 	jz	L(exit_high)
 
-	mov	%dl, %cl
-	and	$15, %cl
+	andl	$15, %edx
 	jz	L(exit_1)
 	ret
 
-	.p2align 4
+	/* No align here. Naturally aligned % 16 == 1.  */
 L(exit_high):
-	mov	%dh, %ch
-	and	$15, %ch
+	andl	$(15 << 8), %edx
 	jz	L(exit_3)
 	add	$2, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_1):
 	add	$1, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_3):
 	add	$3, %rax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail0):
-	xor	%rax, %rax
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail1):
-	mov	$1, %rax
+	movl	$1, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail2):
-	mov	$2, %rax
+	movl	$2, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail3):
-	mov	$3, %rax
+	movl	$3, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail4):
-	mov	$4, %rax
+	movl	$4, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail5):
-	mov	$5, %rax
+	movl	$5, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail6):
-	mov	$6, %rax
+	movl	$6, %eax
 	ret
 
-	.p2align 4
+	.p2align 3
 L(exit_tail7):
-	mov	$7, %rax
+	movl	$7, %eax
 	ret
 
 END (__wcslen)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 6b318d3f2..9006f2220 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
 
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	punpckldq %xmm1, %xmm1
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR	wcsrchr
+#endif
 
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm3
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm3, %rdx
-	pmovmskb %xmm0, %rax
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqd	%xmm3, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm3, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqd	%xmm4, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm4
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm4, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm5
-	pcmpeqd	%xmm5, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm5
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm5, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-	.p2align 4
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	$15, %cl
-	jnz	L(find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_value)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	$15, %cl
-	jnz	L(prolog_find_zero_in_first_wchar)
-	test	%cl, %cl
-	jnz	L(prolog_find_zero_in_second_wchar)
-	test	$15, %ch
-	jnz	L(prolog_find_zero_in_third_wchar)
-
-	and	$1 << 13 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %ah
-	jnz	L(match_fourth_wchar)
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_first_wchar):
-	test	$1, %rax
-	jz	L(return_null)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_second_wchar):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_null)
-
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_in_third_wchar):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_null)
-
-	test	%ah, %ah
-	jnz	L(match_third_wchar)
-	test	$15 << 4, %al
-	jnz	L(match_second_wchar)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_second_wchar):
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_third_wchar):
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_fourth_wchar):
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (wcsrchr)
+#include "../strrchr.S"