Manual merge of version 2.31-13+rpi1+deb11u3 and 2.31-13+deb11u6 to produce 2.31... archive/raspbian/2.31-13+rpi1+deb11u6 raspbian/2.31-13+rpi1+deb11u6
author    Peter Michael Green <plugwash@raspbian.org>
          Sat, 20 May 2023 09:21:41 +0000 (10:21 +0100)
committer Peter Michael Green <plugwash@raspbian.org>
          Sat, 20 May 2023 09:21:41 +0000 (10:21 +0100)
debian/changelog
debian/patches/git-updates.diff

index 098a25ed5281f4588154c4f8f02bc63526989269,a12b2b4320b2d176702af33eceacb12df3d28f9f..157c12b7a2d3fbdf0eb937a272a3eaaa42ed96dd
@@@ -1,19 -1,48 +1,65 @@@
- glibc (2.31-13+rpi1+deb11u3) bullseye-staging; urgency=medium
++glibc (2.31-13+rpi1+deb11u6) bullseye-staging; urgency=medium
 +
 +  [changes brought forward from 2.25-2+rpi1 by Peter Michael Green <plugwash@raspbian.org> at Wed, 29 Nov 2017 03:00:21 +0000]
 +  * Disable testsuite.
 +
 +  [changes introduced in 2.29-9+rpi1 by Peter Michael Green]
 +  * Change mode on scripts/check-obsolete-constructs.py to 644;
 +    dgit does not like mode 755 files created by patches and the
 +    script does not seem to be used for anything in the Debian
 +    package.
 +
 +  [changes introduced in 2.31-13+rpi1+deb11u3 by Peter Michael Green]
 +  * Change mode on sysdeps/x86_64/configure to 644, same dgit issue
 +    as above.
 + 
-  -- Peter Michael Green <plugwash@raspbian.org>  Tue, 29 Mar 2022 22:11:31 +0000
++ -- Peter Michael Green <plugwash@raspbian.org>  Sat, 20 May 2023 09:21:02 +0000
++
+ glibc (2.31-13+deb11u6) bullseye; urgency=medium
+
+   [ Aurelien Jarno ]
+   * debian/patches/git-updates.diff: update from upstream stable branch:
+     - Drop debian/patches/amd64/local-require-bmi-in-avx2-ifunc.diff
+       (obsolete).
+     - Fix memory leak in printf-family functions with long multibyte strings.
+     - Fix a crash in printf-family due to width/precision-dependent
+       allocations.
+     - Fix a segfault in printf handling thousands separator.
+     - Fix an overflow in the AVX2 implementation of wcsnlen when crossing
+       pages.
+
+  -- Aurelien Jarno <aurel32@debian.org>  Wed, 19 Apr 2023 23:17:51 +0200
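
The printf and wcsnlen items above are separate upstream fixes. As a rough illustration only, the following standalone C program exercises the two printf code paths named in the entry (locale-dependent thousands grouping and width-driven work-buffer sizing); the locale name and the field width are arbitrary choices, not taken from the upstream test cases.

#include <locale.h>
#include <stdio.h>

int
main (void)
{
  /* The apostrophe flag requests thousands grouping; it only has a
     visible effect in a locale that defines a separator (the locale
     name here is an assumption about what is installed).  */
  setlocale (LC_NUMERIC, "en_US.UTF-8");
  printf ("%'d\n", 1234567);

  /* A very large field width forces printf to size internal work
     buffers from the width, the general area of the width/precision
     fix listed above.  */
  printf ("%1000d\n", 42);

  return 0;
}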
+ glibc (2.31-13+deb11u5) bullseye; urgency=medium
+
+   * debian/patches/local-require-bmi-in-avx2-ifunc.diff: new patch extracted
+     from an upstream commit, to change the AVX2 ifunc selector to require the
+     BMI2 feature. It happened that the wmemchr and wcslen changes backported
+     in 2.31-13+deb11u4 relied on that commit which got forgotten.
+     Closes: #1019855.
+
+  -- Aurelien Jarno <aurel32@debian.org>  Fri, 14 Oct 2022 21:35:00 +0200
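
The deb11u5 entry is about ifunc selection: an AVX2 string routine that also executes BMI2 instructions must only be chosen when BMI2 is present. A minimal sketch of that selection rule, assuming hypothetical implementation names and using GCC's x86-specific __builtin_cpu_supports rather than glibc's internal feature-check macros:

#include <wchar.h>

/* Hypothetical optimized and baseline implementations; only the
   selection rule matters for this sketch.  */
extern size_t wcslen_avx2 (const wchar_t *);   /* uses AVX2 and BMI2 */
extern size_t wcslen_sse2 (const wchar_t *);   /* baseline fallback */

/* Resolver: pick the optimized routine only when BOTH features are
   available, which is the rule the deb11u5 patch restores.  */
static size_t (*select_wcslen (void)) (const wchar_t *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return wcslen_avx2;
  return wcslen_sse2;
}

size_t
my_wcslen (const wchar_t *s)
{
  static size_t (*impl) (const wchar_t *);
  if (impl == NULL)
    impl = select_wcslen ();
  return impl (s);
}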
+ glibc (2.31-13+deb11u4) bullseye; urgency=medium
+
+   [ Aurelien Jarno ]
+   * debian/debhelper.in/libc-dev.NEWS: New file to explain how to update
+     programs to use the TI-RPC library instead of the Sun RPC one.  Closes:
+     #1014735.
+   * debian/patches/git-updates.diff: update from upstream stable branch:
+     - Fix an off-by-one buffer overflow/underflow in getcwd() (CVE-2021-3999).
+     - Fix an overflow bug in the SSE2 and AVX2 implementations of wmemchr.
+     - Fix an overflow bug in the SSE4.1 and AVX2 implementations of wcslen and
+       wcsncat.
+     - Fix an overflow bug in the AVX2 and EVEX implementation of wcsncmp.
+     - Add a few EVEX optimized string functions to fix a performance issue (up
+       to 40%) with Skylake-X processors.
+     - Make grantpt usable after multi-threaded fork.  Closes: #1015740.
+     - debian/patches/hurd-i386/git-posix_openpt.diff: rebase.
+   * debian/rules.d/build.mk: pass --with-default-link=no to configure to
+     ensure that libio vtable protection is enabled.
+
+  -- Aurelien Jarno <aurel32@debian.org>  Fri, 26 Aug 2022 23:32:46 +0200
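
Among the deb11u4 items, the grantpt change is the one most visible at the API level: grantpt could previously fail in the child of a multi-threaded fork. The following is an ordinary, standard pseudo-terminal setup sequence that goes through grantpt; it is a usage example, not the upstream reproducer.

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* Standard pty master setup: open, grant slave ownership, unlock.
     grantpt is the call fixed for the fork-after-threads case.  */
  int master = posix_openpt (O_RDWR | O_NOCTTY);
  if (master < 0 || grantpt (master) < 0 || unlockpt (master) < 0)
    {
      perror ("pty setup");
      return EXIT_FAILURE;
    }
  const char *name = ptsname (master);
  printf ("slave pty: %s\n", name != NULL ? name : "(unknown)");
  return 0;
}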
  
  glibc (2.31-13+deb11u3) bullseye; urgency=medium
  
index f6c05db170aa420ba4bdf21cb48d72f2ca00ff23,63246ab1a2b799e0589b023458f91c501eb646e0..cfa38a4802a8a0d065c161d293cfe5d808348303
@@@ -5888,2070 -7057,10745 +7057,10743 @@@ index 0000000000..48bb6d7ca
  +   Lesser General Public License for more details.
  +
  +   You should have received a copy of the GNU Lesser General Public
- +   License along with the GNU C Library; if not, see
+ +   License along with the GNU C Library.  If not, see
  +   <https://www.gnu.org/licenses/>.  */
  +
- +#ifndef _UNWIND_ARCH_H
- +#define _UNWIND_ARCH_H
+ +#include <sysdep.h>
  +
- +#include <unwind.h>
+ +/* Assumptions:
+ + *
+ + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ + *
+ + */
  +
- +static inline void *
- +unwind_arch_adjustment (void *prev, void *addr)
- +{
- +  return addr;
- +}
+ +#define dstin        x0
+ +#define src  x1
+ +#define count        x2
+ +#define dst  x3
+ +#define srcend       x4
+ +#define dstend       x5
+ +#define A_l  x6
+ +#define A_lw w6
+ +#define A_h  x7
+ +#define B_l  x8
+ +#define B_lw w8
+ +#define B_h  x9
+ +#define C_lw w10
+ +#define tmp1 x14
  +
- +#endif
- diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
- index 0a37397284..25ca8f8463 100644
- --- a/sysdeps/hppa/dl-fptr.c
- +++ b/sysdeps/hppa/dl-fptr.c
- @@ -172,8 +172,8 @@ make_fdesc (ElfW(Addr) ip, ElfW(Addr) gp)
-      }
-  
-   install:
- -  fdesc->ip = ip;
-    fdesc->gp = gp;
- +  fdesc->ip = ip;
-  
-    return (ElfW(Addr)) fdesc;
-  }
- @@ -350,7 +350,9 @@ ElfW(Addr)
-  _dl_lookup_address (const void *address)
-  {
-    ElfW(Addr) addr = (ElfW(Addr)) address;
- -  unsigned int *desc, *gptr;
- +  ElfW(Word) reloc_arg;
- +  volatile unsigned int *desc;
- +  unsigned int *gptr;
-  
-    /* Return ADDR if the least-significant two bits of ADDR are not consistent
-       with ADDR being a linker defined function pointer.  The normal value for
- @@ -367,7 +369,11 @@ _dl_lookup_address (const void *address)
-    if (!_dl_read_access_allowed (desc))
-      return addr;
-  
- -  /* Load first word of candidate descriptor.  It should be a pointer
- +  /* First load the relocation offset.  */
- +  reloc_arg = (ElfW(Word)) desc[1];
- +  atomic_full_barrier();
+ +#define A_q  q0
+ +#define B_q  q1
+ +#define C_q  q2
+ +#define D_q  q3
+ +#define E_q  q4
+ +#define F_q  q5
+ +#define G_q  q6
+ +#define H_q  q7
  +
- +  /* Then load first word of candidate descriptor.  It should be a pointer
-       with word alignment and point to memory that can be read.  */
-    gptr = (unsigned int *) desc[0];
-    if (((unsigned int) gptr & 3) != 0
- @@ -377,8 +383,8 @@ _dl_lookup_address (const void *address)
-    /* See if descriptor requires resolution.  The following trampoline is
-       used in each global offset table for function resolution:
-  
- -             ldw 0(r20),r22
- -             bv r0(r22)
- +             ldw 0(r20),r21
- +             bv r0(r21)
-               ldw 4(r20),r21
-       tramp:  b,l .-12,r20
-               depwi 0,31,2,r20
- @@ -389,7 +395,15 @@ _dl_lookup_address (const void *address)
-    if (gptr[0] == 0xea9f1fdd                  /* b,l .-12,r20     */
-        && gptr[1] == 0xd6801c1e                       /* depwi 0,31,2,r20 */
-        && (ElfW(Addr)) gptr[2] == elf_machine_resolve ())
- -    _dl_fixup ((struct link_map *) gptr[5], (ElfW(Word)) desc[1]);
- +    {
- +      struct link_map *l = (struct link_map *) gptr[5];
  +
- +      /* If gp has been resolved, we need to hunt for relocation offset.  */
- +      if (!(reloc_arg & PA_GP_RELOC))
- +     reloc_arg = _dl_fix_reloc_arg (addr, l);
+ +/* This implementation supports both memcpy and memmove and shares most code.
+ +   It uses unaligned accesses and branchless sequences to keep the code small,
+ +   simple and improve performance.
  +
- +      _dl_fixup (l, reloc_arg);
- +    }
-  
-    return (ElfW(Addr)) desc[0];
-  }
- diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h
- index 9e98366ea3..8ecff97706 100644
- --- a/sysdeps/hppa/dl-machine.h
- +++ b/sysdeps/hppa/dl-machine.h
- @@ -48,6 +48,14 @@
-  #define GOT_FROM_PLT_STUB (4*4)
-  #define PLT_ENTRY_SIZE (2*4)
-  
- +/* The gp slot in the function descriptor contains the relocation offset
- +   before resolution.  To distinguish between a resolved gp value and an
- +   unresolved relocation offset we set an unused bit in the relocation
- +   offset.  This would allow us to do a synchronzied two word update
- +   using this bit (interlocked update), but instead of waiting for the
- +   update we simply recompute the gp value given that we know the ip.  */
- +#define PA_GP_RELOC 1
+ +   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ +   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+ +   check in memmove is negligible since it is only required for large copies.
  +
-  /* Initialize the function descriptor table before relocations */
-  static inline void
-  __hppa_init_bootstrap_fdesc_table (struct link_map *map)
- @@ -117,10 +125,28 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t t,
-    volatile Elf32_Addr *rfdesc = reloc_addr;
-    /* map is the link_map for the caller, t is the link_map for the object
-       being called */
- -  rfdesc[1] = value.gp;
- -  /* Need to ensure that the gp is visible before the code
- -     entry point is updated */
- -  rfdesc[0] = value.ip;
+ +   Large copies use a software pipelined loop processing 64 bytes per
+ +   iteration.  The destination pointer is 16-byte aligned to minimize
+ +   unaligned accesses.  The loop tail is handled by always copying 64 bytes
+ +   from the end.  */
  +
- +  /* We would like the function descriptor to be double word aligned.  This
- +     helps performance (ip and gp then reside on the same cache line) and
- +     we can update the pair atomically with a single store.  The linker
- +     now ensures this alignment but we still have to handle old code.  */
- +  if ((unsigned int)reloc_addr & 7)
- +    {
- +      /* Need to ensure that the gp is visible before the code
- +         entry point is updated */
- +      rfdesc[1] = value.gp;
- +      atomic_full_barrier();
- +      rfdesc[0] = value.ip;
- +    }
- +  else
- +    {
- +      /* Update pair atomically with floating point store.  */
- +      union { ElfW(Word) v[2]; double d; } u;
+ +ENTRY (__memcpy_simd)
+ +     DELOUSE (0)
+ +     DELOUSE (1)
+ +     DELOUSE (2)
  +
- +      u.v[0] = value.ip;
- +      u.v[1] = value.gp;
- +      *(volatile double *)rfdesc = u.d;
- +    }
-    return value;
-  }
-  
- @@ -265,7 +291,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
-                    here.  The trampoline code will load the proper
-                    LTP and pass the reloc offset to the fixup
-                    function.  */
- -               fptr->gp = iplt - jmprel;
- +               fptr->gp = (iplt - jmprel) | PA_GP_RELOC;
-               } /* r_sym != 0 */
-             else
-               {
- diff --git a/sysdeps/hppa/dl-runtime.c b/sysdeps/hppa/dl-runtime.c
- new file mode 100644
- index 0000000000..885a3f1837
- --- /dev/null
- +++ b/sysdeps/hppa/dl-runtime.c
- @@ -0,0 +1,58 @@
- +/* On-demand PLT fixup for shared objects.  HPPA version.
- +   Copyright (C) 2019 Free Software Foundation, Inc.
- +   This file is part of the GNU C Library.
+ +     add     srcend, src, count
+ +     add     dstend, dstin, count
+ +     cmp     count, 128
+ +     b.hi    L(copy_long)
+ +     cmp     count, 32
+ +     b.hi    L(copy32_128)
  +
- +   The GNU C Library is free software; you can redistribute it and/or
- +   modify it under the terms of the GNU Lesser General Public
- +   License as published by the Free Software Foundation; either
- +   version 2.1 of the License, or (at your option) any later version.
+ +     /* Small copies: 0..32 bytes.  */
+ +     cmp     count, 16
+ +     b.lo    L(copy16)
+ +     ldr     A_q, [src]
+ +     ldr     B_q, [srcend, -16]
+ +     str     A_q, [dstin]
+ +     str     B_q, [dstend, -16]
+ +     ret
  +
- +   The GNU C Library is distributed in the hope that it will be useful,
- +   but WITHOUT ANY WARRANTY; without even the implied warranty of
- +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- +   Lesser General Public License for more details.
+ +     /* Copy 8-15 bytes.  */
+ +L(copy16):
+ +     tbz     count, 3, L(copy8)
+ +     ldr     A_l, [src]
+ +     ldr     A_h, [srcend, -8]
+ +     str     A_l, [dstin]
+ +     str     A_h, [dstend, -8]
+ +     ret
  +
- +   You should have received a copy of the GNU Lesser General Public
- +   License along with the GNU C Library; if not, write to the Free
- +   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- +   02111-1307 USA.  */
+ +     /* Copy 4-7 bytes.  */
+ +L(copy8):
+ +     tbz     count, 2, L(copy4)
+ +     ldr     A_lw, [src]
+ +     ldr     B_lw, [srcend, -4]
+ +     str     A_lw, [dstin]
+ +     str     B_lw, [dstend, -4]
+ +     ret
  +
- +/* Clear PA_GP_RELOC bit in relocation offset.  */
- +#define reloc_offset (reloc_arg & ~PA_GP_RELOC)
- +#define reloc_index  (reloc_arg & ~PA_GP_RELOC) / sizeof (PLTREL)
+ +     /* Copy 0..3 bytes using a branchless sequence.  */
+ +L(copy4):
+ +     cbz     count, L(copy0)
+ +     lsr     tmp1, count, 1
+ +     ldrb    A_lw, [src]
+ +     ldrb    C_lw, [srcend, -1]
+ +     ldrb    B_lw, [src, tmp1]
+ +     strb    A_lw, [dstin]
+ +     strb    B_lw, [dstin, tmp1]
+ +     strb    C_lw, [dstend, -1]
+ +L(copy0):
+ +     ret
  +
- +#include <elf/dl-runtime.c>
+ +     .p2align 4
+ +     /* Medium copies: 33..128 bytes.  */
+ +L(copy32_128):
+ +     ldp     A_q, B_q, [src]
+ +     ldp     C_q, D_q, [srcend, -32]
+ +     cmp     count, 64
+ +     b.hi    L(copy128)
+ +     stp     A_q, B_q, [dstin]
+ +     stp     C_q, D_q, [dstend, -32]
+ +     ret
  +
- +/* The caller has encountered a partially relocated function descriptor.
- +   The gp of the descriptor has been updated, but not the ip.  We find
- +   the function descriptor again and compute the relocation offset and
- +   return that to the caller.  The caller will continue on to call
- +   _dl_fixup with the relocation offset.  */
+ +     .p2align 4
+ +     /* Copy 65..128 bytes.  */
+ +L(copy128):
+ +     ldp     E_q, F_q, [src, 32]
+ +     cmp     count, 96
+ +     b.ls    L(copy96)
+ +     ldp     G_q, H_q, [srcend, -64]
+ +     stp     G_q, H_q, [dstend, -64]
+ +L(copy96):
+ +     stp     A_q, B_q, [dstin]
+ +     stp     E_q, F_q, [dstin, 32]
+ +     stp     C_q, D_q, [dstend, -32]
+ +     ret
  +
- +ElfW(Word)
- +attribute_hidden __attribute ((noinline)) ARCH_FIXUP_ATTRIBUTE
- +_dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
- +{
- +  Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type;
- +  const Elf32_Rela *reloc;
+ +     /* Align loop64 below to 16 bytes.  */
+ +     nop
  +
- +  l_addr = l->l_addr;
- +  jmprel = D_PTR(l, l_info[DT_JMPREL]);
- +  end_jmprel = jmprel + l->l_info[DT_PLTRELSZ]->d_un.d_val;
+ +     /* Copy more than 128 bytes.  */
+ +L(copy_long):
+ +     /* Copy 16 bytes and then align src to 16-byte alignment.  */
+ +     ldr     D_q, [src]
+ +     and     tmp1, src, 15
+ +     bic     src, src, 15
+ +     sub     dst, dstin, tmp1
+ +     add     count, count, tmp1      /* Count is now 16 too large.  */
+ +     ldp     A_q, B_q, [src, 16]
+ +     str     D_q, [dstin]
+ +     ldp     C_q, D_q, [src, 48]
+ +     subs    count, count, 128 + 16  /* Test and readjust count.  */
+ +     b.ls    L(copy64_from_end)
+ +L(loop64):
+ +     stp     A_q, B_q, [dst, 16]
+ +     ldp     A_q, B_q, [src, 80]
+ +     stp     C_q, D_q, [dst, 48]
+ +     ldp     C_q, D_q, [src, 112]
+ +     add     src, src, 64
+ +     add     dst, dst, 64
+ +     subs    count, count, 64
+ +     b.hi    L(loop64)
  +
- +  /* Look for the entry...  */
- +  for (iplt = jmprel; iplt < end_jmprel; iplt += sizeof (Elf32_Rela))
- +    {
- +      reloc = (const Elf32_Rela *) iplt;
- +      r_type = ELF32_R_TYPE (reloc->r_info);
+ +     /* Write the last iteration and copy 64 bytes from the end.  */
+ +L(copy64_from_end):
+ +     ldp     E_q, F_q, [srcend, -64]
+ +     stp     A_q, B_q, [dst, 16]
+ +     ldp     A_q, B_q, [srcend, -32]
+ +     stp     C_q, D_q, [dst, 48]
+ +     stp     E_q, F_q, [dstend, -64]
+ +     stp     A_q, B_q, [dstend, -32]
+ +     ret
  +
- +      if (__builtin_expect (r_type == R_PARISC_IPLT, 1)
- +       && fptr == (struct fdesc *) (reloc->r_offset + l_addr))
- +     /* Found entry. Return the reloc offset.  */
- +     return iplt - jmprel;
- +    }
+ +END (__memcpy_simd)
+ +libc_hidden_builtin_def (__memcpy_simd)
  +
- +  /* Crash if we weren't passed a valid function pointer.  */
- +  ABORT_INSTRUCTION;
- +  return 0;
- +}
- diff --git a/sysdeps/hppa/dl-trampoline.S b/sysdeps/hppa/dl-trampoline.S
- index 0114ca8b19..d0804b30c0 100644
- --- a/sysdeps/hppa/dl-trampoline.S
- +++ b/sysdeps/hppa/dl-trampoline.S
- @@ -31,7 +31,7 @@
-     slow down __cffc when it attempts to call fixup to resolve function
-     descriptor references. Please refer to gcc/gcc/config/pa/fptr.c
-  
- -   Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp.  */
- +   Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp, r22 = fp.  */
-  
-       /* RELOCATION MARKER: bl to provide gcc's __cffc with fixup loc. */
-       .text
- @@ -61,17 +61,20 @@ _dl_runtime_resolve:
-       copy    %sp, %r1        /* Copy previous sp */
-       /* Save function result address (on entry) */
-       stwm    %r28,128(%sp)
- -     /* Fillin some frame info to follow ABI */
- +     /* Fill in some frame info to follow ABI */
-       stw     %r1,-4(%sp)     /* Previous sp */
-       stw     %r21,-32(%sp)   /* PIC register value */
-  
-       /* Save input floating point registers. This must be done
-          in the new frame since the previous frame doesn't have
-          enough space */
- -     ldo     -56(%sp),%r1
- +     ldo     -64(%sp),%r1
-       fstd,ma %fr4,-8(%r1)
-       fstd,ma %fr5,-8(%r1)
-       fstd,ma %fr6,-8(%r1)
  +
- +     /* Test PA_GP_RELOC bit.  */
- +     bb,>=   %r19,31,2f              /* branch if not reloc offset */
-       fstd,ma %fr7,-8(%r1)
-  
-       /* Set up args to fixup func, needs only two arguments  */
- @@ -79,7 +82,7 @@ _dl_runtime_resolve:
-       copy    %r19,%r25               /* (2) reloc offset  */
-  
-       /* Call the real address resolver. */
- -     bl      _dl_fixup,%rp
- +3:   bl      _dl_fixup,%rp
-       copy    %r21,%r19               /* set fixup func ltp */
-  
-       /* While the linker will set a function pointer to NULL when it
- @@ -102,7 +105,7 @@ _dl_runtime_resolve:
-       copy    %r29, %r19
-  
-       /* Reload arguments fp args */
- -     ldo     -56(%sp),%r1
- +     ldo     -64(%sp),%r1
-       fldd,ma -8(%r1),%fr4
-       fldd,ma -8(%r1),%fr5
-       fldd,ma -8(%r1),%fr6
- @@ -129,6 +132,25 @@ _dl_runtime_resolve:
-       bv      %r0(%rp)
-       ldo     -128(%sp),%sp
-  
- +2:
- +     /* Set up args for _dl_fix_reloc_arg.  */
- +     copy    %r22,%r26               /* (1) function pointer */
- +     depi    0,31,2,%r26             /* clear least significant bits */
- +     ldw     8+4(%r20),%r25          /* (2) got[1] == struct link_map */
+ +ENTRY (__memmove_simd)
+ +     DELOUSE (0)
+ +     DELOUSE (1)
+ +     DELOUSE (2)
  +
- +     /* Save ltp and link map arg for _dl_fixup.  */
- +     stw     %r21,-56(%sp)           /* ltp */
- +     stw     %r25,-60(%sp)           /* struct link map */
+ +     add     srcend, src, count
+ +     add     dstend, dstin, count
+ +     cmp     count, 128
+ +     b.hi    L(move_long)
+ +     cmp     count, 32
+ +     b.hi    L(copy32_128)
  +
- +     /* Find reloc offset. */
- +     bl      _dl_fix_reloc_arg,%rp
- +     copy    %r21,%r19               /* set func ltp */
+ +     /* Small moves: 0..32 bytes.  */
+ +     cmp     count, 16
+ +     b.lo    L(copy16)
+ +     ldr     A_q, [src]
+ +     ldr     B_q, [srcend, -16]
+ +     str     A_q, [dstin]
+ +     str     B_q, [dstend, -16]
+ +     ret
  +
- +     /* Set up args for _dl_fixup.  */
- +     ldw     -56(%sp),%r21           /* ltp */
- +     ldw     -60(%sp),%r26           /* (1) struct link map */
- +     b       3b
- +     copy    %ret0,%r25              /* (2) reloc offset */
-          .EXIT
-          .PROCEND
-       cfi_endproc
- @@ -153,7 +175,7 @@ _dl_runtime_profile:
-       copy    %sp, %r1        /* Copy previous sp */
-       /* Save function result address (on entry) */
-       stwm    %r28,192(%sp)
- -     /* Fillin some frame info to follow ABI */
- +     /* Fill in some frame info to follow ABI */
-       stw     %r1,-4(%sp)     /* Previous sp */
-       stw     %r21,-32(%sp)   /* PIC register value */
-  
- @@ -181,10 +203,11 @@ _dl_runtime_profile:
-       fstd,ma %fr5,8(%r1)
-       fstd,ma %fr6,8(%r1)
-       fstd,ma %fr7,8(%r1)
- -     /* 32-bit stack pointer and return register */
- -     stw     %sp,-56(%sp)
- -     stw     %r2,-52(%sp)
-  
- +     /* Test PA_GP_RELOC bit.  */
- +     bb,>=   %r19,31,2f              /* branch if not reloc offset */
- +     /* 32-bit stack pointer */
- +     stw     %sp,-56(%sp)
-  
-       /* Set up args to fixup func, needs five arguments  */
-       ldw     8+4(%r20),%r26          /* (1) got[1] == struct link_map */
- @@ -197,7 +220,7 @@ _dl_runtime_profile:
-       stw     %r1, -52(%sp)           /* (5) long int *framesizep */
-  
-       /* Call the real address resolver. */
- -     bl      _dl_profile_fixup,%rp
- +3:   bl      _dl_profile_fixup,%rp
-       copy    %r21,%r19               /* set fixup func ltp */
-  
-       /* Load up the returned function descriptor */
- @@ -215,7 +238,9 @@ _dl_runtime_profile:
-       fldd,ma 8(%r1),%fr5
-       fldd,ma 8(%r1),%fr6
-       fldd,ma 8(%r1),%fr7
- -     ldw     -52(%sp),%rp
+ +L(move_long):
+ +     /* Only use backward copy if there is an overlap.  */
+ +     sub     tmp1, dstin, src
+ +     cbz     tmp1, L(move0)
+ +     cmp     tmp1, count
+ +     b.hs    L(copy_long)
  +
- +     /* Reload rp register -(192+20) without adjusting stack */
- +     ldw     -212(%sp),%rp
-  
-       /* Reload static link register -(192+16) without adjusting stack */
-       ldw     -208(%sp),%r29
- @@ -303,6 +328,33 @@ L(cont):
-          ldw -20(%sp),%rp
-       /* Return */
-       bv,n    0(%r2)
- +
- +2:
- +     /* Set up args for _dl_fix_reloc_arg.  */
- +     copy    %r22,%r26               /* (1) function pointer */
- +     depi    0,31,2,%r26             /* clear least significant bits */
- +     ldw     8+4(%r20),%r25          /* (2) got[1] == struct link_map */
- +
- +     /* Save ltp and link map arg for _dl_fixup.  */
- +     stw     %r21,-92(%sp)           /* ltp */
- +     stw     %r25,-116(%sp)          /* struct link map */
+ +     /* Large backwards copy for overlapping copies.
+ +        Copy 16 bytes and then align srcend to 16-byte alignment.  */
+ +L(copy_long_backwards):
+ +     ldr     D_q, [srcend, -16]
+ +     and     tmp1, srcend, 15
+ +     bic     srcend, srcend, 15
+ +     sub     count, count, tmp1
+ +     ldp     A_q, B_q, [srcend, -32]
+ +     str     D_q, [dstend, -16]
+ +     ldp     C_q, D_q, [srcend, -64]
+ +     sub     dstend, dstend, tmp1
+ +     subs    count, count, 128
+ +     b.ls    L(copy64_from_start)
  +
- +     /* Find reloc offset. */
- +     bl      _dl_fix_reloc_arg,%rp
- +     copy    %r21,%r19               /* set func ltp */
+ +L(loop64_backwards):
+ +     str     B_q, [dstend, -16]
+ +     str     A_q, [dstend, -32]
+ +     ldp     A_q, B_q, [srcend, -96]
+ +     str     D_q, [dstend, -48]
+ +     str     C_q, [dstend, -64]!
+ +     ldp     C_q, D_q, [srcend, -128]
+ +     sub     srcend, srcend, 64
+ +     subs    count, count, 64
+ +     b.hi    L(loop64_backwards)
  +
- +      /* Restore fixup ltp.  */
- +     ldw     -92(%sp),%r21           /* ltp */
+ +     /* Write the last iteration and copy 64 bytes from the start.  */
+ +L(copy64_from_start):
+ +     ldp     E_q, F_q, [src, 32]
+ +     stp     A_q, B_q, [dstend, -32]
+ +     ldp     A_q, B_q, [src]
+ +     stp     C_q, D_q, [dstend, -64]
+ +     stp     E_q, F_q, [dstin, 32]
+ +     stp     A_q, B_q, [dstin]
+ +L(move0):
+ +     ret
  +
- +     /* Set up args to fixup func, needs five arguments  */
- +     ldw     -116(%sp),%r26          /* (1) struct link map */
- +     copy    %ret0,%r25              /* (2) reloc offset  */
- +     stw     %r25,-120(%sp)          /* Save reloc offset */
- +     ldw     -212(%sp),%r24          /* (3) profile_fixup needs rp */
- +     ldo     -56(%sp),%r23           /* (4) La_hppa_regs */
- +     ldo     -112(%sp), %r1
- +     b       3b
- +     stw     %r1, -52(%sp)           /* (5) long int *framesizep */
-          .EXIT
-          .PROCEND
-       cfi_endproc
- diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
- index 8af0789a9c..4334ade2a0 100644
- --- a/sysdeps/i386/dl-machine.h
- +++ b/sysdeps/i386/dl-machine.h
- @@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
-       {
-  # ifndef RTLD_BOOTSTRAP
-         if (sym_map != map
- -           && sym_map->l_type != lt_executable
-             && !sym_map->l_relocated)
-           {
-             const char *strtab
-               = (const char *) D_PTR (map, l_info[DT_STRTAB]);
- -           _dl_error_printf ("\
- +           if (sym_map->l_type == lt_executable)
- +             _dl_fatal_printf ("\
- +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
- +and creates an unsatisfiable circular dependency.\n",
- +                               RTLD_PROGNAME, strtab + refsym->st_name,
- +                               map->l_name);
- +           else
- +             _dl_error_printf ("\
-  %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
- -                             RTLD_PROGNAME, map->l_name,
- -                             sym_map->l_name,
- -                             strtab + refsym->st_name);
- +                               RTLD_PROGNAME, map->l_name,
- +                               sym_map->l_name,
- +                               strtab + refsym->st_name);
-           }
-  # endif
-         value = ((Elf32_Addr (*) (void)) value) ();
- diff --git a/sysdeps/i386/sysdep.h b/sysdeps/i386/sysdep.h
- index b4bcd8fb6c..6094af8fec 100644
- --- a/sysdeps/i386/sysdep.h
- +++ b/sysdeps/i386/sysdep.h
- @@ -61,7 +61,7 @@ lose: SYSCALL_PIC_SETUP                                                           \
+ +END (__memmove_simd)
+ +libc_hidden_builtin_def (__memmove_simd)
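
The comment block at the top of __memcpy_simd above summarises the strategy: small copies of up to 32 bytes, medium copies of up to 128 bytes, and larger copies handled by a 16-byte-aligned, 64-byte-per-iteration loop whose tail is covered by always copying the final 64 bytes from the end. Below is a rough, portable C sketch of that dispatch; it is not the real implementation, it omits the pointer-alignment step, and fixed-size memcpy calls stand in for the Q-register load/store pairs.

#include <stddef.h>
#include <string.h>

void *
sketch_copy (void *dstin, const void *src, size_t count)
{
  unsigned char *d = dstin;
  const unsigned char *s = src;

  if (count <= 32)                       /* small: head and tail, may overlap */
    {
      if (count >= 16)
        {
          memcpy (d, s, 16);
          memcpy (d + count - 16, s + count - 16, 16);
        }
      else
        memcpy (d, s, count);            /* 0..15 bytes (branchless in asm) */
    }
  else if (count <= 128)                 /* medium: copy from both ends */
    {
      memcpy (d, s, 32);
      memcpy (d + count - 32, s + count - 32, 32);
      if (count > 64)
        {
          memcpy (d + 32, s + 32, 32);
          memcpy (d + count - 64, s + count - 64, 32);
        }
    }
  else                                   /* large: 64-byte pipelined loop */
    {
      size_t i = 0;
      while (i + 64 < count - 64)
        {
          memcpy (d + i, s + i, 64);
          i += 64;
        }
      memcpy (d + i, s + i, 64);                   /* last loop chunk */
      memcpy (d + count - 64, s + count - 64, 64); /* tail from the end */
    }
  return dstin;
}

Copying from both ends with possibly overlapping stores is what lets the small and medium cases avoid branching on the exact length, the same trick the assembly uses.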
+ diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
+ index ed5a47f6f8..46a4cb3a54 100644
+ --- a/sysdeps/aarch64/multiarch/memmove.c
+ +++ b/sysdeps/aarch64/multiarch/memmove.c
+ @@ -29,6 +29,7 @@
+  extern __typeof (__redirect_memmove) __libc_memmove;
   
-  # define SETUP_PIC_REG(reg) \
-    .ifndef GET_PC_THUNK(reg);                                               \
- -  .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits;               \
- +  .section .text.GET_PC_THUNK(reg),"axG",@progbits,GET_PC_THUNK(reg),comdat;  \
-    .globl GET_PC_THUNK(reg);                                                \
-    .hidden GET_PC_THUNK(reg);                                               \
-    .p2align 4;                                                                      \
- @@ -97,7 +97,8 @@ GET_PC_THUNK(reg):                                                        \
+  extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+ +extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
+  extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
+  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+ @@ -40,7 +41,10 @@ libc_ifunc (__libc_memmove,
+               ? __memmove_falkor
+               : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+                 ? __memmove_thunderx2
+ -               : __memmove_generic))));
+ +               : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
+ +                  || IS_NEOVERSE_V1 (midr)
+ +                  ? __memmove_simd
+ +                  : __memmove_generic)))));
   
-  # define SETUP_PIC_REG_STR(reg)                                              \
-    ".ifndef " GET_PC_THUNK_STR (reg) "\n"                             \
- -  ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
- +  ".section .text." GET_PC_THUNK_STR (reg) ",\"axG\",@progbits,"     \
- +    GET_PC_THUNK_STR (reg) ",comdat\n"                                       \
-    ".globl " GET_PC_THUNK_STR (reg) "\n"                                      \
-    ".hidden " GET_PC_THUNK_STR (reg) "\n"                             \
-    ".p2align 4\n"                                                     \
- diff --git a/sysdeps/ieee754/ldbl-96/Makefile b/sysdeps/ieee754/ldbl-96/Makefile
- index 995e90d6da..6030adf7e7 100644
- --- a/sysdeps/ieee754/ldbl-96/Makefile
- +++ b/sysdeps/ieee754/ldbl-96/Makefile
- @@ -17,5 +17,8 @@
-  # <https://www.gnu.org/licenses/>.
+  # undef memmove
+  strong_alias (__libc_memmove, memmove);
+ diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
+ index 548130e413..a8ff52c072 100644
+ --- a/sysdeps/aarch64/strcpy.S
+ +++ b/sysdeps/aarch64/strcpy.S
+ @@ -234,8 +234,13 @@ L(entry_no_page_cross):
+  #endif
+       /* calculate the loc value */
+       cmeq    datav.16b, datav.16b, #0
+ +#ifdef __AARCH64EB__
+ +     mov     data1, datav.d[1]
+ +     mov     data2, datav.d[0]
+ +#else
+       mov     data1, datav.d[0]
+       mov     data2, datav.d[1]
+ +#endif
+       cmp     data1, 0
+       csel    data1, data1, data2, ne
+       mov     pos, 8
+ diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
+ index 5981247dd9..086a5c7e99 100644
+ --- a/sysdeps/aarch64/strnlen.S
+ +++ b/sysdeps/aarch64/strnlen.S
+ @@ -154,8 +154,13 @@ L(loop_end):
+          byte.  */
   
-  ifeq ($(subdir),math)
- -tests += test-canonical-ldbl-96 test-totalorderl-ldbl-96
- +tests += test-canonical-ldbl-96 test-totalorderl-ldbl-96 test-sinl-pseudo
- +ifeq ($(have-ssp),yes)
- +CFLAGS-test-sinl-pseudo.c += -fstack-protector-all
-  endif
- +endif # $(subdir) == math
- diff --git a/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
- index 5f742321ae..bcdf20179f 100644
- --- a/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
- +++ b/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
- @@ -210,6 +210,18 @@ __ieee754_rem_pio2l (long double x, long double *y)
-        return 0;
-      }
+       cmeq    datav.16b, datav.16b, #0
+ +#ifdef __AARCH64EB__
+ +     mov     data1, datav.d[1]
+ +     mov     data2, datav.d[0]
+ +#else
+       mov     data1, datav.d[0]
+       mov     data2, datav.d[1]
+ +#endif
+       cmp     data1, 0
+       csel    data1, data1, data2, ne
+       sub     len, src, srcin
+ diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h
+ index 604c489170..f1feb19dc7 100644
+ --- a/sysdeps/aarch64/sysdep.h
+ +++ b/sysdeps/aarch64/sysdep.h
+ @@ -45,7 +45,7 @@
+  #define ENTRY(name)                                          \
+    .globl C_SYMBOL_NAME(name);                                        \
+    .type C_SYMBOL_NAME(name),%function;                               \
+ -  .align 4;                                                  \
+ +  .p2align 6;                                                        \
+    C_LABEL(name)                                                      \
+    cfi_startproc;                                             \
+    CALL_MCOUNT
+ diff --git a/sysdeps/arm/armv7/multiarch/memcpy_impl.S b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+ index bf4ac7077f..379bb56fc9 100644
+ --- a/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+ +++ b/sysdeps/arm/armv7/multiarch/memcpy_impl.S
+ @@ -268,7 +268,7 @@ ENTRY(memcpy)
   
- +  if ((i0 & 0x80000000) == 0)
- +    {
- +      /* Pseudo-zero and unnormal representations are not valid
- +      representations of long double.  We need to avoid stack
- +      corruption in __kernel_rem_pio2, which expects input in a
- +      particular normal form, but those representations do not need
- +      to be consistently handled like any particular floating-point
- +      value.  */
- +      y[1] = y[0] = __builtin_nanl ("");
- +      return 0;
- +    }
- +
-    /* Split the 64 bits of the mantissa into three 24-bit integers
-       stored in a double array.  */
-    exp = j0 - 23;
- diff --git a/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c b/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c
- new file mode 100644
- index 0000000000..f59b97769d
- --- /dev/null
- +++ b/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c
- @@ -0,0 +1,41 @@
- +/* Test sinl for pseudo-zeros and unnormals for ldbl-96 (bug 25487).
- +   Copyright (C) 2020 Free Software Foundation, Inc.
- +   This file is part of the GNU C Library.
- +
- +   The GNU C Library is free software; you can redistribute it and/or
- +   modify it under the terms of the GNU Lesser General Public
- +   License as published by the Free Software Foundation; either
- +   version 2.1 of the License, or (at your option) any later version.
- +
- +   The GNU C Library is distributed in the hope that it will be useful,
- +   but WITHOUT ANY WARRANTY; without even the implied warranty of
- +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- +   Lesser General Public License for more details.
- +
- +   You should have received a copy of the GNU Lesser General Public
- +   License along with the GNU C Library; if not, see
- +   <https://www.gnu.org/licenses/>.  */
- +
- +#include <math.h>
- +#include <math_ldbl.h>
- +#include <stdint.h>
- +
- +static int
- +do_test (void)
- +{
- +  for (int i = 0; i < 64; i++)
- +    {
- +      uint64_t sig = i == 63 ? 0 : 1ULL << i;
- +      long double ld;
- +      SET_LDOUBLE_WORDS (ld, 0x4141,
- +                      sig >> 32, sig & 0xffffffffULL);
- +      /* The requirement is that no stack overflow occurs when the
- +      pseudo-zero or unnormal goes through range reduction.  */
- +      volatile long double ldr;
- +      ldr = sinl (ld);
- +      (void) ldr;
- +    }
- +  return 0;
- +}
- +
- +#include <support/test-driver.c>
- diff --git a/sysdeps/posix/system.c b/sysdeps/posix/system.c
- index e613e6a344..a03f478fc7 100644
- --- a/sysdeps/posix/system.c
- +++ b/sysdeps/posix/system.c
- @@ -101,7 +101,8 @@ cancel_handler (void *arg)
-  static int
-  do_system (const char *line)
-  {
- -  int status;
- +  int status = -1;
- +  int ret;
-    pid_t pid;
-    struct sigaction sa;
-  #ifndef _LIBC_REENTRANT
- @@ -144,14 +145,14 @@ do_system (const char *line)
-    __posix_spawnattr_setflags (&spawn_attr,
-                             POSIX_SPAWN_SETSIGDEF | POSIX_SPAWN_SETSIGMASK);
+       mov     dst, dstin      /* Preserve dstin, we need to return it.  */
+       cmp     count, #64
+ -     bge     .Lcpy_not_short
+ +     bhs     .Lcpy_not_short
+       /* Deal with small copies quickly by dropping straight into the
+          exit block.  */
   
- -  status = __posix_spawn (&pid, SHELL_PATH, 0, &spawn_attr,
- -                       (char *const[]){ (char*) SHELL_NAME,
- -                                        (char*) "-c",
- -                                        (char *) line, NULL },
- -                       __environ);
- +  ret = __posix_spawn (&pid, SHELL_PATH, 0, &spawn_attr,
- +                    (char *const[]){ (char *) SHELL_NAME,
- +                                     (char *) "-c",
- +                                     (char *) line, NULL },
- +                    __environ);
-    __posix_spawnattr_destroy (&spawn_attr);
+ @@ -351,10 +351,10 @@ ENTRY(memcpy)
   
- -  if (status == 0)
- +  if (ret == 0)
-      {
-        /* Cancellation results in cleanup handlers running as exceptions in
-        the block where they were installed, so it is safe to reference
- @@ -186,6 +187,9 @@ do_system (const char *line)
-      }
-    DO_UNLOCK ();
+  1:
+       subs    tmp2, count, #64        /* Use tmp2 for count.  */
+ -     blt     .Ltail63aligned
+ +     blo     .Ltail63aligned
   
- +  if (ret != 0)
- +    __set_errno (ret);
- +
-    return status;
-  }
+       cmp     tmp2, #512
+ -     bge     .Lcpy_body_long
+ +     bhs     .Lcpy_body_long
   
- diff --git a/sysdeps/powerpc/powerpc32/sysdep.h b/sysdeps/powerpc/powerpc32/sysdep.h
- index 2ba009e919..829eec266a 100644
- --- a/sysdeps/powerpc/powerpc32/sysdep.h
- +++ b/sysdeps/powerpc/powerpc32/sysdep.h
- @@ -179,8 +179,8 @@ GOT_LABEL:                        ;                                             \
-  #else
-  /* Position-dependent code does not require access to the GOT.  */
-  # define __GLRO(rOUT, rGOT, member, offset)                          \
- -     lis     rOUT,(member+LOWORD)@ha;                                        \
- -     lwz     rOUT,(member+LOWORD)@l(rOUT)
- +     lis     rOUT,(member)@ha;                                       \
- +     lwz     rOUT,(member)@l(rOUT)
-  #endif       /* PIC */
+  .Lcpy_body_medium:                   /* Count in tmp2.  */
+  #ifdef USE_VFP
+ @@ -378,7 +378,7 @@ ENTRY(memcpy)
+       add     src, src, #64
+       vstr    d1, [dst, #56]
+       add     dst, dst, #64
+ -     bge     1b
+ +     bhs     1b
+       tst     tmp2, #0x3f
+       beq     .Ldone
   
-  #endif       /* __ASSEMBLER__ */
- diff --git a/sysdeps/powerpc/powerpc64/backtrace.c b/sysdeps/powerpc/powerpc64/backtrace.c
- index 8a53a1088f..362a2b713c 100644
- --- a/sysdeps/powerpc/powerpc64/backtrace.c
- +++ b/sysdeps/powerpc/powerpc64/backtrace.c
- @@ -54,11 +54,22 @@ struct signal_frame_64 {
-    /* We don't care about the rest, since the IP value is at 'uc' field.  */
-  };
+ @@ -412,7 +412,7 @@ ENTRY(memcpy)
+       ldrd    A_l, A_h, [src, #64]!
+       strd    A_l, A_h, [dst, #64]!
+       subs    tmp2, tmp2, #64
+ -     bge     1b
+ +     bhs     1b
+       tst     tmp2, #0x3f
+       bne     1f
+       ldr     tmp2,[sp], #FRAME_SIZE
+ @@ -482,7 +482,7 @@ ENTRY(memcpy)
+       add     src, src, #32
   
- +/* Test if the address match to the inside the trampoline code.
- +   Up to and including kernel 5.8, returning from an interrupt or syscall to a
- +   signal handler starts execution directly at the handler's entry point, with
- +   LR set to address of the sigreturn trampoline (the vDSO symbol).
- +   Newer kernels will branch to signal handler from the trampoline instead, so
- +   checking the stacktrace against the vDSO entrypoint does not work in such
- +   case.
- +   The vDSO branches with a 'bctrl' instruction, so checking either the
- +   vDSO address itself and the next instruction should cover all kernel
- +   versions.  */
-  static inline bool
-  is_sigtramp_address (void *nip)
-  {
-  #ifdef HAVE_SIGTRAMP_RT64
- -  if (nip == GLRO (dl_vdso_sigtramp_rt64))
- +  if (nip == GLRO (dl_vdso_sigtramp_rt64) ||
- +      nip == GLRO (dl_vdso_sigtramp_rt64) + 4)
-      return true;
+       subs    tmp2, tmp2, #prefetch_lines * 64 * 2
+ -     blt     2f
+ +     blo     2f
+  1:
+       cpy_line_vfp    d3, 0
+       cpy_line_vfp    d4, 64
+ @@ -494,7 +494,7 @@ ENTRY(memcpy)
+       add     dst, dst, #2 * 64
+       add     src, src, #2 * 64
+       subs    tmp2, tmp2, #prefetch_lines * 64
+ -     bge     1b
+ +     bhs     1b
+  
+  2:
+       cpy_tail_vfp    d3, 0
+ @@ -615,8 +615,8 @@ ENTRY(memcpy)
+  1:
+       pld     [src, #(3 * 64)]
+       subs    count, count, #64
+ -     ldrmi   tmp2, [sp], #FRAME_SIZE
+ -     bmi     .Ltail63unaligned
+ +     ldrlo   tmp2, [sp], #FRAME_SIZE
+ +     blo     .Ltail63unaligned
+       pld     [src, #(4 * 64)]
+  
+  #ifdef USE_NEON
+ @@ -633,7 +633,7 @@ ENTRY(memcpy)
+       neon_load_multi d0-d3, src
+       neon_load_multi d4-d7, src
+       subs    count, count, #64
+ -     bmi     2f
+ +     blo     2f
+  1:
+       pld     [src, #(4 * 64)]
+       neon_store_multi d0-d3, dst
+ @@ -641,7 +641,7 @@ ENTRY(memcpy)
+       neon_store_multi d4-d7, dst
+       neon_load_multi d4-d7, src
+       subs    count, count, #64
+ -     bpl     1b
+ +     bhs     1b
+  2:
+       neon_store_multi d0-d3, dst
+       neon_store_multi d4-d7, dst
+ diff --git a/sysdeps/arm/be/nofpu/Implies b/sysdeps/arm/be/nofpu/Implies
+ new file mode 100644
+ index 0000000000..c90dd7fd5c
+ --- /dev/null
+ +++ b/sysdeps/arm/be/nofpu/Implies
+ @@ -0,0 +1 @@
+ +arm/nofpu
+ diff --git a/sysdeps/arm/le/nofpu/Implies b/sysdeps/arm/le/nofpu/Implies
+ new file mode 100644
+ index 0000000000..c90dd7fd5c
+ --- /dev/null
+ +++ b/sysdeps/arm/le/nofpu/Implies
+ @@ -0,0 +1 @@
+ +arm/nofpu
+ diff --git a/sysdeps/arm/memcpy.S b/sysdeps/arm/memcpy.S
+ index 510e8adaf2..bcfbc51d99 100644
+ --- a/sysdeps/arm/memcpy.S
+ +++ b/sysdeps/arm/memcpy.S
+ @@ -68,7 +68,7 @@ ENTRY(memcpy)
+               cfi_remember_state
+  
+               subs    r2, r2, #4
+ -             blt     8f
+ +             blo     8f
+               ands    ip, r0, #3
+       PLD(    pld     [r1, #0]                )
+               bne     9f
+ @@ -82,7 +82,7 @@ ENTRY(memcpy)
+               cfi_rel_offset (r6, 4)
+               cfi_rel_offset (r7, 8)
+               cfi_rel_offset (r8, 12)
+ -             blt     5f
+ +             blo     5f
+  
+       CALGN(  ands    ip, r1, #31             )
+       CALGN(  rsb     r3, ip, #32             )
+ @@ -98,9 +98,9 @@ ENTRY(memcpy)
   #endif
-    return false;
- diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure
- index fa46e9e351..e7f576338d 100644
- --- a/sysdeps/s390/configure
- +++ b/sysdeps/s390/configure
- @@ -123,7 +123,9 @@ void testinsn (char *buf)
-      __asm__ (".machine \"arch13\" \n\t"
-            ".machinemode \"zarch_nohighgprs\" \n\t"
-            "lghi %%r0,16 \n\t"
- -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
- +          "mvcrl 0(%0),32(%0) \n\t"
- +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
- +          : : "a" (buf) : "memory", "r0");
-  }
-  EOF
-  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
- @@ -271,7 +273,9 @@ else
-  void testinsn (char *buf)
-  {
-      __asm__ ("lghi %%r0,16 \n\t"
- -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
- +          "mvcrl 0(%0),32(%0) \n\t"
- +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
- +          : : "a" (buf) : "memory", "r0");
-  }
-  EOF
-  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
- diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac
- index 3ed5a8ef87..5c3479e8cf 100644
- --- a/sysdeps/s390/configure.ac
- +++ b/sysdeps/s390/configure.ac
- @@ -88,7 +88,9 @@ void testinsn (char *buf)
-      __asm__ (".machine \"arch13\" \n\t"
-            ".machinemode \"zarch_nohighgprs\" \n\t"
-            "lghi %%r0,16 \n\t"
- -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
- +          "mvcrl 0(%0),32(%0) \n\t"
- +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
- +          : : "a" (buf) : "memory", "r0");
-  }
-  EOF
-  dnl test, if assembler supports S390 arch13 instructions
- @@ -195,7 +197,9 @@ cat > conftest.c <<\EOF
-  void testinsn (char *buf)
-  {
-      __asm__ ("lghi %%r0,16 \n\t"
- -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
- +          "mvcrl 0(%0),32(%0) \n\t"
- +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
- +          : : "a" (buf) : "memory", "r0");
-  }
-  EOF
-  dnl test, if assembler supports S390 arch13 zarch instructions as default
- diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c
- index 5fc85e129f..ee59b5de14 100644
- --- a/sysdeps/s390/memmove.c
- +++ b/sysdeps/s390/memmove.c
- @@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden;
-  s390_libc_ifunc_expr (__redirect_memmove, memmove,
-                     ({
-                       s390_libc_ifunc_expr_stfle_init ();
- -                     (HAVE_MEMMOVE_ARCH13
- +                     (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2)
-                        && S390_IS_ARCH13_MIE3 (stfle_bits))
-                         ? MEMMOVE_ARCH13
-                         : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX))
- diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
- index e6195c6e26..17c0cc3952 100644
- --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
- +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
- @@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-      IFUNC_IMPL (i, name, memmove,
-  # if HAVE_MEMMOVE_ARCH13
-               IFUNC_IMPL_ADD (array, i, memmove,
- -                             S390_IS_ARCH13_MIE3 (stfle_bits),
- +                             ((dl_hwcap & HWCAP_S390_VXRS_EXT2)
- +                              && S390_IS_ARCH13_MIE3 (stfle_bits)),
-                               MEMMOVE_ARCH13)
-  # endif
-  # if HAVE_MEMMOVE_Z13
- diff --git a/sysdeps/sh/be/sh4/fpu/Implies b/sysdeps/sh/be/sh4/fpu/Implies
- new file mode 100644
- index 0000000000..71b28ee1a4
- --- /dev/null
- +++ b/sysdeps/sh/be/sh4/fpu/Implies
- @@ -0,0 +1 @@
- +sh/sh4/fpu
- diff --git a/sysdeps/sh/le/sh4/fpu/Implies b/sysdeps/sh/le/sh4/fpu/Implies
- new file mode 100644
- index 0000000000..71b28ee1a4
- --- /dev/null
- +++ b/sysdeps/sh/le/sh4/fpu/Implies
- @@ -0,0 +1 @@
- +sh/sh4/fpu
- diff --git a/sysdeps/unix/make-syscalls.sh b/sysdeps/unix/make-syscalls.sh
- index c07626677f..4f6c3490a2 100644
- --- a/sysdeps/unix/make-syscalls.sh
- +++ b/sysdeps/unix/make-syscalls.sh
- @@ -30,6 +30,7 @@
-  # P: optionally-NULL pointer to typed object (e.g., 3rd argument to sigaction)
-  # s: non-NULL string (e.g., 1st arg to open)
-  # S: optionally-NULL string (e.g., 1st arg to acct)
- +# U: unsigned long int (32-bit types are zero-extended to 64-bit types)
-  # v: vararg scalar (e.g., optional 3rd arg to open)
-  # V: byte-per-page vector (3rd arg to mincore)
-  # W: wait status, optionally-NULL pointer to int (e.g., 2nd arg of wait4)
- @@ -184,6 +185,27 @@ while read file srcfile caller syscall args strong weak; do
-    ?:?????????) nargs=9;;
-    esac
   
- +  # Derive the unsigned long int arguments from the argument signature
- +  ulong_arg_1=0
- +  ulong_arg_2=0
- +  ulong_count=0
- +  for U in $(echo $args | sed -e "s/.*:/:/" | grep -ob U)
- +  do
- +    ulong_count=$(expr $ulong_count + 1)
- +    ulong_arg=$(echo $U | sed -e "s/:U//")
- +    case $ulong_count in
- +    1)
- +      ulong_arg_1=$ulong_arg
- +      ;;
- +    2)
- +      ulong_arg_2=$ulong_arg
- +      ;;
- +    *)
- +      echo >&2 "$0: Too many unsigned long int arguments for syscall ($strong $weak)"
- +      exit 2
- +    esac
- +  done
- +
-    # Make sure only the first syscall rule is used, if multiple dirs
-    # define the same syscall.
-    echo ''
- @@ -245,6 +267,8 @@ while read file srcfile caller syscall args strong weak; do
-       \$(make-target-directory)
-       (echo '#define SYSCALL_NAME $syscall'; \\
-        echo '#define SYSCALL_NARGS $nargs'; \\
- +      echo '#define SYSCALL_ULONG_ARG_1 $ulong_arg_1'; \\
- +      echo '#define SYSCALL_ULONG_ARG_2 $ulong_arg_2'; \\
-        echo '#define SYSCALL_SYMBOL $strong'; \\
-        echo '#define SYSCALL_NOERRNO $noerrno'; \\
-        echo '#define SYSCALL_ERRVAL $errval'; \\
- diff --git a/sysdeps/unix/syscall-template.S b/sysdeps/unix/syscall-template.S
- index cf6c7a58fb..f807a8603f 100644
- --- a/sysdeps/unix/syscall-template.S
- +++ b/sysdeps/unix/syscall-template.S
- @@ -25,6 +25,12 @@
-     defining a few macros:
-       SYSCALL_NAME            syscall name
-       SYSCALL_NARGS           number of arguments this call takes
- +     SYSCALL_ULONG_ARG_1     the first unsigned long int argument this
- +                             call takes.  0 means that there are no
- +                             unsigned long int arguments.
- +     SYSCALL_ULONG_ARG_2     the second unsigned long int argument this
- +                             call takes.  0 means that there is at most
- +                             one unsigned long int argument.
-       SYSCALL_SYMBOL          primary symbol name
-       SYSCALL_NOERRNO         1 to define a no-errno version (see below)
-       SYSCALL_ERRVAL          1 to define an error-value version (see below)
- @@ -44,9 +50,31 @@
-  /* This indirection is needed so that SYMBOL gets macro-expanded.  */
-  #define syscall_hidden_def(SYMBOL)           hidden_def (SYMBOL)
+       PLD(    pld     [r1, #0]                )
+ -2:   PLD(    subs    r2, r2, #96             )
+ +2:   PLD(    cmp     r2, #96                 )
+       PLD(    pld     [r1, #28]               )
+ -     PLD(    blt     4f                      )
+ +     PLD(    blo     4f                      )
+       PLD(    pld     [r1, #60]               )
+       PLD(    pld     [r1, #92]               )
   
- -#define T_PSEUDO(SYMBOL, NAME, N)            PSEUDO (SYMBOL, NAME, N)
- -#define T_PSEUDO_NOERRNO(SYMBOL, NAME, N)    PSEUDO_NOERRNO (SYMBOL, NAME, N)
- -#define T_PSEUDO_ERRVAL(SYMBOL, NAME, N)     PSEUDO_ERRVAL (SYMBOL, NAME, N)
- +/* If PSEUDOS_HAVE_ULONG_INDICES is defined, PSEUDO and T_PSEUDO macros
- +   have 2 extra arguments for unsigned long int arguments:
- +     Extra argument 1: Position of the first unsigned long int argument.
- +     Extra argument 2: Position of the second unsigned long int argument.
- + */
- +#ifndef PSEUDOS_HAVE_ULONG_INDICES
- +# undef SYSCALL_ULONG_ARG_1
- +# define SYSCALL_ULONG_ARG_1 0
- +#endif
- +
- +#if SYSCALL_ULONG_ARG_1
- +# define T_PSEUDO(SYMBOL, NAME, N, U1, U2) \
- +  PSEUDO (SYMBOL, NAME, N, U1, U2)
- +# define T_PSEUDO_NOERRNO(SYMBOL, NAME, N, U1, U2) \
- +  PSEUDO_NOERRNO (SYMBOL, NAME, N, U1, U2)
- +# define T_PSEUDO_ERRVAL(SYMBOL, NAME, N, U1, U2) \
- +  PSEUDO_ERRVAL (SYMBOL, NAME, N, U1, U2)
- +#else
- +# define T_PSEUDO(SYMBOL, NAME, N) \
- +  PSEUDO (SYMBOL, NAME, N)
- +# define T_PSEUDO_NOERRNO(SYMBOL, NAME, N) \
- +  PSEUDO_NOERRNO (SYMBOL, NAME, N)
- +# define T_PSEUDO_ERRVAL(SYMBOL, NAME, N) \
- +  PSEUDO_ERRVAL (SYMBOL, NAME, N)
- +#endif
-  #define T_PSEUDO_END(SYMBOL)                 PSEUDO_END (SYMBOL)
-  #define T_PSEUDO_END_NOERRNO(SYMBOL)         PSEUDO_END_NOERRNO (SYMBOL)
-  #define T_PSEUDO_END_ERRVAL(SYMBOL)          PSEUDO_END_ERRVAL (SYMBOL)
- @@ -56,7 +84,12 @@
-  /* This kind of system call stub never returns an error.
-     We return the return value register to the caller unexamined.  */
+ @@ -108,9 +108,7 @@ ENTRY(memcpy)
+  4:           ldmia   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+               subs    r2, r2, #32
+               stmia   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+ -             bge     3b
+ -     PLD(    cmn     r2, #96                 )
+ -     PLD(    bge     4b                      )
+ +             bhs     3b
   
- +# if SYSCALL_ULONG_ARG_1
- +T_PSEUDO_NOERRNO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
- +               SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
- +# else
-  T_PSEUDO_NOERRNO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
- +# endif
-       ret_NOERRNO
-  T_PSEUDO_END_NOERRNO (SYSCALL_SYMBOL)
+  5:           ands    ip, r2, #28
+               rsb     ip, ip, #32
+ @@ -222,7 +220,7 @@ ENTRY(memcpy)
+               strbge  r4, [r0], #1
+               subs    r2, r2, ip
+               strb    lr, [r0], #1
+ -             blt     8b
+ +             blo     8b
+               ands    ip, r1, #3
+               beq     1b
   
- @@ -66,7 +99,12 @@ T_PSEUDO_END_NOERRNO (SYSCALL_SYMBOL)
-     value, or zero for success.  We may massage the kernel's return value
-     to meet that ABI, but we never set errno here.  */
+ @@ -236,7 +234,7 @@ ENTRY(memcpy)
+               .macro  forward_copy_shift pull push
   
- +# if SYSCALL_ULONG_ARG_1
- +T_PSEUDO_ERRVAL (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
- +              SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
- +# else
-  T_PSEUDO_ERRVAL (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
- +# endif
-       ret_ERRVAL
-  T_PSEUDO_END_ERRVAL (SYSCALL_SYMBOL)
+               subs    r2, r2, #28
+ -             blt     14f
+ +             blo     14f
   
- @@ -75,7 +113,12 @@ T_PSEUDO_END_ERRVAL (SYSCALL_SYMBOL)
-  /* This is a "normal" system call stub: if there is an error,
-     it returns -1 and sets errno.  */
+       CALGN(  ands    ip, r1, #31             )
+       CALGN(  rsb     ip, ip, #32             )
+ @@ -253,9 +251,9 @@ ENTRY(memcpy)
+               cfi_rel_offset (r10, 16)
   
- +# if SYSCALL_ULONG_ARG_1
- +T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
- +       SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
- +# else
-  T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
- +# endif
-       ret
-  T_PSEUDO_END (SYSCALL_SYMBOL)
+       PLD(    pld     [r1, #0]                )
+ -     PLD(    subs    r2, r2, #96             )
+ +     PLD(    cmp     r2, #96                 )
+       PLD(    pld     [r1, #28]               )
-     PLD(    blt     13f                     )
+ +     PLD(    blo     13f                     )
+       PLD(    pld     [r1, #60]               )
+       PLD(    pld     [r1, #92]               )
   
- diff --git a/sysdeps/unix/syscalls.list b/sysdeps/unix/syscalls.list
- index e28e801c7a..6b22b2cb45 100644
- --- a/sysdeps/unix/syscalls.list
- +++ b/sysdeps/unix/syscalls.list
- @@ -39,27 +39,27 @@ kill              -       kill            i:ii    __kill          kill
-  link         -       link            i:ss    __link          link
-  listen               -       listen          i:ii    __listen        listen
-  lseek                -       lseek           i:iii   __libc_lseek    __lseek lseek
- -madvise              -       madvise         i:pii   __madvise       madvise
- +madvise              -       madvise         i:pUi   __madvise       madvise
-  mkdir                -       mkdir           i:si    __mkdir         mkdir
- -mmap         -       mmap            b:aniiii __mmap         mmap
- -mprotect     -       mprotect        i:aii   __mprotect      mprotect
- -munmap               -       munmap          i:ai    __munmap        munmap
- +mmap         -       mmap            b:aUiiii __mmap         mmap
- +mprotect     -       mprotect        i:aUi   __mprotect      mprotect
- +munmap               -       munmap          i:aU    __munmap        munmap
-  open         -       open            Ci:siv  __libc_open __open open
-  profil               -       profil          i:piii  __profil        profil
-  ptrace               -       ptrace          i:iiii  ptrace
- -read         -       read            Ci:ibn  __libc_read     __read read
- -readlink     -       readlink        i:spi   __readlink      readlink
- +read         -       read            Ci:ibU  __libc_read     __read read
- +readlink     -       readlink        i:spU   __readlink      readlink
-  readv                -       readv           Ci:ipi  __readv         readv
-  reboot               -       reboot          i:i     reboot
- -recv         -       recv            Ci:ibni __libc_recv     recv
- -recvfrom     -       recvfrom        Ci:ibniBN       __libc_recvfrom __recvfrom recvfrom
- +recv         -       recv            Ci:ibUi __libc_recv     recv
- +recvfrom     -       recvfrom        Ci:ibUiBN       __libc_recvfrom __recvfrom recvfrom
-  recvmsg              -       recvmsg         Ci:ipi  __libc_recvmsg  __recvmsg recvmsg
-  rename               -       rename          i:ss    rename
-  rmdir                -       rmdir           i:s     __rmdir         rmdir
-  select               -       select          Ci:iPPPP        __select        __libc_select select
- -send         -       send            Ci:ibni __libc_send     __send send
- +send         -       send            Ci:ibUi __libc_send     __send send
-  sendmsg              -       sendmsg         Ci:ipi  __libc_sendmsg  __sendmsg sendmsg
- -sendto               -       sendto          Ci:ibnibn       __libc_sendto   __sendto sendto
- +sendto               -       sendto          Ci:ibUibn       __libc_sendto   __sendto sendto
-  setdomain    -       setdomainname   i:si    setdomainname
-  setegid              -       setegid         i:i     __setegid       setegid
-  seteuid              -       seteuid         i:i     __seteuid       seteuid
- @@ -94,5 +94,5 @@ uname               -       uname           i:p     __uname         uname
-  unlink               -       unlink          i:s     __unlink        unlink
-  utimes               -       utimes          i:sp    __utimes        utimes
-  vhangup              -       vhangup         i:i     vhangup
- -write                -       write           Ci:ibn  __libc_write    __write write
- +write                -       write           Ci:ibU  __libc_write    __write write
-  writev               -       writev          Ci:ipi  __writev        writev
- diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
- index f12b7b1a2d..5fbde369c3 100644
- --- a/sysdeps/unix/sysv/linux/Makefile
- +++ b/sysdeps/unix/sysv/linux/Makefile
- @@ -60,7 +60,9 @@ sysdep_routines += adjtimex clone umount umount2 readahead \
-                  setfsuid setfsgid epoll_pwait signalfd \
-                  eventfd eventfd_read eventfd_write prlimit \
-                  personality epoll_wait tee vmsplice splice \
- -                open_by_handle_at mlock2 pkey_mprotect pkey_set pkey_get
- +                open_by_handle_at mlock2 pkey_mprotect pkey_set pkey_get \
- +                prctl \
- +                process_vm_readv process_vm_writev
+ @@ -280,9 +278,7 @@ ENTRY(memcpy)
+               mov     ip, ip, PULL #\pull
+               orr     ip, ip, lr, PUSH #\push
+               stmia   r0!, {r3, r4, r5, r6, r7, r8, r10, ip}
+ -             bge     12b
+ -     PLD(    cmn     r2, #96                 )
+ -     PLD(    bge     13b                     )
+ +             bhs     12b
   
-  CFLAGS-gethostid.c = -fexceptions
-  CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
- diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
- index 9378387747..c8471947b9 100644
- --- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
- +++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
- @@ -17,6 +17,7 @@
-  #define __NR_clock_nanosleep 115
-  #define __NR_clock_settime 112
-  #define __NR_clone 220
- +#define __NR_clone3 435
-  #define __NR_close 57
-  #define __NR_connect 203
-  #define __NR_copy_file_range 285
- diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
- index 1389cea1b3..346d045fb4 100644
- --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
- +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
- @@ -51,8 +51,12 @@
+               pop     {r5 - r8, r10}
+               cfi_adjust_cfa_offset (-20)
+ diff --git a/sysdeps/arm/memmove.S b/sysdeps/arm/memmove.S
+ index 954037ef3a..0d07b76ee6 100644
+ --- a/sysdeps/arm/memmove.S
+ +++ b/sysdeps/arm/memmove.S
+ @@ -85,7 +85,7 @@ ENTRY(memmove)
+               add     r1, r1, r2
+               add     r0, r0, r2
+               subs    r2, r2, #4
+ -             blt     8f
+ +             blo     8f
+               ands    ip, r0, #3
+       PLD(    pld     [r1, #-4]               )
+               bne     9f
+ @@ -99,7 +99,7 @@ ENTRY(memmove)
+               cfi_rel_offset (r6, 4)
+               cfi_rel_offset (r7, 8)
+               cfi_rel_offset (r8, 12)
+ -             blt     5f
+ +             blo     5f
   
-  #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'                             \
-                          && MIDR_PARTNUM(midr) == 0x000)
- -#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                       \
- -                     && MIDR_PARTNUM(midr) == 0xd0c)
- +#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
- +                           && MIDR_PARTNUM(midr) == 0xd0c)
- +#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
- +                           && MIDR_PARTNUM(midr) == 0xd49)
- +#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
- +                           && MIDR_PARTNUM(midr) == 0xd40)
+       CALGN(  ands    ip, r1, #31             )
+       CALGN(  sbcsne  r4, ip, r2              )  @ C is always set here
+ @@ -114,9 +114,9 @@ ENTRY(memmove)
+  #endif
   
-  #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P'                       \
-                         && MIDR_PARTNUM(midr) == 0x000)
- diff --git a/sysdeps/unix/sysv/linux/aarch64/localplt.data b/sysdeps/unix/sysv/linux/aarch64/localplt.data
- index a60053b914..08af68b5e8 100644
- --- a/sysdeps/unix/sysv/linux/aarch64/localplt.data
- +++ b/sysdeps/unix/sysv/linux/aarch64/localplt.data
- @@ -7,6 +7,9 @@ libc.so: malloc
-  libc.so: memalign
-  libc.so: realloc
-  libm.so: matherr
- +# If outline atomics are used, libgcc (built outside of glibc) may
- +# call __getauxval using the PLT.
- +libc.so: __getauxval ?
-  # The dynamic loader needs __tls_get_addr for TLS.
-  ld.so: __tls_get_addr
-  # The main malloc is interposed into the dynamic linker, for
- diff --git a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
- index 9d8ffbe860..bf61b66b70 100644
- --- a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
- +++ b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
- @@ -36,9 +36,37 @@ typedef uintptr_t uatomicptr_t;
-  typedef intmax_t atomic_max_t;
-  typedef uintmax_t uatomic_max_t;
+       PLD(    pld     [r1, #-4]               )
+ -2:   PLD(    subs    r2, r2, #96             )
+ +2:   PLD(    cmp     r2, #96                 )
+       PLD(    pld     [r1, #-32]              )
+ -     PLD(    blt     4f                      )
+ +     PLD(    blo     4f                      )
+       PLD(    pld     [r1, #-64]              )
+       PLD(    pld     [r1, #-96]              )
   
- +#define atomic_full_barrier() __sync_synchronize ()
- +
-  #define __HAVE_64B_ATOMICS 0
-  #define USE_ATOMIC_COMPILER_BUILTINS 0
+ @@ -124,9 +124,7 @@ ENTRY(memmove)
+  4:           ldmdb   r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+               subs    r2, r2, #32
+               stmdb   r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+ -             bge     3b
+ -     PLD(    cmn     r2, #96                 )
+ -     PLD(    bge     4b                      )
+ +             bhs     3b
   
- +/* We use the compiler atomic load and store builtins as the generic
- +   defines are not atomic.  In particular, we need to use compare and
- +   exchange for stores as the implementation is synthesized.  */
- +void __atomic_link_error (void);
- +#define __atomic_check_size_ls(mem) \
- + if ((sizeof (*mem) != 1) && (sizeof (*mem) != 2) && sizeof (*mem) != 4)    \
- +   __atomic_link_error ();
- +
- +#define atomic_load_relaxed(mem) \
- + ({ __atomic_check_size_ls((mem));                                           \
- +    __atomic_load_n ((mem), __ATOMIC_RELAXED); })
- +#define atomic_load_acquire(mem) \
- + ({ __atomic_check_size_ls((mem));                                           \
- +    __atomic_load_n ((mem), __ATOMIC_ACQUIRE); })
- +
- +#define atomic_store_relaxed(mem, val) \
- + do {                                                                        \
- +   __atomic_check_size_ls((mem));                                            \
- +   __atomic_store_n ((mem), (val), __ATOMIC_RELAXED);                        \
- + } while (0)
- +#define atomic_store_release(mem, val) \
- + do {                                                                        \
- +   __atomic_check_size_ls((mem));                                            \
- +   __atomic_store_n ((mem), (val), __ATOMIC_RELEASE);                        \
- + } while (0)
- +
-  /* XXX Is this actually correct?  */
-  #define ATOMIC_EXCHANGE_USES_CAS 1
+  5:           ands    ip, r2, #28
+               rsb     ip, ip, #32
+ @@ -237,7 +235,7 @@ ENTRY(memmove)
+               strbge  r4, [r0, #-1]!
+               subs    r2, r2, ip
+               strb    lr, [r0, #-1]!
+ -             blt     8b
+ +             blo     8b
+               ands    ip, r1, #3
+               beq     1b
   
- diff --git a/sysdeps/unix/sysv/linux/microblaze/sysdep.h b/sysdeps/unix/sysv/linux/microblaze/sysdep.h
- index ed873d9dd4..796663a23a 100644
- --- a/sysdeps/unix/sysv/linux/microblaze/sysdep.h
- +++ b/sysdeps/unix/sysv/linux/microblaze/sysdep.h
- @@ -209,8 +209,8 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+ @@ -251,7 +249,7 @@ ENTRY(memmove)
+               .macro  backward_copy_shift push pull
   
-  # define inline_syscall0(name,dummy)                                          \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r12)                                                            \
- @@ -219,9 +219,10 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+               subs    r2, r2, #28
+ -             blt     14f
+ +             blo     14f
   
-  # define inline_syscall1(name,arg1)                                           \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r12)                                                 \
- @@ -230,10 +231,12 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+       CALGN(  ands    ip, r1, #31             )
+       CALGN(  rsb     ip, ip, #32             )
+ @@ -268,9 +266,9 @@ ENTRY(memmove)
+               cfi_rel_offset (r10, 16)
   
-  # define inline_syscall2(name,arg1,arg2)                                      \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- -    register long __r6 __asm__("r6") = (long)(arg2);                          \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    long int __arg2 = (long int) (arg2);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
- +    register long int __r6 __asm__("r6") = __arg2;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r6), "r"(__r12)                                      \
- @@ -243,11 +246,14 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+       PLD(    pld     [r1, #-4]               )
+ -     PLD(    subs    r2, r2, #96             )
+ +     PLD(    cmp     r2, #96                 )
+       PLD(    pld     [r1, #-32]              )
+ -     PLD(    blt     13f                     )
+ +     PLD(    blo     13f                     )
+       PLD(    pld     [r1, #-64]              )
+       PLD(    pld     [r1, #-96]              )
   
-  # define inline_syscall3(name,arg1,arg2,arg3)                                 \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- -    register long __r6 __asm__("r6") = (long)(arg2);                          \
- -    register long __r7 __asm__("r7") = (long)(arg3);                          \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    long int __arg2 = (long int) (arg2);                                      \
- +    long int __arg3 = (long int) (arg3);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
- +    register long int __r6 __asm__("r6") = __arg2;                            \
- +    register long int __r7 __asm__("r7") = __arg3;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r12)                           \
- @@ -257,12 +263,16 @@ SYSCALL_ERROR_LABEL_DCL:                            \
-  
-  # define inline_syscall4(name,arg1,arg2,arg3,arg4)                            \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- -    register long __r6 __asm__("r6") = (long)(arg2);                          \
- -    register long __r7 __asm__("r7") = (long)(arg3);                          \
- -    register long __r8 __asm__("r8") = (long)(arg4);                          \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    long int __arg2 = (long int) (arg2);                                      \
- +    long int __arg3 = (long int) (arg3);                                      \
- +    long int __arg4 = (long int) (arg4);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
- +    register long int __r6 __asm__("r6") = __arg2;                            \
- +    register long int __r7 __asm__("r7") = __arg3;                            \
- +    register long int __r8 __asm__("r8") = __arg4;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r12)                 \
- @@ -272,13 +282,18 @@ SYSCALL_ERROR_LABEL_DCL:                            \
-  
-  # define inline_syscall5(name,arg1,arg2,arg3,arg4,arg5)                       \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- -    register long __r6 __asm__("r6") = (long)(arg2);                          \
- -    register long __r7 __asm__("r7") = (long)(arg3);                          \
- -    register long __r8 __asm__("r8") = (long)(arg4);                          \
- -    register long __r9 __asm__("r9") = (long)(arg5);                          \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    long int __arg2 = (long int) (arg2);                                      \
- +    long int __arg3 = (long int) (arg3);                                      \
- +    long int __arg4 = (long int) (arg4);                                      \
- +    long int __arg5 = (long int) (arg5);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
- +    register long int __r6 __asm__("r6") = __arg2;                            \
- +    register long int __r7 __asm__("r7") = __arg3;                            \
- +    register long int __r8 __asm__("r8") = __arg4;                            \
- +    register long int __r9 __asm__("r9") = __arg5;                            \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r9), "r"(__r12)      \
- @@ -288,14 +303,20 @@ SYSCALL_ERROR_LABEL_DCL:                            \
-  
-  # define inline_syscall6(name,arg1,arg2,arg3,arg4,arg5,arg6)                  \
-    ({                                                                          \
- -    register long __ret __asm__("r3");                                        \
- -    register long __r12 __asm__("r12") = name;                                \
- -    register long __r5 __asm__("r5") = (long)(arg1);                          \
- -    register long __r6 __asm__("r6") = (long)(arg2);                          \
- -    register long __r7 __asm__("r7") = (long)(arg3);                          \
- -    register long __r8 __asm__("r8") = (long)(arg4);                          \
- -    register long __r9 __asm__("r9") = (long)(arg5);                          \
- -    register long __r10 __asm__("r10") = (long)(arg6);                        \
- +    long int __arg1 = (long int) (arg1);                                      \
- +    long int __arg2 = (long int) (arg2);                                      \
- +    long int __arg3 = (long int) (arg3);                                      \
- +    long int __arg4 = (long int) (arg4);                                      \
- +    long int __arg5 = (long int) (arg5);                                      \
- +    long int __arg6 = (long int) (arg6);                                      \
- +    register long int __ret __asm__("r3");                                    \
- +    register long int __r12 __asm__("r12") = name;                            \
- +    register long int __r5 __asm__("r5") = __arg1;                            \
- +    register long int __r6 __asm__("r6") = __arg2;                            \
- +    register long int __r7 __asm__("r7") = __arg3;                            \
- +    register long int __r8 __asm__("r8") = __arg4;                            \
- +    register long int __r9 __asm__("r9") = __arg5;                            \
- +    register long int __r10 __asm__("r10") = __arg6;                          \
-      __asm__ __volatile__( "brki r14,8; nop;"                                  \
-        : "=r"(__ret)                                                           \
-        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r9), "r"(__r10),     \
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
- index b2bbf10181..ff445a5406 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
- @@ -22,9 +22,9 @@
-       .text
-       .set    nomips16
+ @@ -295,9 +293,7 @@ ENTRY(memmove)
+               mov     r4, r4, PUSH #\push
+               orr     r4, r4, r3, PULL #\pull
+               stmdb   r0!, {r4 - r8, r10, ip, lr}
+ -             bge     12b
+ -     PLD(    cmn     r2, #96                 )
+ -     PLD(    bge     13b                     )
+ +             bhs     12b
   
- -/* long long __mips_syscall5 (long arg1, long arg2, long arg3, long arg4,
- -                           long arg5,
- -                           long number)  */
- +/* long long int __mips_syscall5 (long int arg1, long int arg2, long int arg3,
- +                               long int arg4, long int arg5,
- +                               long int number)  */
+               pop     {r5 - r8, r10}
+               cfi_adjust_cfa_offset (-20)
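The ARM memcpy/memmove hunks above swap the signed branch conditions (bge/blt) used after the subs on the remaining byte count for their unsigned counterparts (bhs/blo), and drop the prefetch checks that depended on the signed comparison. A minimal C sketch of why the signedness matters for counts with the top bit set (illustrative only, not glibc code):

    #include <stdio.h>

    int
    main (void)
    {
      unsigned int remaining = 0x80000000u;  /* a count with the top bit set */

      /* Signed view (what bge/blt effectively tested): on a typical
         two's-complement target the count looks negative, so a
         "keep copying" style test fails and the loop stops far too early.  */
      printf ("signed   remaining >= 32: %d\n", (int) remaining >= 32);

      /* Unsigned view (bhs/blo): correct for any size-like count.  */
      printf ("unsigned remaining >= 32: %d\n", remaining >= 32u);
      return 0;
    }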
+ diff --git a/sysdeps/generic/unwind-arch.h b/sysdeps/generic/unwind-arch.h
+ new file mode 100644
+ index 0000000000..d712e5e11d
+ --- /dev/null
+ +++ b/sysdeps/generic/unwind-arch.h
+ @@ -0,0 +1,30 @@
+ +/* Return backtrace of current program state.  Arch-specific bits.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#ifndef _UNWIND_ARCH_H
+ +#define _UNWIND_ARCH_H
+ +
+ +#include <unwind.h>
+ +
+ +static inline void *
+ +unwind_arch_adjustment (void *prev, void *addr)
+ +{
+ +  return addr;
+ +}
+ +
+ +#endif
+ diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c
+ index 0a37397284..25ca8f8463 100644
+ --- a/sysdeps/hppa/dl-fptr.c
+ +++ b/sysdeps/hppa/dl-fptr.c
+ @@ -172,8 +172,8 @@ make_fdesc (ElfW(Addr) ip, ElfW(Addr) gp)
+      }
   
-  ENTRY(__mips_syscall5)
-       lw      v0, 20(sp)
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
- index 572d7c1137..2b4a3117d1 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
- @@ -22,9 +22,9 @@
-       .text
-       .set    nomips16
+   install:
+ -  fdesc->ip = ip;
+    fdesc->gp = gp;
+ +  fdesc->ip = ip;
   
- -/* long long __mips_syscall6 (long arg1, long arg2, long arg3, long arg4,
- -                           long arg5, long arg6,
- -                           long number)  */
- +/* long long int __mips_syscall6 (long int arg1, long int arg2, long int arg3,
- +                               long int arg4, long int arg5, long int arg6,
- +                               long int number)  */
+    return (ElfW(Addr)) fdesc;
+  }
+ @@ -350,7 +350,9 @@ ElfW(Addr)
+  _dl_lookup_address (const void *address)
+  {
+    ElfW(Addr) addr = (ElfW(Addr)) address;
+ -  unsigned int *desc, *gptr;
+ +  ElfW(Word) reloc_arg;
+ +  volatile unsigned int *desc;
+ +  unsigned int *gptr;
   
-  ENTRY(__mips_syscall6)
-       lw      v0, 24(sp)
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
- index 05164cb253..2723bbb138 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
- @@ -22,9 +22,10 @@
-       .text
-       .set    nomips16
+    /* Return ADDR if the least-significant two bits of ADDR are not consistent
+       with ADDR being a linker defined function pointer.  The normal value for
+ @@ -367,7 +369,11 @@ _dl_lookup_address (const void *address)
+    if (!_dl_read_access_allowed (desc))
+      return addr;
   
- -/* long long __mips_syscall7 (long arg1, long arg2, long arg3, long arg4,
- -                           long arg5, long arg6, long arg7,
- -                           long number)  */
- +/* long long int __mips_syscall7 (long int arg1, long int arg2, long int arg3,
- +                               long int arg4, long int arg5, long int arg6,
- +                               long int arg7,
- +                               long int number)  */
+ -  /* Load first word of candidate descriptor.  It should be a pointer
+ +  /* First load the relocation offset.  */
+ +  reloc_arg = (ElfW(Word)) desc[1];
+ +  atomic_full_barrier();
+ +
+ +  /* Then load first word of candidate descriptor.  It should be a pointer
+       with word alignment and point to memory that can be read.  */
+    gptr = (unsigned int *) desc[0];
+    if (((unsigned int) gptr & 3) != 0
+ @@ -377,8 +383,8 @@ _dl_lookup_address (const void *address)
+    /* See if descriptor requires resolution.  The following trampoline is
+       used in each global offset table for function resolution:
   
-  ENTRY(__mips_syscall7)
-       lw      v0, 28(sp)
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
- index 9bf551ace8..f23ede0259 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
- @@ -19,51 +19,57 @@
-  #ifndef MIPS16_SYSCALL_H
-  #define MIPS16_SYSCALL_H 1
+ -             ldw 0(r20),r22
+ -             bv r0(r22)
+ +             ldw 0(r20),r21
+ +             bv r0(r21)
+               ldw 4(r20),r21
+       tramp:  b,l .-12,r20
+               depwi 0,31,2,r20
+ @@ -389,7 +395,15 @@ _dl_lookup_address (const void *address)
+    if (gptr[0] == 0xea9f1fdd                  /* b,l .-12,r20     */
+        && gptr[1] == 0xd6801c1e                       /* depwi 0,31,2,r20 */
+        && (ElfW(Addr)) gptr[2] == elf_machine_resolve ())
+ -    _dl_fixup ((struct link_map *) gptr[5], (ElfW(Word)) desc[1]);
+ +    {
+ +      struct link_map *l = (struct link_map *) gptr[5];
+ +
+ +      /* If gp has been resolved, we need to hunt for relocation offset.  */
+ +      if (!(reloc_arg & PA_GP_RELOC))
+ +     reloc_arg = _dl_fix_reloc_arg (addr, l);
+ +
+ +      _dl_fixup (l, reloc_arg);
+ +    }
   
- -long long __nomips16 __mips16_syscall0 (long number);
- +long long int __nomips16 __mips16_syscall0 (long int number);
-  #define __mips16_syscall0(dummy, number)                             \
- -     __mips16_syscall0 ((long) (number))
- +     __mips16_syscall0 ((long int) (number))
+    return (ElfW(Addr)) desc[0];
+  }
+ diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h
+ index 9e98366ea3..8ecff97706 100644
+ --- a/sysdeps/hppa/dl-machine.h
+ +++ b/sysdeps/hppa/dl-machine.h
+ @@ -48,6 +48,14 @@
+  #define GOT_FROM_PLT_STUB (4*4)
+  #define PLT_ENTRY_SIZE (2*4)
   
- -long long __nomips16 __mips16_syscall1 (long a0,
- -                                     long number);
- +long long int __nomips16 __mips16_syscall1 (long int a0,
- +                                         long int number);
-  #define __mips16_syscall1(a0, number)                                        \
- -     __mips16_syscall1 ((long) (a0),                                 \
- -                        (long) (number))
- +     __mips16_syscall1 ((long int) (a0),                             \
- +                        (long int) (number))
-  
- -long long __nomips16 __mips16_syscall2 (long a0, long a1,
- -                                     long number);
- +long long int __nomips16 __mips16_syscall2 (long int a0, long int a1,
- +                                         long int number);
-  #define __mips16_syscall2(a0, a1, number)                            \
- -     __mips16_syscall2 ((long) (a0), (long) (a1),                    \
- -                        (long) (number))
- +     __mips16_syscall2 ((long int) (a0), (long int) (a1),            \
- +                        (long int) (number))
-  
- -long long __nomips16 __mips16_syscall3 (long a0, long a1, long a2,
- -                                     long number);
- +long long int __nomips16 __mips16_syscall3 (long int a0, long int a1,
- +                                         long int a2,
- +                                         long int number);
-  #define __mips16_syscall3(a0, a1, a2, number)                                \
- -     __mips16_syscall3 ((long) (a0), (long) (a1), (long) (a2),       \
- -                        (long) (number))
- +     __mips16_syscall3 ((long int) (a0), (long int) (a1),            \
- +                        (long int) (a2),                             \
- +                        (long int) (number))
-  
- -long long __nomips16 __mips16_syscall4 (long a0, long a1, long a2, long a3,
- -                                     long number);
- +long long int __nomips16 __mips16_syscall4 (long int a0, long int a1,
- +                                         long int a2, long int a3,
- +                                         long int number);
-  #define __mips16_syscall4(a0, a1, a2, a3, number)                    \
- -     __mips16_syscall4 ((long) (a0), (long) (a1), (long) (a2),       \
- -                        (long) (a3),                                 \
- -                        (long) (number))
- +     __mips16_syscall4 ((long int) (a0), (long int) (a1),            \
- +                        (long int) (a2), (long int) (a3),            \
- +                        (long int) (number))
+ +/* The gp slot in the function descriptor contains the relocation offset
+ +   before resolution.  To distinguish between a resolved gp value and an
+ +   unresolved relocation offset we set an unused bit in the relocation
+ +   offset.  This would allow us to do a synchronzied two word update
+ +   using this bit (interlocked update), but instead of waiting for the
+ +   update we simply recompute the gp value given that we know the ip.  */
+ +#define PA_GP_RELOC 1
+ +
+  /* Initialize the function descriptor table before relocations */
+  static inline void
+  __hppa_init_bootstrap_fdesc_table (struct link_map *map)
+ @@ -117,10 +125,28 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t t,
+    volatile Elf32_Addr *rfdesc = reloc_addr;
+    /* map is the link_map for the caller, t is the link_map for the object
+       being called */
+ -  rfdesc[1] = value.gp;
+ -  /* Need to ensure that the gp is visible before the code
+ -     entry point is updated */
+ -  rfdesc[0] = value.ip;
+ +
+ +  /* We would like the function descriptor to be double word aligned.  This
+ +     helps performance (ip and gp then reside on the same cache line) and
+ +     we can update the pair atomically with a single store.  The linker
+ +     now ensures this alignment but we still have to handle old code.  */
+ +  if ((unsigned int)reloc_addr & 7)
+ +    {
+ +      /* Need to ensure that the gp is visible before the code
+ +         entry point is updated */
+ +      rfdesc[1] = value.gp;
+ +      atomic_full_barrier();
+ +      rfdesc[0] = value.ip;
+ +    }
+ +  else
+ +    {
+ +      /* Update pair atomically with floating point store.  */
+ +      union { ElfW(Word) v[2]; double d; } u;
+ +
+ +      u.v[0] = value.ip;
+ +      u.v[1] = value.gp;
+ +      *(volatile double *)rfdesc = u.d;
+ +    }
+    return value;
+  }
   
-  /* The remaining ones use regular MIPS wrappers.  */
+ @@ -265,7 +291,7 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
+                    here.  The trampoline code will load the proper
+                    LTP and pass the reloc offset to the fixup
+                    function.  */
+ -               fptr->gp = iplt - jmprel;
+ +               fptr->gp = (iplt - jmprel) | PA_GP_RELOC;
+               } /* r_sym != 0 */
+             else
+               {
+ diff --git a/sysdeps/hppa/dl-runtime.c b/sysdeps/hppa/dl-runtime.c
+ new file mode 100644
+ index 0000000000..885a3f1837
+ --- /dev/null
+ +++ b/sysdeps/hppa/dl-runtime.c
+ @@ -0,0 +1,58 @@
+ +/* On-demand PLT fixup for shared objects.  HPPA version.
+ +   Copyright (C) 2019 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, write to the Free
+ +   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ +   02111-1307 USA.  */
+ +
+ +/* Clear PA_GP_RELOC bit in relocation offset.  */
+ +#define reloc_offset (reloc_arg & ~PA_GP_RELOC)
+ +#define reloc_index  (reloc_arg & ~PA_GP_RELOC) / sizeof (PLTREL)
+ +
+ +#include <elf/dl-runtime.c>
+ +
+ +/* The caller has encountered a partially relocated function descriptor.
+ +   The gp of the descriptor has been updated, but not the ip.  We find
+ +   the function descriptor again and compute the relocation offset and
+ +   return that to the caller.  The caller will continue on to call
+ +   _dl_fixup with the relocation offset.  */
+ +
+ +ElfW(Word)
+ +attribute_hidden __attribute ((noinline)) ARCH_FIXUP_ATTRIBUTE
+ +_dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
+ +{
+ +  Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type;
+ +  const Elf32_Rela *reloc;
+ +
+ +  l_addr = l->l_addr;
+ +  jmprel = D_PTR(l, l_info[DT_JMPREL]);
+ +  end_jmprel = jmprel + l->l_info[DT_PLTRELSZ]->d_un.d_val;
+ +
+ +  /* Look for the entry...  */
+ +  for (iplt = jmprel; iplt < end_jmprel; iplt += sizeof (Elf32_Rela))
+ +    {
+ +      reloc = (const Elf32_Rela *) iplt;
+ +      r_type = ELF32_R_TYPE (reloc->r_info);
+ +
+ +      if (__builtin_expect (r_type == R_PARISC_IPLT, 1)
+ +       && fptr == (struct fdesc *) (reloc->r_offset + l_addr))
+ +     /* Found entry. Return the reloc offset.  */
+ +     return iplt - jmprel;
+ +    }
+ +
+ +  /* Crash if we weren't passed a valid function pointer.  */
+ +  ABORT_INSTRUCTION;
+ +  return 0;
+ +}
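The new _dl_fix_reloc_arg above recovers the relocation offset for a descriptor whose gp slot has already been resolved. The tagging trick it pairs with can be sketched in a few lines of plain C (illustrative names, not the glibc sources): genuine PLT relocation offsets are multiples of sizeof (Elf32_Rela), i.e. 12 bytes, so bit 0 is never set in a real offset and PA_GP_RELOC is free to mark a gp slot that still holds an offset rather than a resolved gp.

    #include <stdint.h>
    #include <stdio.h>

    #define PA_GP_RELOC 1u          /* tag bit, as in the patch above */

    /* Store a reloc offset, tagged, into the gp slot of a lazy PLT entry.  */
    static uint32_t
    tag_offset (uint32_t iplt_minus_jmprel)
    {
      return iplt_minus_jmprel | PA_GP_RELOC;
    }

    /* Does the slot still hold a (tagged) reloc offset?  */
    static int
    is_unresolved (uint32_t gp_slot)
    {
      return (gp_slot & PA_GP_RELOC) != 0;
    }

    int
    main (void)
    {
      uint32_t slot = tag_offset (2 * 12);   /* hypothetical third PLT reloc */
      printf ("unresolved=%d offset=%u\n", is_unresolved (slot),
              slot & ~PA_GP_RELOC);
      return 0;
    }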
+ diff --git a/sysdeps/hppa/dl-trampoline.S b/sysdeps/hppa/dl-trampoline.S
+ index 0114ca8b19..d0804b30c0 100644
+ --- a/sysdeps/hppa/dl-trampoline.S
+ +++ b/sysdeps/hppa/dl-trampoline.S
+ @@ -31,7 +31,7 @@
+     slow down __cffc when it attempts to call fixup to resolve function
+     descriptor references. Please refer to gcc/gcc/config/pa/fptr.c
   
-  #define __mips16_syscall5(a0, a1, a2, a3, a4, number)                        \
- -     __mips_syscall5 ((long) (a0), (long) (a1), (long) (a2),         \
- -                      (long) (a3), (long) (a4),                      \
- -                      (long) (number))
- +     __mips_syscall5 ((long int) (a0), (long int) (a1),              \
- +                      (long int) (a2), (long int) (a3),              \
- +                      (long int) (a4),                               \
- +                      (long int) (number))
+ -   Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp.  */
+ +   Enter with r19 = reloc offset, r20 = got-8, r21 = fixup ltp, r22 = fp.  */
   
-  #define __mips16_syscall6(a0, a1, a2, a3, a4, a5, number)            \
- -     __mips_syscall6 ((long) (a0), (long) (a1), (long) (a2),         \
- -                      (long) (a3), (long) (a4), (long) (a5),         \
- -                      (long) (number))
- +     __mips_syscall6 ((long int) (a0), (long int) (a1),              \
- +                      (long int) (a2), (long int) (a3),              \
- +                      (long int) (a4), (long int) (a5),              \
- +                      (long int) (number))
+       /* RELOCATION MARKER: bl to provide gcc's __cffc with fixup loc. */
+       .text
+ @@ -61,17 +61,20 @@ _dl_runtime_resolve:
+       copy    %sp, %r1        /* Copy previous sp */
+       /* Save function result address (on entry) */
+       stwm    %r28,128(%sp)
+ -     /* Fillin some frame info to follow ABI */
+ +     /* Fill in some frame info to follow ABI */
+       stw     %r1,-4(%sp)     /* Previous sp */
+       stw     %r21,-32(%sp)   /* PIC register value */
   
-  #define __mips16_syscall7(a0, a1, a2, a3, a4, a5, a6, number)                \
- -     __mips_syscall7 ((long) (a0), (long) (a1), (long) (a2),         \
- -                      (long) (a3), (long) (a4), (long) (a5),         \
- -                      (long) (a6),                                   \
- -                      (long) (number))
- +     __mips_syscall7 ((long int) (a0), (long int) (a1),              \
- +                      (long int) (a2), (long int) (a3),              \
- +                      (long int) (a4), (long int) (a5),              \
- +                      (long int) (a6),                               \
- +                      (long int) (number))
+       /* Save input floating point registers. This must be done
+          in the new frame since the previous frame doesn't have
+          enough space */
+ -     ldo     -56(%sp),%r1
+ +     ldo     -64(%sp),%r1
+       fstd,ma %fr4,-8(%r1)
+       fstd,ma %fr5,-8(%r1)
+       fstd,ma %fr6,-8(%r1)
+ +
+ +     /* Test PA_GP_RELOC bit.  */
+ +     bb,>=   %r19,31,2f              /* branch if not reloc offset */
+       fstd,ma %fr7,-8(%r1)
   
-  #endif
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
- index 92f16e2724..43c05f8050 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
- @@ -20,8 +20,8 @@
+       /* Set up args to fixup func, needs only two arguments  */
+ @@ -79,7 +82,7 @@ _dl_runtime_resolve:
+       copy    %r19,%r25               /* (2) reloc offset  */
   
-  #undef __mips16_syscall0
+       /* Call the real address resolver. */
+ -     bl      _dl_fixup,%rp
+ +3:   bl      _dl_fixup,%rp
+       copy    %r21,%r19               /* set fixup func ltp */
   
- -long long __nomips16
- -__mips16_syscall0 (long number)
- +long long int __nomips16
- +__mips16_syscall0 (long int number)
-  {
-    union __mips_syscall_return ret;
-    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 0);
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
- index fa985a96e5..16a567e834 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
- @@ -20,9 +20,9 @@
+       /* While the linker will set a function pointer to NULL when it
+ @@ -102,7 +105,7 @@ _dl_runtime_resolve:
+       copy    %r29, %r19
   
-  #undef __mips16_syscall1
+       /* Reload arguments fp args */
+ -     ldo     -56(%sp),%r1
+ +     ldo     -64(%sp),%r1
+       fldd,ma -8(%r1),%fr4
+       fldd,ma -8(%r1),%fr5
+       fldd,ma -8(%r1),%fr6
+ @@ -129,6 +132,25 @@ _dl_runtime_resolve:
+       bv      %r0(%rp)
+       ldo     -128(%sp),%sp
   
- -long long __nomips16
- -__mips16_syscall1 (long a0,
- -                long number)
- +long long int __nomips16
- +__mips16_syscall1 (long int a0,
- +                long int number)
-  {
-    union __mips_syscall_return ret;
-    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 1,
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
- index f042ac815d..c0a856c344 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
- @@ -20,9 +20,9 @@
-  
-  #undef __mips16_syscall2
+ +2:
+ +     /* Set up args for _dl_fix_reloc_arg.  */
+ +     copy    %r22,%r26               /* (1) function pointer */
+ +     depi    0,31,2,%r26             /* clear least significant bits */
+ +     ldw     8+4(%r20),%r25          /* (2) got[1] == struct link_map */
+ +
+ +     /* Save ltp and link map arg for _dl_fixup.  */
+ +     stw     %r21,-56(%sp)           /* ltp */
+ +     stw     %r25,-60(%sp)           /* struct link map */
+ +
+ +     /* Find reloc offset. */
+ +     bl      _dl_fix_reloc_arg,%rp
+ +     copy    %r21,%r19               /* set func ltp */
+ +
+ +     /* Set up args for _dl_fixup.  */
+ +     ldw     -56(%sp),%r21           /* ltp */
+ +     ldw     -60(%sp),%r26           /* (1) struct link map */
+ +     b       3b
+ +     copy    %ret0,%r25              /* (2) reloc offset */
+          .EXIT
+          .PROCEND
+       cfi_endproc
+ @@ -153,7 +175,7 @@ _dl_runtime_profile:
+       copy    %sp, %r1        /* Copy previous sp */
+       /* Save function result address (on entry) */
+       stwm    %r28,192(%sp)
+ -     /* Fillin some frame info to follow ABI */
+ +     /* Fill in some frame info to follow ABI */
+       stw     %r1,-4(%sp)     /* Previous sp */
+       stw     %r21,-32(%sp)   /* PIC register value */
   
- -long long __nomips16
- -__mips16_syscall2 (long a0, long a1,
- -                long number)
- +long long int __nomips16
- +__mips16_syscall2 (long int a0, long int a1,
- +                long int number)
-  {
-    union __mips_syscall_return ret;
-    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 2,
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
- index dfe2f7feb5..042768ebf2 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
- @@ -20,9 +20,9 @@
+ @@ -181,10 +203,11 @@ _dl_runtime_profile:
+       fstd,ma %fr5,8(%r1)
+       fstd,ma %fr6,8(%r1)
+       fstd,ma %fr7,8(%r1)
+ -     /* 32-bit stack pointer and return register */
+ -     stw     %sp,-56(%sp)
+ -     stw     %r2,-52(%sp)
   
-  #undef __mips16_syscall3
+ +     /* Test PA_GP_RELOC bit.  */
+ +     bb,>=   %r19,31,2f              /* branch if not reloc offset */
+ +     /* 32-bit stack pointer */
+ +     stw     %sp,-56(%sp)
   
- -long long __nomips16
- -__mips16_syscall3 (long a0, long a1, long a2,
- -                long number)
- +long long int __nomips16
- +__mips16_syscall3 (long int a0, long int a1, long int a2,
- +                long int number)
-  {
-    union __mips_syscall_return ret;
-    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 3,
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
- index 39de510357..8658d822ab 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
- @@ -20,9 +20,9 @@
+       /* Set up args to fixup func, needs five arguments  */
+       ldw     8+4(%r20),%r26          /* (1) got[1] == struct link_map */
+ @@ -197,7 +220,7 @@ _dl_runtime_profile:
+       stw     %r1, -52(%sp)           /* (5) long int *framesizep */
   
-  #undef __mips16_syscall4
+       /* Call the real address resolver. */
+ -     bl      _dl_profile_fixup,%rp
+ +3:   bl      _dl_profile_fixup,%rp
+       copy    %r21,%r19               /* set fixup func ltp */
   
- -long long __nomips16
- -__mips16_syscall4 (long a0, long a1, long a2, long a3,
- -                long number)
- +long long int __nomips16
- +__mips16_syscall4 (long int a0, long int a1, long int a2, long int a3,
- +                long int number)
-  {
-    union __mips_syscall_return ret;
-    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 4,
- diff --git a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
- index beefcf284b..0c6a83e9b3 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
- +++ b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
- @@ -52,7 +52,7 @@
-  #undef INLINE_SYSCALL
-  #define INLINE_SYSCALL(name, nr, args...)                               \
-    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
- -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
- +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
-       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
-         {                                                             \
-        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
- @@ -61,10 +61,10 @@
-       result_var; })
+       /* Load up the returned function descriptor */
+ @@ -215,7 +238,9 @@ _dl_runtime_profile:
+       fldd,ma 8(%r1),%fr5
+       fldd,ma 8(%r1),%fr6
+       fldd,ma 8(%r1),%fr7
+ -     ldw     -52(%sp),%rp
+ +
+ +     /* Reload rp register -(192+20) without adjusting stack */
+ +     ldw     -212(%sp),%rp
   
-  #undef INTERNAL_SYSCALL_DECL
- -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
- +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
+       /* Reload static link register -(192+16) without adjusting stack */
+       ldw     -208(%sp),%r29
+ @@ -303,6 +328,33 @@ L(cont):
+          ldw -20(%sp),%rp
+       /* Return */
+       bv,n    0(%r2)
+ +
+ +2:
+ +     /* Set up args for _dl_fix_reloc_arg.  */
+ +     copy    %r22,%r26               /* (1) function pointer */
+ +     depi    0,31,2,%r26             /* clear least significant bits */
+ +     ldw     8+4(%r20),%r25          /* (2) got[1] == struct link_map */
+ +
+ +     /* Save ltp and link map arg for _dl_fixup.  */
+ +     stw     %r21,-92(%sp)           /* ltp */
+ +     stw     %r25,-116(%sp)          /* struct link map */
+ +
+ +     /* Find reloc offset. */
+ +     bl      _dl_fix_reloc_arg,%rp
+ +     copy    %r21,%r19               /* set func ltp */
+ +
+ +      /* Restore fixup ltp.  */
+ +     ldw     -92(%sp),%r21           /* ltp */
+ +
+ +     /* Set up args to fixup func, needs five arguments  */
+ +     ldw     -116(%sp),%r26          /* (1) struct link map */
+ +     copy    %ret0,%r25              /* (2) reloc offset  */
+ +     stw     %r25,-120(%sp)          /* Save reloc offset */
+ +     ldw     -212(%sp),%r24          /* (3) profile_fixup needs rp */
+ +     ldo     -56(%sp),%r23           /* (4) La_hppa_regs */
+ +     ldo     -112(%sp), %r1
+ +     b       3b
+ +     stw     %r1, -52(%sp)           /* (5) long int *framesizep */
+          .EXIT
+          .PROCEND
+       cfi_endproc
+ diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
+ index 8af0789a9c..4334ade2a0 100644
+ --- a/sysdeps/i386/dl-machine.h
+ +++ b/sysdeps/i386/dl-machine.h
+ @@ -338,16 +338,22 @@ elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc,
+       {
+  # ifndef RTLD_BOOTSTRAP
+         if (sym_map != map
+ -           && sym_map->l_type != lt_executable
+             && !sym_map->l_relocated)
+           {
+             const char *strtab
+               = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+ -           _dl_error_printf ("\
+ +           if (sym_map->l_type == lt_executable)
+ +             _dl_fatal_printf ("\
+ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+ +and creates an unsatisfiable circular dependency.\n",
+ +                               RTLD_PROGNAME, strtab + refsym->st_name,
+ +                               map->l_name);
+ +           else
+ +             _dl_error_printf ("\
+  %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
+ -                             RTLD_PROGNAME, map->l_name,
+ -                             sym_map->l_name,
+ -                             strtab + refsym->st_name);
+ +                               RTLD_PROGNAME, map->l_name,
+ +                               sym_map->l_name,
+ +                               strtab + refsym->st_name);
+           }
+  # endif
+         value = ((Elf32_Addr (*) (void)) value) ();
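The dl-machine.h change above makes the dynamic loader fail hard when a relocation in a shared object needs an IFUNC symbol that is defined in the still-unrelocated executable; the old code skipped that case without any diagnostic and went on to call the resolver anyway. A hedged sketch of the construct involved, using the GCC ifunc attribute (hypothetical names, compiled as part of the main executable):

    /* my_func is an IFUNC: the loader calls the resolver to pick an
       implementation.  If this lives in the executable and a shared library's
       relocations reference it before the executable has been relocated, the
       loader now stops with the "unsatisfiable circular dependency" error
       instead of continuing with a bogus value.  */
    static int impl_generic (void) { return 0; }
    static int impl_fast (void)    { return 1; }

    static int (*resolve_my_func (void)) (void)
    {
      /* A real resolver would check CPU features here.  */
      return 1 ? impl_fast : impl_generic;
    }

    int my_func (void) __attribute__ ((ifunc ("resolve_my_func")));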
+ diff --git a/sysdeps/i386/sysdep.h b/sysdeps/i386/sysdep.h
+ index b4bcd8fb6c..6094af8fec 100644
+ --- a/sysdeps/i386/sysdep.h
+ +++ b/sysdeps/i386/sysdep.h
+ @@ -61,7 +61,7 @@ lose: SYSCALL_PIC_SETUP                                                           \
   
-  #undef INTERNAL_SYSCALL_ERROR_P
- -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
- +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+  # define SETUP_PIC_REG(reg) \
+    .ifndef GET_PC_THUNK(reg);                                               \
+ -  .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits;               \
+ +  .section .text.GET_PC_THUNK(reg),"axG",@progbits,GET_PC_THUNK(reg),comdat;  \
+    .globl GET_PC_THUNK(reg);                                                \
+    .hidden GET_PC_THUNK(reg);                                               \
+    .p2align 4;                                                                      \
+ @@ -97,7 +97,8 @@ GET_PC_THUNK(reg):                                                        \
   
-  #undef INTERNAL_SYSCALL_ERRNO
-  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
- @@ -103,11 +103,11 @@
+  # define SETUP_PIC_REG_STR(reg)                                              \
+    ".ifndef " GET_PC_THUNK_STR (reg) "\n"                             \
+ -  ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \
+ +  ".section .text." GET_PC_THUNK_STR (reg) ",\"axG\",@progbits,"     \
+ +    GET_PC_THUNK_STR (reg) ",comdat\n"                                       \
+    ".globl " GET_PC_THUNK_STR (reg) "\n"                                      \
+    ".hidden " GET_PC_THUNK_STR (reg) "\n"                             \
+    ".p2align 4\n"                                                     \
+ diff --git a/sysdeps/ieee754/ldbl-96/Makefile b/sysdeps/ieee754/ldbl-96/Makefile
+ index 995e90d6da..6030adf7e7 100644
+ --- a/sysdeps/ieee754/ldbl-96/Makefile
+ +++ b/sysdeps/ieee754/ldbl-96/Makefile
+ @@ -17,5 +17,8 @@
+  # <https://www.gnu.org/licenses/>.
   
-  union __mips_syscall_return
-    {
- -    long long val;
- +    long long int val;
-      struct
-        {
- -     long v0;
- -     long v1;
- +     long int v0;
- +     long int v1;
-        }
-      reg;
-    };
- @@ -152,13 +152,13 @@ union __mips_syscall_return
+  ifeq ($(subdir),math)
+ -tests += test-canonical-ldbl-96 test-totalorderl-ldbl-96
+ +tests += test-canonical-ldbl-96 test-totalorderl-ldbl-96 test-sinl-pseudo
+ +ifeq ($(have-ssp),yes)
+ +CFLAGS-test-sinl-pseudo.c += -fstack-protector-all
+  endif
+ +endif # $(subdir) == math
+ diff --git a/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
+ index 5f742321ae..bcdf20179f 100644
+ --- a/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
+ +++ b/sysdeps/ieee754/ldbl-96/e_rem_pio2l.c
+ @@ -210,6 +210,18 @@ __ieee754_rem_pio2l (long double x, long double *y)
+        return 0;
+      }
   
-  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -175,14 +175,15 @@ union __mips_syscall_return
-  
-  #define internal_syscall1(v0_init, input, number, err, arg1)         \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -199,15 +200,17 @@ union __mips_syscall_return
+ +  if ((i0 & 0x80000000) == 0)
+ +    {
+ +      /* Pseudo-zero and unnormal representations are not valid
+ +      representations of long double.  We need to avoid stack
+ +      corruption in __kernel_rem_pio2, which expects input in a
+ +      particular normal form, but those representations do not need
+ +      to be consistently handled like any particular floating-point
+ +      value.  */
+ +      y[1] = y[0] = __builtin_nanl ("");
+ +      return 0;
+ +    }
+ +
+    /* Split the 64 bits of the mantissa into three 24-bit integers
+       stored in a double array.  */
+    exp = j0 - 23;
+ diff --git a/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c b/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c
+ new file mode 100644
+ index 0000000000..f59b97769d
+ --- /dev/null
+ +++ b/sysdeps/ieee754/ldbl-96/test-sinl-pseudo.c
+ @@ -0,0 +1,41 @@
+ +/* Test sinl for pseudo-zeros and unnormals for ldbl-96 (bug 25487).
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <math.h>
+ +#include <math_ldbl.h>
+ +#include <stdint.h>
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  for (int i = 0; i < 64; i++)
+ +    {
+ +      uint64_t sig = i == 63 ? 0 : 1ULL << i;
+ +      long double ld;
+ +      SET_LDOUBLE_WORDS (ld, 0x4141,
+ +                      sig >> 32, sig & 0xffffffffULL);
+ +      /* The requirement is that no stack overflow occurs when the
+ +      pseudo-zero or unnormal goes through range reduction.  */
+ +      volatile long double ldr;
+ +      ldr = sinl (ld);
+ +      (void) ldr;
+ +    }
+ +  return 0;
+ +}
+ +
+ +#include <support/test-driver.c>
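Both the new __ieee754_rem_pio2l check and the test above hinge on the explicit integer bit of the 96-bit x86 long double format: bit 63 of the significand (bit 31 of the high word i0) is set for every valid normalized value, while pseudo-zero and unnormal encodings leave it clear. A small classification sketch under that assumed layout (illustrative only, not glibc code):

    #include <stdbool.h>
    #include <stdint.h>

    struct ldbl96_words
    {
      uint32_t sign_exp;  /* sign bit plus 15-bit biased exponent */
      uint32_t msw;       /* high word of the significand; bit 31 = integer bit */
      uint32_t lsw;       /* low word of the significand */
    };

    /* Mirrors the new (i0 & 0x80000000) == 0 test: a non-zero exponent with a
       clear integer bit is not something valid arithmetic can produce.  */
    static bool
    is_pseudo_or_unnormal (const struct ldbl96_words *w)
    {
      uint32_t exp = w->sign_exp & 0x7fff;
      return exp != 0 && (w->msw & 0x80000000u) == 0;
    }

    int
    main (void)
    {
      struct ldbl96_words w = { 0x4141, 0, 0 };   /* the test's pseudo-zero */
      return is_pseudo_or_unnormal (&w) ? 0 : 1;
    }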
+ diff --git a/sysdeps/posix/getcwd.c b/sysdeps/posix/getcwd.c
+ index f00b337a13..839d78d7b7 100644
+ --- a/sysdeps/posix/getcwd.c
+ +++ b/sysdeps/posix/getcwd.c
+ @@ -241,6 +241,14 @@ __getcwd (char *buf, size_t size)
+    char *path;
+  #ifndef NO_ALLOCATION
+    size_t allocated = size;
+ +
+ +  /* A size of 1 byte is never useful.  */
+ +  if (allocated == 1)
+ +    {
+ +      __set_errno (ERANGE);
+ +      return NULL;
+ +    }
+ +
+    if (size == 0)
+      {
+        if (buf != NULL)
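With the check added above, getcwd can never succeed when handed a 1-byte buffer: even the root directory needs two bytes once the terminating NUL is counted, so the call now fails up front with ERANGE instead of taking the allocation path. A short usage sketch:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main (void)
    {
      char buf[1];

      if (getcwd (buf, sizeof buf) == NULL)
        /* Expected after this change: ERANGE ("Result too large").  */
        printf ("getcwd failed: %d (%s)\n", errno, strerror (errno));
      return 0;
    }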
+ diff --git a/sysdeps/posix/system.c b/sysdeps/posix/system.c
+ index e613e6a344..a03f478fc7 100644
+ --- a/sysdeps/posix/system.c
+ +++ b/sysdeps/posix/system.c
+ @@ -101,7 +101,8 @@ cancel_handler (void *arg)
+  static int
+  do_system (const char *line)
+  {
+ -  int status;
+ +  int status = -1;
+ +  int ret;
+    pid_t pid;
+    struct sigaction sa;
+  #ifndef _LIBC_REENTRANT
+ @@ -144,14 +145,14 @@ do_system (const char *line)
+    __posix_spawnattr_setflags (&spawn_attr,
+                             POSIX_SPAWN_SETSIGDEF | POSIX_SPAWN_SETSIGMASK);
   
-  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -225,16 +228,19 @@ union __mips_syscall_return
-  #define internal_syscall3(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3)                             \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -252,16 +258,20 @@ union __mips_syscall_return
-  #define internal_syscall4(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4)                       \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     long int _arg4 = (long int) (arg4);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7") = (long) (arg4);                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7") = _arg4;                      \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -285,63 +295,66 @@ union __mips_syscall_return
-     compiler specifics required for the stack arguments to be pushed,
-     which would be the case if these syscalls were inlined.  */
+ -  status = __posix_spawn (&pid, SHELL_PATH, 0, &spawn_attr,
+ -                       (char *const[]){ (char*) SHELL_NAME,
+ -                                        (char*) "-c",
+ -                                        (char *) line, NULL },
+ -                       __environ);
+ +  ret = __posix_spawn (&pid, SHELL_PATH, 0, &spawn_attr,
+ +                    (char *const[]){ (char *) SHELL_NAME,
+ +                                     (char *) "-c",
+ +                                     (char *) line, NULL },
+ +                    __environ);
+    __posix_spawnattr_destroy (&spawn_attr);
   
- -long long __nomips16 __mips_syscall5 (long arg1, long arg2, long arg3,
- -                                   long arg4, long arg5,
- -                                   long number);
- +long long int __nomips16 __mips_syscall5 (long int arg1, long int arg2,
- +                                       long int arg3, long int arg4,
- +                                       long int arg5,
- +                                       long int number);
-  libc_hidden_proto (__mips_syscall5, nomips16)
+ -  if (status == 0)
+ +  if (ret == 0)
+      {
+        /* Cancellation results in cleanup handlers running as exceptions in
+        the block where they were installed, so it is safe to reference
+ @@ -186,6 +187,9 @@ do_system (const char *line)
+      }
+    DO_UNLOCK ();
   
-  #define internal_syscall5(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5)                 \
-  ({                                                                   \
-       union __mips_syscall_return _sc_ret;                            \
- -     _sc_ret.val = __mips_syscall5 ((long) (arg1),                   \
- -                                    (long) (arg2),                   \
- -                                    (long) (arg3),                   \
- -                                    (long) (arg4),                   \
- -                                    (long) (arg5),                   \
- -                                    (long) (number));                \
- +     _sc_ret.val = __mips_syscall5 ((long int) (arg1),               \
- +                                    (long int) (arg2),               \
- +                                    (long int) (arg3),               \
- +                                    (long int) (arg4),               \
- +                                    (long int) (arg5),               \
- +                                    (long int) (number));            \
-       err = _sc_ret.reg.v1;                                           \
-       _sc_ret.reg.v0;                                                 \
-  })
+ +  if (ret != 0)
+ +    __set_errno (ret);
+ +
+    return status;
+  }
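
The pattern applied in do_system above, shown standalone (a sketch, not the glibc code): posix_spawn reports errors through its return value rather than errno, so the spawn result is kept separate from the child's wait status and errno is only set when the spawn itself failed.

    #include <errno.h>
    #include <spawn.h>
    #include <sys/wait.h>

    extern char **environ;

    int
    run_shell (const char *line)
    {
      int status = -1;                  /* reported when the child never ran */
      pid_t pid;
      char *const argv[] = { (char *) "sh", (char *) "-c", (char *) line, NULL };

      int ret = posix_spawn (&pid, "/bin/sh", NULL, NULL, argv, environ);
      if (ret == 0)
        {
          /* The spawn worked: the result is the child's wait status.  */
          while (waitpid (pid, &status, 0) < 0 && errno == EINTR)
            continue;
        }
      else
        errno = ret;                    /* spawn failure: propagate the code */

      return status;
    }
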
   
- -long long __nomips16 __mips_syscall6 (long arg1, long arg2, long arg3,
- -                                   long arg4, long arg5, long arg6,
- -                                   long number);
- +long long int __nomips16 __mips_syscall6 (long int arg1, long int arg2,
- +                                       long int arg3, long int arg4,
- +                                       long int arg5, long int arg6,
- +                                       long int number);
-  libc_hidden_proto (__mips_syscall6, nomips16)
-  
-  #define internal_syscall6(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5, arg6)           \
-  ({                                                                   \
-       union __mips_syscall_return _sc_ret;                            \
- -     _sc_ret.val = __mips_syscall6 ((long) (arg1),                   \
- -                                    (long) (arg2),                   \
- -                                    (long) (arg3),                   \
- -                                    (long) (arg4),                   \
- -                                    (long) (arg5),                   \
- -                                    (long) (arg6),                   \
- -                                    (long) (number));                \
- +     _sc_ret.val = __mips_syscall6 ((long int) (arg1),               \
- +                                    (long int) (arg2),               \
- +                                    (long int) (arg3),               \
- +                                    (long int) (arg4),               \
- +                                    (long int) (arg5),               \
- +                                    (long int) (arg6),               \
- +                                    (long int) (number));            \
-       err = _sc_ret.reg.v1;                                           \
-       _sc_ret.reg.v0;                                                 \
-  })
-  
- -long long __nomips16 __mips_syscall7 (long arg1, long arg2, long arg3,
- -                                   long arg4, long arg5, long arg6,
- -                                   long arg7,
- -                                   long number);
- +long long int __nomips16 __mips_syscall7 (long int arg1, long int arg2,
- +                                       long int arg3, long int arg4,
- +                                       long int arg5, long int arg6,
- +                                       long int arg7,
- +                                       long int number);
-  libc_hidden_proto (__mips_syscall7, nomips16)
-  
-  #define internal_syscall7(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5, arg6, arg7)     \
-  ({                                                                   \
-       union __mips_syscall_return _sc_ret;                            \
- -     _sc_ret.val = __mips_syscall7 ((long) (arg1),                   \
- -                                    (long) (arg2),                   \
- -                                    (long) (arg3),                   \
- -                                    (long) (arg4),                   \
- -                                    (long) (arg5),                   \
- -                                    (long) (arg6),                   \
- -                                    (long) (arg7),                   \
- -                                    (long) (number));                \
- +     _sc_ret.val = __mips_syscall7 ((long int) (arg1),               \
- +                                    (long int) (arg2),               \
- +                                    (long int) (arg3),               \
- +                                    (long int) (arg4),               \
- +                                    (long int) (arg5),               \
- +                                    (long int) (arg6),               \
- +                                    (long int) (arg7),               \
- +                                    (long int) (number));            \
-       err = _sc_ret.reg.v1;                                           \
-       _sc_ret.reg.v0;                                                 \
-  })
- diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
- index f96636538a..4a9d7054f9 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
- +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
- @@ -47,14 +47,14 @@
-  
-  /* Convert X to a long long, without losing any bits if it is one
-     already or warning if it is a 32-bit pointer.  */
- -#define ARGIFY(X) ((long long) (__typeof__ ((X) - (X))) (X))
- +#define ARGIFY(X) ((long long int) (__typeof__ ((X) - (X))) (X))
-  
-  /* Define a macro which expands into the inline wrapper code for a system
-     call.  */
-  #undef INLINE_SYSCALL
-  #define INLINE_SYSCALL(name, nr, args...)                            \
-    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
- -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
- +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
-       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
-         {                                                             \
-        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
- @@ -63,10 +63,10 @@
-       result_var; })
-  
-  #undef INTERNAL_SYSCALL_DECL
- -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
- +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
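
The ARGIFY macro above widens each argument through the type of (X) - (X), so 32-bit unsigned values are zero-extended and 64-bit values keep all of their bits on the way to a 64-bit syscall register; a small standalone illustration:

    #include <stdio.h>

    #define ARGIFY(X) ((long long int) (__typeof__ ((X) - (X))) (X))

    int
    main (void)
    {
      unsigned int small = 0x80000000u;
      long long int wide = 0x0123456789abcdefLL;
      /* Prints 80000000 (zero-extended, not sign-extended) and
         123456789abcdef (untouched).  */
      printf ("%llx\n", (unsigned long long) ARGIFY (small));
      printf ("%llx\n", (unsigned long long) ARGIFY (wide));
      return 0;
    }
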
+ diff --git a/sysdeps/powerpc/powerpc32/sysdep.h b/sysdeps/powerpc/powerpc32/sysdep.h
+ index 2ba009e919..829eec266a 100644
+ --- a/sysdeps/powerpc/powerpc32/sysdep.h
+ +++ b/sysdeps/powerpc/powerpc32/sysdep.h
+ @@ -179,8 +179,8 @@ GOT_LABEL:                        ;                                             \
+  #else
+  /* Position-dependent code does not require access to the GOT.  */
+  # define __GLRO(rOUT, rGOT, member, offset)                          \
+ -     lis     rOUT,(member+LOWORD)@ha;                                        \
+ -     lwz     rOUT,(member+LOWORD)@l(rOUT)
+ +     lis     rOUT,(member)@ha;                                       \
+ +     lwz     rOUT,(member)@l(rOUT)
+  #endif       /* PIC */
   
-  #undef INTERNAL_SYSCALL_ERROR_P
- -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
- +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+  #endif       /* __ASSEMBLER__ */
+ diff --git a/sysdeps/powerpc/powerpc64/backtrace.c b/sysdeps/powerpc/powerpc64/backtrace.c
+ index 8a53a1088f..362a2b713c 100644
+ --- a/sysdeps/powerpc/powerpc64/backtrace.c
+ +++ b/sysdeps/powerpc/powerpc64/backtrace.c
+ @@ -54,11 +54,22 @@ struct signal_frame_64 {
+    /* We don't care about the rest, since the IP value is at 'uc' field.  */
+  };
   
-  #undef INTERNAL_SYSCALL_ERRNO
-  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
- @@ -112,13 +112,13 @@
+ +/* Test whether the address lies inside the signal trampoline code.
+ +   Up to and including kernel 5.8, returning from an interrupt or syscall
+ +   to a signal handler starts execution directly at the handler's entry
+ +   point, with LR set to the address of the sigreturn trampoline (the vDSO
+ +   symbol).
+ +   Newer kernels branch to the signal handler from the trampoline instead,
+ +   so checking the stacktrace against the vDSO entry point alone does not
+ +   work in that case.
+ +   The vDSO branches with a 'bctrl' instruction, so checking both the
+ +   vDSO address itself and the next instruction should cover all kernel
+ +   versions.  */
+  static inline bool
+  is_sigtramp_address (void *nip)
+  {
+  #ifdef HAVE_SIGTRAMP_RT64
+ -  if (nip == GLRO (dl_vdso_sigtramp_rt64))
+ +  if (nip == GLRO (dl_vdso_sigtramp_rt64) ||
+ +      nip == GLRO (dl_vdso_sigtramp_rt64) + 4)
+      return true;
+  #endif
+    return false;
+ diff --git a/sysdeps/s390/configure b/sysdeps/s390/configure
+ index fa46e9e351..e7f576338d 100644
+ --- a/sysdeps/s390/configure
+ +++ b/sysdeps/s390/configure
+ @@ -123,7 +123,9 @@ void testinsn (char *buf)
+      __asm__ (".machine \"arch13\" \n\t"
+            ".machinemode \"zarch_nohighgprs\" \n\t"
+            "lghi %%r0,16 \n\t"
+ -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ +          "mvcrl 0(%0),32(%0) \n\t"
+ +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ +          : : "a" (buf) : "memory", "r0");
+  }
+  EOF
+  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
+ @@ -271,7 +273,9 @@ else
+  void testinsn (char *buf)
+  {
+      __asm__ ("lghi %%r0,16 \n\t"
+ -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ +          "mvcrl 0(%0),32(%0) \n\t"
+ +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ +          : : "a" (buf) : "memory", "r0");
+  }
+  EOF
+  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS --shared conftest.c
+ diff --git a/sysdeps/s390/configure.ac b/sysdeps/s390/configure.ac
+ index 3ed5a8ef87..5c3479e8cf 100644
+ --- a/sysdeps/s390/configure.ac
+ +++ b/sysdeps/s390/configure.ac
+ @@ -88,7 +88,9 @@ void testinsn (char *buf)
+      __asm__ (".machine \"arch13\" \n\t"
+            ".machinemode \"zarch_nohighgprs\" \n\t"
+            "lghi %%r0,16 \n\t"
+ -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ +          "mvcrl 0(%0),32(%0) \n\t"
+ +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ +          : : "a" (buf) : "memory", "r0");
+  }
+  EOF
+  dnl test, if assembler supports S390 arch13 instructions
+ @@ -195,7 +197,9 @@ cat > conftest.c <<\EOF
+  void testinsn (char *buf)
+  {
+      __asm__ ("lghi %%r0,16 \n\t"
+ -          "mvcrl 0(%0),32(%0)" : : "a" (buf) : "memory", "r0");
+ +          "mvcrl 0(%0),32(%0) \n\t"
+ +          "vstrs %%v20,%%v20,%%v20,%%v20,0,2"
+ +          : : "a" (buf) : "memory", "r0");
+  }
+  EOF
+  dnl test, if assembler supports S390 arch13 zarch instructions as default
+ diff --git a/sysdeps/s390/memmove.c b/sysdeps/s390/memmove.c
+ index 5fc85e129f..ee59b5de14 100644
+ --- a/sysdeps/s390/memmove.c
+ +++ b/sysdeps/s390/memmove.c
+ @@ -43,7 +43,7 @@ extern __typeof (__redirect_memmove) MEMMOVE_ARCH13 attribute_hidden;
+  s390_libc_ifunc_expr (__redirect_memmove, memmove,
+                     ({
+                       s390_libc_ifunc_expr_stfle_init ();
+ -                     (HAVE_MEMMOVE_ARCH13
+ +                     (HAVE_MEMMOVE_ARCH13 && (hwcap & HWCAP_S390_VXRS_EXT2)
+                        && S390_IS_ARCH13_MIE3 (stfle_bits))
+                         ? MEMMOVE_ARCH13
+                         : (HAVE_MEMMOVE_Z13 && (hwcap & HWCAP_S390_VX))
+ diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
+ index e6195c6e26..17c0cc3952 100644
+ --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+ +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
+ @@ -171,7 +171,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+      IFUNC_IMPL (i, name, memmove,
+  # if HAVE_MEMMOVE_ARCH13
+               IFUNC_IMPL_ADD (array, i, memmove,
+ -                             S390_IS_ARCH13_MIE3 (stfle_bits),
+ +                             ((dl_hwcap & HWCAP_S390_VXRS_EXT2)
+ +                              && S390_IS_ARCH13_MIE3 (stfle_bits)),
+                               MEMMOVE_ARCH13)
+  # endif
+  # if HAVE_MEMMOVE_Z13
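
Both s390 hunks above tighten the same guard: the arch13 memmove is only selected when the kernel also advertises HWCAP_S390_VXRS_EXT2, not on the STFLE facility bit alone. A rough sketch of that selection outside the glibc ifunc machinery (select_memmove and the two implementation pointers are hypothetical; the hwcap constant is the one tested in the patch):

    #include <string.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_S390_VXRS_EXT2
    # define HWCAP_S390_VXRS_EXT2 32768   /* bit 15, as in the s390 headers */
    #endif

    typedef void *(*memmove_fn) (void *, const void *, size_t);

    /* Pick the vector-capable routine only when the kernel says the
       extended vector registers are actually usable.  */
    memmove_fn
    select_memmove (memmove_fn vector_impl, memmove_fn fallback_impl)
    {
      unsigned long int hwcap = getauxval (AT_HWCAP);
      return (hwcap & HWCAP_S390_VXRS_EXT2) ? vector_impl : fallback_impl;
    }
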
+ diff --git a/sysdeps/sh/be/sh4/fpu/Implies b/sysdeps/sh/be/sh4/fpu/Implies
+ new file mode 100644
+ index 0000000000..71b28ee1a4
+ --- /dev/null
+ +++ b/sysdeps/sh/be/sh4/fpu/Implies
+ @@ -0,0 +1 @@
+ +sh/sh4/fpu
+ diff --git a/sysdeps/sh/le/sh4/fpu/Implies b/sysdeps/sh/le/sh4/fpu/Implies
+ new file mode 100644
+ index 0000000000..71b28ee1a4
+ --- /dev/null
+ +++ b/sysdeps/sh/le/sh4/fpu/Implies
+ @@ -0,0 +1 @@
+ +sh/sh4/fpu
+ diff --git a/sysdeps/unix/make-syscalls.sh b/sysdeps/unix/make-syscalls.sh
+ index c07626677f..4f6c3490a2 100644
+ --- a/sysdeps/unix/make-syscalls.sh
+ +++ b/sysdeps/unix/make-syscalls.sh
+ @@ -30,6 +30,7 @@
+  # P: optionally-NULL pointer to typed object (e.g., 3rd argument to sigaction)
+  # s: non-NULL string (e.g., 1st arg to open)
+  # S: optionally-NULL string (e.g., 1st arg to acct)
+ +# U: unsigned long int (32-bit types are zero-extended to 64-bit types)
+  # v: vararg scalar (e.g., optional 3rd arg to open)
+  # V: byte-per-page vector (3rd arg to mincore)
+  # W: wait status, optionally-NULL pointer to int (e.g., 2nd arg of wait4)
+ @@ -184,6 +185,27 @@ while read file srcfile caller syscall args strong weak; do
+    ?:?????????) nargs=9;;
+    esac
   
-  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a3 asm ("$7");                             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a3 asm ("$7");                         \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
+ +  # Derive the unsigned long int arguments from the argument signature
+ +  ulong_arg_1=0
+ +  ulong_arg_2=0
+ +  ulong_count=0
+ +  for U in $(echo $args | sed -e "s/.*:/:/" | grep -ob U)
+ +  do
+ +    ulong_count=$(expr $ulong_count + 1)
+ +    ulong_arg=$(echo $U | sed -e "s/:U//")
+ +    case $ulong_count in
+ +    1)
+ +      ulong_arg_1=$ulong_arg
+ +      ;;
+ +    2)
+ +      ulong_arg_2=$ulong_arg
+ +      ;;
+ +    *)
+ +      echo >&2 "$0: Too many unsigned long int arguments for syscall ($strong $weak)"
+ +      exit 2
+ +    esac
+ +  done
+ +
+    # Make sure only the first syscall rule is used, if multiple dirs
+    # define the same syscall.
+    echo ''
+ @@ -245,6 +267,8 @@ while read file srcfile caller syscall args strong weak; do
+       \$(make-target-directory)
+       (echo '#define SYSCALL_NAME $syscall'; \\
+        echo '#define SYSCALL_NARGS $nargs'; \\
+ +      echo '#define SYSCALL_ULONG_ARG_1 $ulong_arg_1'; \\
+ +      echo '#define SYSCALL_ULONG_ARG_2 $ulong_arg_2'; \\
+        echo '#define SYSCALL_SYMBOL $strong'; \\
+        echo '#define SYSCALL_NOERRNO $noerrno'; \\
+        echo '#define SYSCALL_ERRVAL $errval'; \\
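
As a concrete illustration of what the new 'U' handling produces (hypothetical output, reconstructed from the rules above): for the `write ... Ci:ibU` entry further down, the count argument is the third one, so the generated per-syscall stub would look roughly like this before it is fed through syscall-template.S:

    /* Sketch of the generated wrapper for `write' (illustrative values).  */
    #define SYSCALL_NAME write
    #define SYSCALL_NARGS 3
    #define SYSCALL_ULONG_ARG_1 3       /* the count is unsigned long int */
    #define SYSCALL_ULONG_ARG_2 0       /* no second such argument */
    #define SYSCALL_SYMBOL __libc_write
    #define SYSCALL_NOERRNO 0
    #define SYSCALL_ERRVAL 0
    #include <syscall-template.S>
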
+ diff --git a/sysdeps/unix/syscall-template.S b/sysdeps/unix/syscall-template.S
+ index cf6c7a58fb..f807a8603f 100644
+ --- a/sysdeps/unix/syscall-template.S
+ +++ b/sysdeps/unix/syscall-template.S
+ @@ -25,6 +25,12 @@
+     defining a few macros:
+       SYSCALL_NAME            syscall name
+       SYSCALL_NARGS           number of arguments this call takes
+ +     SYSCALL_ULONG_ARG_1     the first unsigned long int argument this
+ +                             call takes.  0 means that there are no
+ +                             unsigned long int arguments.
+ +     SYSCALL_ULONG_ARG_2     the second unsigned long int argument this
+ +                             call takes.  0 means that there is at most
+ +                             one unsigned long int argument.
+       SYSCALL_SYMBOL          primary symbol name
+       SYSCALL_NOERRNO         1 to define a no-errno version (see below)
+       SYSCALL_ERRVAL          1 to define an error-value version (see below)
+ @@ -44,9 +50,31 @@
+  /* This indirection is needed so that SYMBOL gets macro-expanded.  */
+  #define syscall_hidden_def(SYMBOL)           hidden_def (SYMBOL)
+  
+ -#define T_PSEUDO(SYMBOL, NAME, N)            PSEUDO (SYMBOL, NAME, N)
+ -#define T_PSEUDO_NOERRNO(SYMBOL, NAME, N)    PSEUDO_NOERRNO (SYMBOL, NAME, N)
+ -#define T_PSEUDO_ERRVAL(SYMBOL, NAME, N)     PSEUDO_ERRVAL (SYMBOL, NAME, N)
+ +/* If PSEUDOS_HAVE_ULONG_INDICES is defined, PSEUDO and T_PSEUDO macros
+ +   have 2 extra arguments for unsigned long int arguments:
+ +     Extra argument 1: Position of the first unsigned long int argument.
+ +     Extra argument 2: Position of the second unsigned long int argument.
+ + */
+ +#ifndef PSEUDOS_HAVE_ULONG_INDICES
+ +# undef SYSCALL_ULONG_ARG_1
+ +# define SYSCALL_ULONG_ARG_1 0
+ +#endif
+ +
+ +#if SYSCALL_ULONG_ARG_1
+ +# define T_PSEUDO(SYMBOL, NAME, N, U1, U2) \
+ +  PSEUDO (SYMBOL, NAME, N, U1, U2)
+ +# define T_PSEUDO_NOERRNO(SYMBOL, NAME, N, U1, U2) \
+ +  PSEUDO_NOERRNO (SYMBOL, NAME, N, U1, U2)
+ +# define T_PSEUDO_ERRVAL(SYMBOL, NAME, N, U1, U2) \
+ +  PSEUDO_ERRVAL (SYMBOL, NAME, N, U1, U2)
+ +#else
+ +# define T_PSEUDO(SYMBOL, NAME, N) \
+ +  PSEUDO (SYMBOL, NAME, N)
+ +# define T_PSEUDO_NOERRNO(SYMBOL, NAME, N) \
+ +  PSEUDO_NOERRNO (SYMBOL, NAME, N)
+ +# define T_PSEUDO_ERRVAL(SYMBOL, NAME, N) \
+ +  PSEUDO_ERRVAL (SYMBOL, NAME, N)
+ +#endif
+  #define T_PSEUDO_END(SYMBOL)                 PSEUDO_END (SYMBOL)
+  #define T_PSEUDO_END_NOERRNO(SYMBOL)         PSEUDO_END_NOERRNO (SYMBOL)
+  #define T_PSEUDO_END_ERRVAL(SYMBOL)          PSEUDO_END_ERRVAL (SYMBOL)
+ @@ -56,7 +84,12 @@
+  /* This kind of system call stub never returns an error.
+     We return the return value register to the caller unexamined.  */
+  
+ +# if SYSCALL_ULONG_ARG_1
+ +T_PSEUDO_NOERRNO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
+ +               SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
+ +# else
+  T_PSEUDO_NOERRNO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
+ +# endif
+       ret_NOERRNO
+  T_PSEUDO_END_NOERRNO (SYSCALL_SYMBOL)
+  
+ @@ -66,7 +99,12 @@ T_PSEUDO_END_NOERRNO (SYSCALL_SYMBOL)
+     value, or zero for success.  We may massage the kernel's return value
+     to meet that ABI, but we never set errno here.  */
+  
+ +# if SYSCALL_ULONG_ARG_1
+ +T_PSEUDO_ERRVAL (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
+ +              SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
+ +# else
+  T_PSEUDO_ERRVAL (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
+ +# endif
+       ret_ERRVAL
+  T_PSEUDO_END_ERRVAL (SYSCALL_SYMBOL)
+  
+ @@ -75,7 +113,12 @@ T_PSEUDO_END_ERRVAL (SYSCALL_SYMBOL)
+  /* This is a "normal" system call stub: if there is an error,
+     it returns -1 and sets errno.  */
+  
+ +# if SYSCALL_ULONG_ARG_1
+ +T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS,
+ +       SYSCALL_ULONG_ARG_1, SYSCALL_ULONG_ARG_2)
+ +# else
+  T_PSEUDO (SYSCALL_SYMBOL, SYSCALL_NAME, SYSCALL_NARGS)
+ +# endif
+       ret
+  T_PSEUDO_END (SYSCALL_SYMBOL)
+  
+ diff --git a/sysdeps/unix/syscalls.list b/sysdeps/unix/syscalls.list
+ index e28e801c7a..6b22b2cb45 100644
+ --- a/sysdeps/unix/syscalls.list
+ +++ b/sysdeps/unix/syscalls.list
+ @@ -39,27 +39,27 @@ kill              -       kill            i:ii    __kill          kill
+  link         -       link            i:ss    __link          link
+  listen               -       listen          i:ii    __listen        listen
+  lseek                -       lseek           i:iii   __libc_lseek    __lseek lseek
+ -madvise              -       madvise         i:pii   __madvise       madvise
+ +madvise              -       madvise         i:pUi   __madvise       madvise
+  mkdir                -       mkdir           i:si    __mkdir         mkdir
+ -mmap         -       mmap            b:aniiii __mmap         mmap
+ -mprotect     -       mprotect        i:aii   __mprotect      mprotect
+ -munmap               -       munmap          i:ai    __munmap        munmap
+ +mmap         -       mmap            b:aUiiii __mmap         mmap
+ +mprotect     -       mprotect        i:aUi   __mprotect      mprotect
+ +munmap               -       munmap          i:aU    __munmap        munmap
+  open         -       open            Ci:siv  __libc_open __open open
+  profil               -       profil          i:piii  __profil        profil
+  ptrace               -       ptrace          i:iiii  ptrace
+ -read         -       read            Ci:ibn  __libc_read     __read read
+ -readlink     -       readlink        i:spi   __readlink      readlink
+ +read         -       read            Ci:ibU  __libc_read     __read read
+ +readlink     -       readlink        i:spU   __readlink      readlink
+  readv                -       readv           Ci:ipi  __readv         readv
+  reboot               -       reboot          i:i     reboot
+ -recv         -       recv            Ci:ibni __libc_recv     recv
+ -recvfrom     -       recvfrom        Ci:ibniBN       __libc_recvfrom __recvfrom recvfrom
+ +recv         -       recv            Ci:ibUi __libc_recv     recv
+ +recvfrom     -       recvfrom        Ci:ibUiBN       __libc_recvfrom __recvfrom recvfrom
+  recvmsg              -       recvmsg         Ci:ipi  __libc_recvmsg  __recvmsg recvmsg
+  rename               -       rename          i:ss    rename
+  rmdir                -       rmdir           i:s     __rmdir         rmdir
+  select               -       select          Ci:iPPPP        __select        __libc_select select
+ -send         -       send            Ci:ibni __libc_send     __send send
+ +send         -       send            Ci:ibUi __libc_send     __send send
+  sendmsg              -       sendmsg         Ci:ipi  __libc_sendmsg  __sendmsg sendmsg
+ -sendto               -       sendto          Ci:ibnibn       __libc_sendto   __sendto sendto
+ +sendto               -       sendto          Ci:ibUibn       __libc_sendto   __sendto sendto
+  setdomain    -       setdomainname   i:si    setdomainname
+  setegid              -       setegid         i:i     __setegid       setegid
+  seteuid              -       seteuid         i:i     __seteuid       seteuid
+ @@ -94,5 +94,5 @@ uname               -       uname           i:p     __uname         uname
+  unlink               -       unlink          i:s     __unlink        unlink
+  utimes               -       utimes          i:sp    __utimes        utimes
+  vhangup              -       vhangup         i:i     vhangup
+ -write                -       write           Ci:ibn  __libc_write    __write write
+ +write                -       write           Ci:ibU  __libc_write    __write write
+  writev               -       writev          Ci:ipi  __writev        writev
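
For reference, the signature letters map straight onto the C prototypes; switching the length arguments from 'n' to 'U' records that they are unsigned long int sized (size_t), e.g. for write (the annotation below is only a reading of the signature string, not generated output):

    #include <sys/types.h>

    /* Ci:ibU -> 'C' cancellation point, scalar return, arguments:
         i  int             fd
         b  buffer          buf
         U  unsigned long   count (size_t)  */
    ssize_t write (int fd, const void *buf, size_t count);
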
+ diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
+ index f12b7b1a2d..0a0da00151 100644
+ --- a/sysdeps/unix/sysv/linux/Makefile
+ +++ b/sysdeps/unix/sysv/linux/Makefile
+ @@ -60,7 +60,9 @@ sysdep_routines += adjtimex clone umount umount2 readahead \
+                  setfsuid setfsgid epoll_pwait signalfd \
+                  eventfd eventfd_read eventfd_write prlimit \
+                  personality epoll_wait tee vmsplice splice \
+ -                open_by_handle_at mlock2 pkey_mprotect pkey_set pkey_get
+ +                open_by_handle_at mlock2 pkey_mprotect pkey_set pkey_get \
+ +                prctl \
+ +                process_vm_readv process_vm_writev
+  
+  CFLAGS-gethostid.c = -fexceptions
+  CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
+ @@ -273,7 +275,7 @@ sysdep_routines += xstatconv internal_statvfs internal_statvfs64 \
+  
+  sysdep_headers += bits/fcntl-linux.h
+  
+ -tests += tst-fallocate tst-fallocate64
+ +tests += tst-fallocate tst-fallocate64 tst-getcwd-smallbuff
+  endif
+  
+  ifeq ($(subdir),elf)
+ diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+ index 9378387747..c8471947b9 100644
+ --- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+ +++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
+ @@ -17,6 +17,7 @@
+  #define __NR_clock_nanosleep 115
+  #define __NR_clock_settime 112
+  #define __NR_clone 220
+ +#define __NR_clone3 435
+  #define __NR_close 57
+  #define __NR_connect 203
+  #define __NR_copy_file_range 285
+ diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+ index 1389cea1b3..346d045fb4 100644
+ --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+ +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+ @@ -51,8 +51,12 @@
+  
+  #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'                             \
+                          && MIDR_PARTNUM(midr) == 0x000)
+ -#define IS_ARES(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                       \
+ -                     && MIDR_PARTNUM(midr) == 0xd0c)
+ +#define IS_NEOVERSE_N1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
+ +                           && MIDR_PARTNUM(midr) == 0xd0c)
+ +#define IS_NEOVERSE_N2(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
+ +                           && MIDR_PARTNUM(midr) == 0xd49)
+ +#define IS_NEOVERSE_V1(midr) (MIDR_IMPLEMENTOR(midr) == 'A'                \
+ +                           && MIDR_PARTNUM(midr) == 0xd40)
+  
+  #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P'                       \
+                         && MIDR_PARTNUM(midr) == 0x000)
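
The IS_* helpers above key off MIDR_EL1 fields; for orientation, a sketch of how the implementer and part number are decoded (the MIDR_* macros are reproduced from the header for illustration, the checking function is made up):

    #include <stdint.h>

    /* MIDR_EL1: implementer in bits [31:24], part number in bits [15:4].  */
    #define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 0xff)
    #define MIDR_PARTNUM(midr)     (((midr) >> 4) & 0xfff)

    /* A Neoverse N1 reports implementer 'A' (0x41, Arm Ltd) and part
       number 0xd0c, which is exactly what IS_NEOVERSE_N1 tests.  */
    int
    is_neoverse_n1 (uint64_t midr)
    {
      return MIDR_IMPLEMENTOR (midr) == 'A' && MIDR_PARTNUM (midr) == 0xd0c;
    }
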
+ diff --git a/sysdeps/unix/sysv/linux/aarch64/localplt.data b/sysdeps/unix/sysv/linux/aarch64/localplt.data
+ index a60053b914..08af68b5e8 100644
+ --- a/sysdeps/unix/sysv/linux/aarch64/localplt.data
+ +++ b/sysdeps/unix/sysv/linux/aarch64/localplt.data
+ @@ -7,6 +7,9 @@ libc.so: malloc
+  libc.so: memalign
+  libc.so: realloc
+  libm.so: matherr
+ +# If outline atomics are used, libgcc (built outside of glibc) may
+ +# call __getauxval using the PLT.
+ +libc.so: __getauxval ?
+  # The dynamic loader needs __tls_get_addr for TLS.
+  ld.so: __tls_get_addr
+  # The main malloc is interposed into the dynamic linker, for
+ diff --git a/sysdeps/unix/sysv/linux/getpt.c b/sysdeps/unix/sysv/linux/getpt.c
+ index 1803b232c9..3cc745e11a 100644
+ --- a/sysdeps/unix/sysv/linux/getpt.c
+ +++ b/sysdeps/unix/sysv/linux/getpt.c
+ @@ -16,69 +16,18 @@
+     License along with the GNU C Library; if not, see
+     <https://www.gnu.org/licenses/>.  */
+  
+ -#include <errno.h>
+  #include <fcntl.h>
+ -#include <stdlib.h>
+  #include <unistd.h>
+  #include <paths.h>
+ -#include <sys/statfs.h>
+ -
+ -#include "linux_fsinfo.h"
+  
+  /* Path to the master pseudo terminal cloning device.  */
+  #define _PATH_DEVPTMX _PATH_DEV "ptmx"
+ -/* Directory containing the UNIX98 pseudo terminals.  */
+ -#define _PATH_DEVPTS _PATH_DEV "pts"
+ -
+ -/* Prototype for function that opens BSD-style master pseudo-terminals.  */
+ -extern int __bsd_getpt (void) attribute_hidden;
+  
+  /* Open a master pseudo terminal and return its file descriptor.  */
+  int
+  __posix_openpt (int oflag)
+  {
+ -  static int have_no_dev_ptmx;
+ -  int fd;
+ -
+ -  if (!have_no_dev_ptmx)
+ -    {
+ -      fd = __open (_PATH_DEVPTMX, oflag);
+ -      if (fd != -1)
+ -     {
+ -       struct statfs fsbuf;
+ -       static int devpts_mounted;
+ -
+ -       /* Check that the /dev/pts filesystem is mounted
+ -          or if /dev is a devfs filesystem (this implies /dev/pts).  */
+ -       if (devpts_mounted
+ -           || (__statfs (_PATH_DEVPTS, &fsbuf) == 0
+ -               && fsbuf.f_type == DEVPTS_SUPER_MAGIC)
+ -           || (__statfs (_PATH_DEV, &fsbuf) == 0
+ -               && fsbuf.f_type == DEVFS_SUPER_MAGIC))
+ -         {
+ -           /* Everything is ok.  */
+ -           devpts_mounted = 1;
+ -           return fd;
+ -         }
+ -
+ -       /* If /dev/pts is not mounted then the UNIX98 pseudo terminals
+ -          are not usable.  */
+ -       __close (fd);
+ -       have_no_dev_ptmx = 1;
+ -       __set_errno (ENOENT);
+ -     }
+ -      else
+ -     {
+ -       if (errno == ENOENT || errno == ENODEV)
+ -         have_no_dev_ptmx = 1;
+ -       else
+ -         return -1;
+ -     }
+ -    }
+ -  else
+ -    __set_errno (ENOENT);
+ -
+ -  return -1;
+ +  return __open (_PATH_DEVPTMX, oflag);
+  }
+  weak_alias (__posix_openpt, posix_openpt)
+  
+ @@ -86,16 +35,6 @@ weak_alias (__posix_openpt, posix_openpt)
+  int
+  __getpt (void)
+  {
+ -  int fd = __posix_openpt (O_RDWR);
+ -  if (fd == -1)
+ -    fd = __bsd_getpt ();
+ -  return fd;
+ +  return __posix_openpt (O_RDWR);
+  }
+ -
+ -
+ -#define PTYNAME1 "pqrstuvwxyzabcde";
+ -#define PTYNAME2 "0123456789abcdef";
+ -
+ -#define __getpt __bsd_getpt
+ -#define HAVE_POSIX_OPENPT
+ -#include <sysdeps/unix/bsd/getpt.c>
+ +weak_alias (__getpt, getpt)
+ diff --git a/sysdeps/unix/sysv/linux/grantpt.c b/sysdeps/unix/sysv/linux/grantpt.c
+ index 2030e07fa6..43122f9a76 100644
+ --- a/sysdeps/unix/sysv/linux/grantpt.c
+ +++ b/sysdeps/unix/sysv/linux/grantpt.c
+ @@ -1,44 +1,41 @@
+ -#include <assert.h>
+ -#include <ctype.h>
+ -#include <dirent.h>
+ -#include <errno.h>
+ -#include <fcntl.h>
+ -#include <paths.h>
+ -#include <stdlib.h>
+ -#include <unistd.h>
+ +/* grantpt implementation for Linux.
+ +   Copyright (C) 1998-2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +   Contributed by Zack Weinberg <zack@rabi.phys.columbia.edu>, 1998.
+  
+ -#include <not-cancel.h>
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+  
+ -#include "pty-private.h"
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+  
+ -#if HAVE_PT_CHOWN
+ -/* Close all file descriptors except the one specified.  */
+ -static void
+ -close_all_fds (void)
+ -{
+ -  DIR *dir = __opendir ("/proc/self/fd");
+ -  if (dir != NULL)
+ -    {
+ -      struct dirent64 *d;
+ -      while ((d = __readdir64 (dir)) != NULL)
+ -     if (isdigit (d->d_name[0]))
+ -       {
+ -         char *endp;
+ -         long int fd = strtol (d->d_name, &endp, 10);
+ -         if (*endp == '\0' && fd != PTY_FILENO && fd != dirfd (dir))
+ -           __close_nocancel_nostatus (fd);
+ -       }
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <errno.h>
+ +#include <stdlib.h>
+ +#include <sys/ioctl.h>
+ +#include <termios.h>
+  
+ -      __closedir (dir);
+ +int
+ +grantpt (int fd)
+ +{
+ +  /* Without pt_chown on Linux, we have delegated the creation of the
+ +     pty node with the right group and permission mode to the kernel, and
+ +     non-root users are unlikely to be able to change it. Therefore let's
+ +     consider that POSIX enforcement is the responsibility of the whole
+ +     system and not only the GNU libc.   */
+  
+ -      int nullfd = __open_nocancel (_PATH_DEVNULL, O_RDONLY);
+ -      assert (nullfd == STDIN_FILENO);
+ -      nullfd = __open_nocancel (_PATH_DEVNULL, O_WRONLY);
+ -      assert (nullfd == STDOUT_FILENO);
+ -      __dup2 (STDOUT_FILENO, STDERR_FILENO);
+ -    }
+ +  /* Verify that fd refers to a ptmx descriptor.  */
+ +  unsigned int ptyno;
+ +  int ret = __ioctl (fd, TIOCGPTN, &ptyno);
+ +  if (ret != 0 && errno == ENOTTY)
+ +    /* POSIX requires EINVAL instead of ENOTTY provided by the kernel.  */
+ +    __set_errno (EINVAL);
+ +  return ret;
+  }
+ -# define CLOSE_ALL_FDS() close_all_fds()
+ -#endif
+ -
+ -#include <sysdeps/unix/grantpt.c>
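
Together with the getpt.c change above, the caller-visible flow stays the standard one; a minimal usage sketch (plain POSIX API): posix_openpt opens /dev/ptmx, grantpt now only validates the descriptor via TIOCGPTN (mapping the kernel's ENOTTY to the POSIX-required EINVAL), and unlockpt/ptsname finish the setup.

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main (void)
    {
      int master = posix_openpt (O_RDWR | O_NOCTTY);   /* opens /dev/ptmx */
      if (master < 0
          || grantpt (master) != 0      /* on Linux: a cheap validity check */
          || unlockpt (master) != 0)
        {
          perror ("pty setup");
          return EXIT_FAILURE;
        }
      printf ("slave side: %s\n", ptsname (master));
      return 0;
    }
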
+ diff --git a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
+ index 9d8ffbe860..bf61b66b70 100644
+ --- a/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
+ +++ b/sysdeps/unix/sysv/linux/hppa/atomic-machine.h
+ @@ -36,9 +36,37 @@ typedef uintptr_t uatomicptr_t;
+  typedef intmax_t atomic_max_t;
+  typedef uintmax_t uatomic_max_t;
+  
+ +#define atomic_full_barrier() __sync_synchronize ()
+ +
+  #define __HAVE_64B_ATOMICS 0
+  #define USE_ATOMIC_COMPILER_BUILTINS 0
+  
+ +/* We use the compiler atomic load and store builtins as the generic
+ +   defines are not atomic.  In particular, we need to use compare and
+ +   exchange for stores as the implementation is synthesized.  */
+ +void __atomic_link_error (void);
+ +#define __atomic_check_size_ls(mem) \
+ + if ((sizeof (*mem) != 1) && (sizeof (*mem) != 2) && sizeof (*mem) != 4)    \
+ +   __atomic_link_error ();
+ +
+ +#define atomic_load_relaxed(mem) \
+ + ({ __atomic_check_size_ls((mem));                                           \
+ +    __atomic_load_n ((mem), __ATOMIC_RELAXED); })
+ +#define atomic_load_acquire(mem) \
+ + ({ __atomic_check_size_ls((mem));                                           \
+ +    __atomic_load_n ((mem), __ATOMIC_ACQUIRE); })
+ +
+ +#define atomic_store_relaxed(mem, val) \
+ + do {                                                                        \
+ +   __atomic_check_size_ls((mem));                                            \
+ +   __atomic_store_n ((mem), (val), __ATOMIC_RELAXED);                        \
+ + } while (0)
+ +#define atomic_store_release(mem, val) \
+ + do {                                                                        \
+ +   __atomic_check_size_ls((mem));                                            \
+ +   __atomic_store_n ((mem), (val), __ATOMIC_RELEASE);                        \
+ + } while (0)
+ +
+  /* XXX Is this actually correct?  */
+  #define ATOMIC_EXCHANGE_USES_CAS 1
+  
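
The hppa macros above reroute relaxed/acquire loads and relaxed/release stores through the compiler builtins, with a link-time check rejecting sizes the target cannot handle atomically. The same idiom in generic form (a sketch using the GCC __atomic builtins; the names are made up):

    /* Never defined: referencing it for an unsupported size becomes a
       link error instead of silently non-atomic code.  The call is
       optimized away whenever sizeof (*mem) is an accepted size.  */
    void atomic_size_error (void);

    #define CHECK_SIZE(mem)                                               \
      do                                                                  \
        {                                                                 \
          if (sizeof (*(mem)) != 1 && sizeof (*(mem)) != 2                \
              && sizeof (*(mem)) != 4)                                    \
            atomic_size_error ();                                         \
        }                                                                 \
      while (0)

    #define LOAD_ACQUIRE(mem) \
      ({ CHECK_SIZE (mem); __atomic_load_n ((mem), __ATOMIC_ACQUIRE); })

    #define STORE_RELEASE(mem, val)                                       \
      do                                                                  \
        {                                                                 \
          CHECK_SIZE (mem);                                               \
          __atomic_store_n ((mem), (val), __ATOMIC_RELEASE);              \
        }                                                                 \
      while (0)
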
+ diff --git a/sysdeps/unix/sysv/linux/microblaze/sysdep.h b/sysdeps/unix/sysv/linux/microblaze/sysdep.h
+ index ed873d9dd4..796663a23a 100644
+ --- a/sysdeps/unix/sysv/linux/microblaze/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/microblaze/sysdep.h
+ @@ -209,8 +209,8 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall0(name,dummy)                                          \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r12)                                                            \
+ @@ -219,9 +219,10 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall1(name,arg1)                                           \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r12)                                                 \
+ @@ -230,10 +231,12 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall2(name,arg1,arg2)                                      \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ -    register long __r6 __asm__("r6") = (long)(arg2);                          \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    long int __arg2 = (long int) (arg2);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+ +    register long int __r6 __asm__("r6") = __arg2;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r6), "r"(__r12)                                      \
+ @@ -243,11 +246,14 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall3(name,arg1,arg2,arg3)                                 \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ -    register long __r6 __asm__("r6") = (long)(arg2);                          \
+ -    register long __r7 __asm__("r7") = (long)(arg3);                          \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    long int __arg2 = (long int) (arg2);                                      \
+ +    long int __arg3 = (long int) (arg3);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+ +    register long int __r6 __asm__("r6") = __arg2;                            \
+ +    register long int __r7 __asm__("r7") = __arg3;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r12)                           \
+ @@ -257,12 +263,16 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall4(name,arg1,arg2,arg3,arg4)                            \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ -    register long __r6 __asm__("r6") = (long)(arg2);                          \
+ -    register long __r7 __asm__("r7") = (long)(arg3);                          \
+ -    register long __r8 __asm__("r8") = (long)(arg4);                          \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    long int __arg2 = (long int) (arg2);                                      \
+ +    long int __arg3 = (long int) (arg3);                                      \
+ +    long int __arg4 = (long int) (arg4);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+ +    register long int __r6 __asm__("r6") = __arg2;                            \
+ +    register long int __r7 __asm__("r7") = __arg3;                            \
+ +    register long int __r8 __asm__("r8") = __arg4;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r12)                 \
+ @@ -272,13 +282,18 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall5(name,arg1,arg2,arg3,arg4,arg5)                       \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ -    register long __r6 __asm__("r6") = (long)(arg2);                          \
+ -    register long __r7 __asm__("r7") = (long)(arg3);                          \
+ -    register long __r8 __asm__("r8") = (long)(arg4);                          \
+ -    register long __r9 __asm__("r9") = (long)(arg5);                          \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    long int __arg2 = (long int) (arg2);                                      \
+ +    long int __arg3 = (long int) (arg3);                                      \
+ +    long int __arg4 = (long int) (arg4);                                      \
+ +    long int __arg5 = (long int) (arg5);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+ +    register long int __r6 __asm__("r6") = __arg2;                            \
+ +    register long int __r7 __asm__("r7") = __arg3;                            \
+ +    register long int __r8 __asm__("r8") = __arg4;                            \
+ +    register long int __r9 __asm__("r9") = __arg5;                            \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r9), "r"(__r12)      \
+ @@ -288,14 +303,20 @@ SYSCALL_ERROR_LABEL_DCL:                            \
+  
+  # define inline_syscall6(name,arg1,arg2,arg3,arg4,arg5,arg6)                  \
+    ({                                                                          \
+ -    register long __ret __asm__("r3");                                        \
+ -    register long __r12 __asm__("r12") = name;                                \
+ -    register long __r5 __asm__("r5") = (long)(arg1);                          \
+ -    register long __r6 __asm__("r6") = (long)(arg2);                          \
+ -    register long __r7 __asm__("r7") = (long)(arg3);                          \
+ -    register long __r8 __asm__("r8") = (long)(arg4);                          \
+ -    register long __r9 __asm__("r9") = (long)(arg5);                          \
+ -    register long __r10 __asm__("r10") = (long)(arg6);                        \
+ +    long int __arg1 = (long int) (arg1);                                      \
+ +    long int __arg2 = (long int) (arg2);                                      \
+ +    long int __arg3 = (long int) (arg3);                                      \
+ +    long int __arg4 = (long int) (arg4);                                      \
+ +    long int __arg5 = (long int) (arg5);                                      \
+ +    long int __arg6 = (long int) (arg6);                                      \
+ +    register long int __ret __asm__("r3");                                    \
+ +    register long int __r12 __asm__("r12") = name;                            \
+ +    register long int __r5 __asm__("r5") = __arg1;                            \
+ +    register long int __r6 __asm__("r6") = __arg2;                            \
+ +    register long int __r7 __asm__("r7") = __arg3;                            \
+ +    register long int __r8 __asm__("r8") = __arg4;                            \
+ +    register long int __r9 __asm__("r9") = __arg5;                            \
+ +    register long int __r10 __asm__("r10") = __arg6;                          \
+      __asm__ __volatile__( "brki r14,8; nop;"                                  \
+        : "=r"(__ret)                                                           \
+        : "r"(__r5), "r"(__r6), "r"(__r7), "r"(__r8),"r"(__r9), "r"(__r10),     \
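
The recurring change in these syscall wrappers (here and in the MIPS internal_syscall hunks shown earlier) is to evaluate every argument into an ordinary local before binding the explicit register variables. The reason, in sketch form with a hypothetical two-argument wrapper (asm body elided): if an argument expression itself calls a function or another syscall, evaluating it after a register variable is already live can clobber that register.

    /* Fragile: computing arg2 after r5 is initialised may clobber r5.  */
    #define SYSCALL2_FRAGILE(nr, arg1, arg2)                        \
      ({                                                            \
        register long int r5 __asm__ ("r5") = (long int) (arg1);    \
        register long int r6 __asm__ ("r6") = (long int) (arg2);    \
        /* ... asm consuming r5 and r6 ... */                       \
      })

    /* Safer: force both expressions to be evaluated first, then move
       the finished values into the fixed registers.  */
    #define SYSCALL2_SAFE(nr, arg1, arg2)                           \
      ({                                                            \
        long int a1 = (long int) (arg1);                            \
        long int a2 = (long int) (arg2);                            \
        register long int r5 __asm__ ("r5") = a1;                   \
        register long int r6 __asm__ ("r6") = a2;                   \
        /* ... asm consuming r5 and r6 ... */                       \
      })
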
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
+ index b2bbf10181..ff445a5406 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall5.S
+ @@ -22,9 +22,9 @@
+       .text
+       .set    nomips16
+  
+ -/* long long __mips_syscall5 (long arg1, long arg2, long arg3, long arg4,
+ -                           long arg5,
+ -                           long number)  */
+ +/* long long int __mips_syscall5 (long int arg1, long int arg2, long int arg3,
+ +                               long int arg4, long int arg5,
+ +                               long int number)  */
+  
+  ENTRY(__mips_syscall5)
+       lw      v0, 20(sp)
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
+ index 572d7c1137..2b4a3117d1 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall6.S
+ @@ -22,9 +22,9 @@
+       .text
+       .set    nomips16
+  
+ -/* long long __mips_syscall6 (long arg1, long arg2, long arg3, long arg4,
+ -                           long arg5, long arg6,
+ -                           long number)  */
+ +/* long long int __mips_syscall6 (long int arg1, long int arg2, long int arg3,
+ +                               long int arg4, long int arg5, long int arg6,
+ +                               long int number)  */
+  
+  ENTRY(__mips_syscall6)
+       lw      v0, 24(sp)
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
+ index 05164cb253..2723bbb138 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips-syscall7.S
+ @@ -22,9 +22,10 @@
+       .text
+       .set    nomips16
+  
+ -/* long long __mips_syscall7 (long arg1, long arg2, long arg3, long arg4,
+ -                           long arg5, long arg6, long arg7,
+ -                           long number)  */
+ +/* long long int __mips_syscall7 (long int arg1, long int arg2, long int arg3,
+ +                               long int arg4, long int arg5, long int arg6,
+ +                               long int arg7,
+ +                               long int number)  */
+  
+  ENTRY(__mips_syscall7)
+       lw      v0, 28(sp)
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
+ index 9bf551ace8..f23ede0259 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall.h
+ @@ -19,51 +19,57 @@
+  #ifndef MIPS16_SYSCALL_H
+  #define MIPS16_SYSCALL_H 1
+  
+ -long long __nomips16 __mips16_syscall0 (long number);
+ +long long int __nomips16 __mips16_syscall0 (long int number);
+  #define __mips16_syscall0(dummy, number)                             \
+ -     __mips16_syscall0 ((long) (number))
+ +     __mips16_syscall0 ((long int) (number))
+  
+ -long long __nomips16 __mips16_syscall1 (long a0,
+ -                                     long number);
+ +long long int __nomips16 __mips16_syscall1 (long int a0,
+ +                                         long int number);
+  #define __mips16_syscall1(a0, number)                                        \
+ -     __mips16_syscall1 ((long) (a0),                                 \
+ -                        (long) (number))
+ +     __mips16_syscall1 ((long int) (a0),                             \
+ +                        (long int) (number))
+  
+ -long long __nomips16 __mips16_syscall2 (long a0, long a1,
+ -                                     long number);
+ +long long int __nomips16 __mips16_syscall2 (long int a0, long int a1,
+ +                                         long int number);
+  #define __mips16_syscall2(a0, a1, number)                            \
+ -     __mips16_syscall2 ((long) (a0), (long) (a1),                    \
+ -                        (long) (number))
+ +     __mips16_syscall2 ((long int) (a0), (long int) (a1),            \
+ +                        (long int) (number))
+  
+ -long long __nomips16 __mips16_syscall3 (long a0, long a1, long a2,
+ -                                     long number);
+ +long long int __nomips16 __mips16_syscall3 (long int a0, long int a1,
+ +                                         long int a2,
+ +                                         long int number);
+  #define __mips16_syscall3(a0, a1, a2, number)                                \
+ -     __mips16_syscall3 ((long) (a0), (long) (a1), (long) (a2),       \
+ -                        (long) (number))
+ +     __mips16_syscall3 ((long int) (a0), (long int) (a1),            \
+ +                        (long int) (a2),                             \
+ +                        (long int) (number))
+  
+ -long long __nomips16 __mips16_syscall4 (long a0, long a1, long a2, long a3,
+ -                                     long number);
+ +long long int __nomips16 __mips16_syscall4 (long int a0, long int a1,
+ +                                         long int a2, long int a3,
+ +                                         long int number);
+  #define __mips16_syscall4(a0, a1, a2, a3, number)                    \
+ -     __mips16_syscall4 ((long) (a0), (long) (a1), (long) (a2),       \
+ -                        (long) (a3),                                 \
+ -                        (long) (number))
+ +     __mips16_syscall4 ((long int) (a0), (long int) (a1),            \
+ +                        (long int) (a2), (long int) (a3),            \
+ +                        (long int) (number))
+  
+  /* The remaining ones use regular MIPS wrappers.  */
+  
+  #define __mips16_syscall5(a0, a1, a2, a3, a4, number)                        \
+ -     __mips_syscall5 ((long) (a0), (long) (a1), (long) (a2),         \
+ -                      (long) (a3), (long) (a4),                      \
+ -                      (long) (number))
+ +     __mips_syscall5 ((long int) (a0), (long int) (a1),              \
+ +                      (long int) (a2), (long int) (a3),              \
+ +                      (long int) (a4),                               \
+ +                      (long int) (number))
+  
+  #define __mips16_syscall6(a0, a1, a2, a3, a4, a5, number)            \
+ -     __mips_syscall6 ((long) (a0), (long) (a1), (long) (a2),         \
+ -                      (long) (a3), (long) (a4), (long) (a5),         \
+ -                      (long) (number))
+ +     __mips_syscall6 ((long int) (a0), (long int) (a1),              \
+ +                      (long int) (a2), (long int) (a3),              \
+ +                      (long int) (a4), (long int) (a5),              \
+ +                      (long int) (number))
+  
+  #define __mips16_syscall7(a0, a1, a2, a3, a4, a5, a6, number)                \
+ -     __mips_syscall7 ((long) (a0), (long) (a1), (long) (a2),         \
+ -                      (long) (a3), (long) (a4), (long) (a5),         \
+ -                      (long) (a6),                                   \
+ -                      (long) (number))
+ +     __mips_syscall7 ((long int) (a0), (long int) (a1),              \
+ +                      (long int) (a2), (long int) (a3),              \
+ +                      (long int) (a4), (long int) (a5),              \
+ +                      (long int) (a6),                               \
+ +                      (long int) (number))
+  
+  #endif
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
+ index 92f16e2724..43c05f8050 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall0.c
+ @@ -20,8 +20,8 @@
+  
+  #undef __mips16_syscall0
+  
+ -long long __nomips16
+ -__mips16_syscall0 (long number)
+ +long long int __nomips16
+ +__mips16_syscall0 (long int number)
+  {
+    union __mips_syscall_return ret;
+    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 0);
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
+ index fa985a96e5..16a567e834 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall1.c
+ @@ -20,9 +20,9 @@
+  
+  #undef __mips16_syscall1
+  
+ -long long __nomips16
+ -__mips16_syscall1 (long a0,
+ -                long number)
+ +long long int __nomips16
+ +__mips16_syscall1 (long int a0,
+ +                long int number)
+  {
+    union __mips_syscall_return ret;
+    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 1,
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
+ index f042ac815d..c0a856c344 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall2.c
+ @@ -20,9 +20,9 @@
+  
+  #undef __mips16_syscall2
+  
+ -long long __nomips16
+ -__mips16_syscall2 (long a0, long a1,
+ -                long number)
+ +long long int __nomips16
+ +__mips16_syscall2 (long int a0, long int a1,
+ +                long int number)
+  {
+    union __mips_syscall_return ret;
+    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 2,
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
+ index dfe2f7feb5..042768ebf2 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall3.c
+ @@ -20,9 +20,9 @@
+  
+  #undef __mips16_syscall3
+  
+ -long long __nomips16
+ -__mips16_syscall3 (long a0, long a1, long a2,
+ -                long number)
+ +long long int __nomips16
+ +__mips16_syscall3 (long int a0, long int a1, long int a2,
+ +                long int number)
+  {
+    union __mips_syscall_return ret;
+    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 3,
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
+ index 39de510357..8658d822ab 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/mips16/mips16-syscall4.c
+ @@ -20,9 +20,9 @@
+  
+  #undef __mips16_syscall4
+  
+ -long long __nomips16
+ -__mips16_syscall4 (long a0, long a1, long a2, long a3,
+ -                long number)
+ +long long int __nomips16
+ +__mips16_syscall4 (long int a0, long int a1, long int a2, long int a3,
+ +                long int number)
+  {
+    union __mips_syscall_return ret;
+    ret.reg.v0 = INTERNAL_SYSCALL_MIPS16 (number, ret.reg.v1, 4,
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
+ index beefcf284b..0c6a83e9b3 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
+ @@ -52,7 +52,7 @@
+  #undef INLINE_SYSCALL
+  #define INLINE_SYSCALL(name, nr, args...)                               \
+    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
+ -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
+ +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
+       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
+         {                                                             \
+        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
+ @@ -61,10 +61,10 @@
+       result_var; })
+  
+  #undef INTERNAL_SYSCALL_DECL
+ -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
+ +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
+  
+  #undef INTERNAL_SYSCALL_ERROR_P
+ -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
+ +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+  
+  #undef INTERNAL_SYSCALL_ERRNO
+  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
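The INLINE_SYSCALL and INTERNAL_SYSCALL_* macros above encode the o32 error convention: the kernel hands back an error flag in a separate register (read as err) and, when that flag is set, the result register carries a positive errno value. A minimal stand-alone sketch of that convention; mock_result, mock_inline_syscall and the sample values are hypothetical names used only for illustration:

#include <errno.h>
#include <stdio.h>

/* Hypothetical two-register result: v0 carries the value, a3 the error flag.  */
struct mock_result { long int v0; long int a3; };

static long int
mock_inline_syscall (struct mock_result r)
{
  long int err = r.a3;                     /* INTERNAL_SYSCALL_DECL / err out  */
  long int result_var = r.v0;              /* value returned by the syscall    */
  if ((void) result_var, (long int) err)   /* INTERNAL_SYSCALL_ERROR_P         */
    {
      errno = (int) result_var;            /* INTERNAL_SYSCALL_ERRNO           */
      result_var = -1L;
    }
  return result_var;
}

int
main (void)
{
  struct mock_result ok   = { 3, 0 };      /* e.g. a new file descriptor       */
  struct mock_result fail = { ENOSYS, 1 }; /* error flag set, errno in v0      */
  printf ("%ld\n", mock_inline_syscall (ok));
  printf ("%ld errno=%d\n", mock_inline_syscall (fail), errno);
  return 0;
}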
+ @@ -103,11 +103,11 @@
+  
+  union __mips_syscall_return
+    {
+ -    long long val;
+ +    long long int val;
+      struct
+        {
+ -     long v0;
+ -     long v1;
+ +     long int v0;
+ +     long int v1;
+        }
+      reg;
+    };
+ @@ -152,13 +152,13 @@ union __mips_syscall_return
+  
+  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -175,14 +175,15 @@ union __mips_syscall_return
+  
+  #define internal_syscall1(v0_init, input, number, err, arg1)         \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -199,15 +200,17 @@ union __mips_syscall_return
+  
+  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -225,16 +228,19 @@ union __mips_syscall_return
+  #define internal_syscall3(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3)                             \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -252,16 +258,20 @@ union __mips_syscall_return
+  #define internal_syscall4(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4)                       \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7") = (long) (arg4);                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7") = _arg4;                      \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -285,63 +295,66 @@ union __mips_syscall_return
+     compiler specifics required for the stack arguments to be pushed,
+     which would be the case if these syscalls were inlined.  */
+  
+ -long long __nomips16 __mips_syscall5 (long arg1, long arg2, long arg3,
+ -                                   long arg4, long arg5,
+ -                                   long number);
+ +long long int __nomips16 __mips_syscall5 (long int arg1, long int arg2,
+ +                                       long int arg3, long int arg4,
+ +                                       long int arg5,
+ +                                       long int number);
+  libc_hidden_proto (__mips_syscall5, nomips16)
+  
+  #define internal_syscall5(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5)                 \
+  ({                                                                   \
+       union __mips_syscall_return _sc_ret;                            \
+ -     _sc_ret.val = __mips_syscall5 ((long) (arg1),                   \
+ -                                    (long) (arg2),                   \
+ -                                    (long) (arg3),                   \
+ -                                    (long) (arg4),                   \
+ -                                    (long) (arg5),                   \
+ -                                    (long) (number));                \
+ +     _sc_ret.val = __mips_syscall5 ((long int) (arg1),               \
+ +                                    (long int) (arg2),               \
+ +                                    (long int) (arg3),               \
+ +                                    (long int) (arg4),               \
+ +                                    (long int) (arg5),               \
+ +                                    (long int) (number));            \
+       err = _sc_ret.reg.v1;                                           \
+       _sc_ret.reg.v0;                                                 \
+  })
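Because o32 passes at most four arguments in registers, __mips_syscall5/6/7 stay out of line and return both the result (v0) and the error flag (v1) packed into one 64-bit value, which internal_syscall5 above unpacks through union __mips_syscall_return. A self-contained sketch of that round trip with a mock syscall function; mock_return, mock_mips_syscall5 and the argument values are invented for illustration:

#include <stdint.h>
#include <stdio.h>

/* Same shape as union __mips_syscall_return, but with explicit 32/64-bit
   types so the sketch behaves the same on any host.  */
union mock_return
{
  int64_t val;
  struct { int32_t v0; int32_t v1; } reg;
};

/* Stand-in for __mips_syscall5: result and error flag packed into one
   64-bit return value.  */
static int64_t
mock_mips_syscall5 (long int a1, long int a2, long int a3,
                    long int a4, long int a5, long int number)
{
  union mock_return r;
  r.reg.v0 = (int32_t) (a1 + a2 + a3 + a4 + a5);  /* pretend result       */
  r.reg.v1 = 0;                                   /* 0: no error          */
  (void) number;
  return r.val;
}

int
main (void)
{
  union mock_return sc_ret;
  sc_ret.val = mock_mips_syscall5 (1, 2, 3, 4, 5, 4183 /* made-up number */);
  long int err = sc_ret.reg.v1;       /* becomes the macro's err output    */
  long int result = sc_ret.reg.v0;    /* becomes the macro's value         */
  printf ("result=%ld err=%ld\n", result, err);
  return 0;
}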
+  
+ -long long __nomips16 __mips_syscall6 (long arg1, long arg2, long arg3,
+ -                                   long arg4, long arg5, long arg6,
+ -                                   long number);
+ +long long int __nomips16 __mips_syscall6 (long int arg1, long int arg2,
+ +                                       long int arg3, long int arg4,
+ +                                       long int arg5, long int arg6,
+ +                                       long int number);
+  libc_hidden_proto (__mips_syscall6, nomips16)
+  
+  #define internal_syscall6(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5, arg6)           \
+  ({                                                                   \
+       union __mips_syscall_return _sc_ret;                            \
+ -     _sc_ret.val = __mips_syscall6 ((long) (arg1),                   \
+ -                                    (long) (arg2),                   \
+ -                                    (long) (arg3),                   \
+ -                                    (long) (arg4),                   \
+ -                                    (long) (arg5),                   \
+ -                                    (long) (arg6),                   \
+ -                                    (long) (number));                \
+ +     _sc_ret.val = __mips_syscall6 ((long int) (arg1),               \
+ +                                    (long int) (arg2),               \
+ +                                    (long int) (arg3),               \
+ +                                    (long int) (arg4),               \
+ +                                    (long int) (arg5),               \
+ +                                    (long int) (arg6),               \
+ +                                    (long int) (number));            \
+       err = _sc_ret.reg.v1;                                           \
+       _sc_ret.reg.v0;                                                 \
+  })
+  
+ -long long __nomips16 __mips_syscall7 (long arg1, long arg2, long arg3,
+ -                                   long arg4, long arg5, long arg6,
+ -                                   long arg7,
+ -                                   long number);
+ +long long int __nomips16 __mips_syscall7 (long int arg1, long int arg2,
+ +                                       long int arg3, long int arg4,
+ +                                       long int arg5, long int arg6,
+ +                                       long int arg7,
+ +                                       long int number);
+  libc_hidden_proto (__mips_syscall7, nomips16)
+  
+  #define internal_syscall7(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5, arg6, arg7)     \
+  ({                                                                   \
+       union __mips_syscall_return _sc_ret;                            \
+ -     _sc_ret.val = __mips_syscall7 ((long) (arg1),                   \
+ -                                    (long) (arg2),                   \
+ -                                    (long) (arg3),                   \
+ -                                    (long) (arg4),                   \
+ -                                    (long) (arg5),                   \
+ -                                    (long) (arg6),                   \
+ -                                    (long) (arg7),                   \
+ -                                    (long) (number));                \
+ +     _sc_ret.val = __mips_syscall7 ((long int) (arg1),               \
+ +                                    (long int) (arg2),               \
+ +                                    (long int) (arg3),               \
+ +                                    (long int) (arg4),               \
+ +                                    (long int) (arg5),               \
+ +                                    (long int) (arg6),               \
+ +                                    (long int) (arg7),               \
+ +                                    (long int) (number));            \
+       err = _sc_ret.reg.v1;                                           \
+       _sc_ret.reg.v0;                                                 \
+  })
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
+ index f96636538a..4a9d7054f9 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/sysdep.h
+ @@ -47,14 +47,14 @@
+  
+  /* Convert X to a long long, without losing any bits if it is one
+     already or warning if it is a 32-bit pointer.  */
+ -#define ARGIFY(X) ((long long) (__typeof__ ((X) - (X))) (X))
+ +#define ARGIFY(X) ((long long int) (__typeof__ ((X) - (X))) (X))
+  
+  /* Define a macro which expands into the inline wrapper code for a system
+     call.  */
+  #undef INLINE_SYSCALL
+  #define INLINE_SYSCALL(name, nr, args...)                            \
+    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
+ -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
+ +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
+       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
+         {                                                             \
+        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
+ @@ -63,10 +63,10 @@
+       result_var; })
+  
+  #undef INTERNAL_SYSCALL_DECL
+ -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
+ +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
+  
+  #undef INTERNAL_SYSCALL_ERROR_P
+ -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
+ +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+  
+  #undef INTERNAL_SYSCALL_ERRNO
+  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
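The ARGIFY macro near the top of this file widens each n32 syscall argument to the 64-bit register width; the intermediate cast through __typeof__ ((X) - (X)) means a pointer first goes through ptrdiff_t, so no pointer-to-integer size warning is raised on the ILP32 n32 ABI. A small GNU C sketch of the idiom in isolation, with made-up variable names:

#include <stdio.h>

#define ARGIFY(X) ((long long int) (__typeof__ ((X) - (X))) (X))

int
main (void)
{
  int fd = 3;
  char buf[16];
  long long int a = ARGIFY (fd);    /* integer argument: plain widening       */
  long long int b = ARGIFY (buf);   /* pointer argument: via ptrdiff_t, then  */
                                    /* widened to the full register size      */
  printf ("%lld %#llx\n", a, (unsigned long long int) b);
  return 0;
}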
+ @@ -112,13 +112,13 @@
+  
+  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a3 asm ("$7");                             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a3 asm ("$7");                         \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -135,14 +135,15 @@
+  
+  #define internal_syscall1(v0_init, input, number, err, arg1)         \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a3 asm ("$7");                             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a3 asm ("$7");                         \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -159,15 +160,17 @@
+  
+  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     long long int _arg2 = ARGIFY (arg2);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
+ -     register long long __a3 asm ("$7");                             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a1 asm ("$5") = _arg2;                 \
+ +     register long long int __a3 asm ("$7");                         \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -185,16 +188,19 @@
+  #define internal_syscall3(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3)                             \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     long long int _arg2 = ARGIFY (arg2);                            \
+ +     long long int _arg3 = ARGIFY (arg3);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
+ -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
+ -     register long long __a3 asm ("$7");                             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a1 asm ("$5") = _arg2;                 \
+ +     register long long int __a2 asm ("$6") = _arg3;                 \
+ +     register long long int __a3 asm ("$7");                         \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -212,16 +218,20 @@
+  #define internal_syscall4(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4)                       \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     long long int _arg2 = ARGIFY (arg2);                            \
+ +     long long int _arg3 = ARGIFY (arg3);                            \
+ +     long long int _arg4 = ARGIFY (arg4);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
+ -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
+ -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a1 asm ("$5") = _arg2;                 \
+ +     register long long int __a2 asm ("$6") = _arg3;                 \
+ +     register long long int __a3 asm ("$7") = _arg4;                 \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -239,17 +249,22 @@
+  #define internal_syscall5(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5)                 \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     long long int _arg2 = ARGIFY (arg2);                            \
+ +     long long int _arg3 = ARGIFY (arg3);                            \
+ +     long long int _arg4 = ARGIFY (arg4);                            \
+ +     long long int _arg5 = ARGIFY (arg5);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
+ -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
+ -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
+ -     register long long __a4 asm ("$8") = ARGIFY (arg5);             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a1 asm ("$5") = _arg2;                 \
+ +     register long long int __a2 asm ("$6") = _arg3;                 \
+ +     register long long int __a3 asm ("$7") = _arg4;                 \
+ +     register long long int __a4 asm ("$8") = _arg5;                 \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
        v0_init                                                         \
- @@ -135,14 +135,15 @@
+ @@ -267,18 +282,24 @@
+  #define internal_syscall6(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5, arg6)           \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
+ +     long long int _arg1 = ARGIFY (arg1);                            \
+ +     long long int _arg2 = ARGIFY (arg2);                            \
+ +     long long int _arg3 = ARGIFY (arg3);                            \
+ +     long long int _arg4 = ARGIFY (arg4);                            \
+ +     long long int _arg5 = ARGIFY (arg5);                            \
+ +     long long int _arg6 = ARGIFY (arg6);                            \
+ +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
+         = (number);                                                   \
+ -     register long long __v0 asm ("$2");                             \
+ -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
+ -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
+ -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
+ -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
+ -     register long long __a4 asm ("$8") = ARGIFY (arg5);             \
+ -     register long long __a5 asm ("$9") = ARGIFY (arg6);             \
+ +     register long long int __v0 asm ("$2");                         \
+ +     register long long int __a0 asm ("$4") = _arg1;                 \
+ +     register long long int __a1 asm ("$5") = _arg2;                 \
+ +     register long long int __a2 asm ("$6") = _arg3;                 \
+ +     register long long int __a3 asm ("$7") = _arg4;                 \
+ +     register long long int __a4 asm ("$8") = _arg5;                 \
+ +     register long long int __a5 asm ("$9") = _arg6;                 \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
+ index 9d30291f84..3e1f1cc3c5 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
+ @@ -50,7 +50,7 @@
+  #undef INLINE_SYSCALL
+  #define INLINE_SYSCALL(name, nr, args...)                            \
+    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
+ -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
+ +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
+       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
+         {                                                             \
+        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
+ @@ -59,10 +59,10 @@
+       result_var; })
+  
+  #undef INTERNAL_SYSCALL_DECL
+ -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
+ +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
+  
+  #undef INTERNAL_SYSCALL_ERROR_P
+ -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
+ +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+  
+  #undef INTERNAL_SYSCALL_ERRNO
+  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
+ @@ -108,13 +108,13 @@
+  
+  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -131,14 +131,15 @@
+  
+  #define internal_syscall1(v0_init, input, number, err, arg1)         \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -155,15 +156,17 @@
+  
+  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -181,16 +184,19 @@
+  #define internal_syscall3(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3)                             \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7");                                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7");                              \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -208,16 +214,20 @@
+  #define internal_syscall4(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4)                       \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7") = (long) (arg4);                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7") = _arg4;                      \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -235,17 +245,22 @@
+  #define internal_syscall5(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5)                 \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     long int _arg5 = (long int) (arg5);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7") = (long) (arg4);                  \
+ -     register long __a4 asm ("$8") = (long) (arg5);                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7") = _arg4;                      \
+ +     register long int __a4 asm ("$8") = _arg5;                      \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ @@ -263,18 +278,24 @@
+  #define internal_syscall6(v0_init, input, number, err,                       \
+                         arg1, arg2, arg3, arg4, arg5, arg6)           \
+  ({                                                                   \
+ -     long _sys_result;                                               \
+ +     long int _sys_result;                                           \
+                                                                       \
+       {                                                               \
+ -     register long __s0 asm ("$16") __attribute__ ((unused))         \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     long int _arg5 = (long int) (arg5);                             \
+ +     long int _arg6 = (long int) (arg6);                             \
+ +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
+         = (number);                                                   \
+ -     register long __v0 asm ("$2");                                  \
+ -     register long __a0 asm ("$4") = (long) (arg1);                  \
+ -     register long __a1 asm ("$5") = (long) (arg2);                  \
+ -     register long __a2 asm ("$6") = (long) (arg3);                  \
+ -     register long __a3 asm ("$7") = (long) (arg4);                  \
+ -     register long __a4 asm ("$8") = (long) (arg5);                  \
+ -     register long __a5 asm ("$9") = (long) (arg6);                  \
+ +     register long int __v0 asm ("$2");                              \
+ +     register long int __a0 asm ("$4") = _arg1;                      \
+ +     register long int __a1 asm ("$5") = _arg2;                      \
+ +     register long int __a2 asm ("$6") = _arg3;                      \
+ +     register long int __a3 asm ("$7") = _arg4;                      \
+ +     register long int __a4 asm ("$8") = _arg5;                      \
+ +     register long int __a5 asm ("$9") = _arg6;                      \
+       __asm__ volatile (                                              \
+       ".set\tnoreorder\n\t"                                           \
+       v0_init                                                         \
+ diff --git a/sysdeps/unix/sysv/linux/mips/mips64/syscall.S b/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
+ index 26adf2cd04..a9baff3c17 100644
+ --- a/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
+ +++ b/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
+ @@ -20,7 +20,7 @@
+  #include <sys/asm.h>
+  
+  /* Usage:
+ -   long syscall (syscall_number, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
+ +   long int syscall (syscall_number, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
+  
+     We need to do some arg shifting, syscall_number will be in v0.  */
+  
+ diff --git a/sysdeps/unix/sysv/linux/mips/sysdep.h b/sysdeps/unix/sysv/linux/mips/sysdep.h
+ index cdfc0b1b58..a4cf1540fe 100644
+ --- a/sysdeps/unix/sysv/linux/mips/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/mips/sysdep.h
+ @@ -36,8 +36,8 @@
+     the INTERNAL_SYSCALL_{ERROR_P,ERRNO} macros work correctly.  */
+  #define INTERNAL_VSYSCALL_CALL(funcptr, err, nr, args...)            \
+    ({                                                                 \
+ -    long _ret = funcptr (args);                                              \
+ -    err = ((unsigned long) (_ret) >= (unsigned long) -4095L);                \
+ +    long int _ret = funcptr (args);                                  \
+ +    err = ((unsigned long int) (_ret) >= (unsigned long int) -4095L);        \
+      if (err)                                                         \
+        _ret = -_ret;                                                  \
+      _ret;                                                            \
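INTERNAL_VSYSCALL_CALL above classifies a raw kernel-style return value: anything in the last 4095 values of the unsigned range, i.e. [-4095, -1] as a signed value, encodes a negated errno, and everything else is a genuine result. A stand-alone sketch of just that test; is_error and the sample values are made up for illustration:

#include <stdio.h>

/* Raw return values in [-4095, -1] encode -errno; anything else is success.  */
static int
is_error (long int ret)
{
  return (unsigned long int) ret >= (unsigned long int) -4095L;
}

int
main (void)
{
  printf ("%d\n", is_error (0));       /* 0: ordinary result                   */
  printf ("%d\n", is_error (-22L));    /* 1: would become errno == 22 (EINVAL) */
  printf ("%d\n", is_error (-5000L));  /* 0: outside the errno window          */
  return 0;
}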
+ diff --git a/sysdeps/unix/sysv/linux/mips/unwind-arch.h b/sysdeps/unix/sysv/linux/mips/unwind-arch.h
+ new file mode 100644
+ index 0000000000..a009899983
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/mips/unwind-arch.h
+ @@ -0,0 +1,67 @@
+ +/* Return backtrace of current program state.  Arch-specific bits.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#ifndef _UNWIND_ARCH_H
+ +#define _UNWIND_ARCH_H
+ +
+ +#include <stdint.h>
+ +
+ +/* MIPS fallback code handles a frame whose FDE cannot be obtained
+ +   (for instance a signal frame) by reading the kernel allocated signal frame
+ +   and adding '2' to the value of 'sc_pc' [1].  The added value is used to
+ +   recognize an end of an EH region on mips16 [2].
+ +
+ +   The idea here is to adjust the obtained signal frame ADDR value, removing
+ +   the value libgcc added, after checking whether the previous frame is a
+ +   signal frame.
+ +
+ +   [1] libgcc/config/mips/linux-unwind.h from gcc code.
+ +   [2] gcc/config/mips/mips.h from gcc code.  */
+ +
+ +static inline void *
+ +unwind_arch_adjustment (void *prev, void *addr)
+ +{
+ +  uint32_t *pc = (uint32_t *) prev;
+ +
+ +  if (pc == NULL)
+ +    return addr;
+ +
+ +  /* For MIPS16 or microMIPS frame libgcc makes no adjustment.  */
+ +  if ((uintptr_t) pc & 0x3)
+ +    return addr;
+ +
+ +  /* The vDSO contains either
+ +
+ +     24021061 li v0, 0x1061 (rt_sigreturn)
+ +     0000000c syscall
+ +     or
+ +     24021017 li v0, 0x1017 (sigreturn)
+ +     0000000c syscall  */
+ +  if (pc[1] != 0x0000000c)
+ +    return addr;
+ +#if _MIPS_SIM == _ABIO32
+ +  if (pc[0] == (0x24020000 | __NR_sigreturn))
+ +    return (void *) ((uintptr_t) addr - 2);
+ +#endif
+ +  if (pc[0] == (0x24020000 | __NR_rt_sigreturn))
+ +    return (void *) ((uintptr_t) addr - 2);
+ +
+ +  return addr;
+ +}
+ +
+ +#endif
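
The new unwind-arch.h hook identifies a vDSO sigreturn trampoline purely by its instruction pattern.  A minimal standalone sketch of the same check, not part of the patch (the helper name is invented, and 0x1061 is the o32 rt_sigreturn number quoted in the comment above):

#include <stdint.h>
#include <stdio.h>

/* Return nonzero if the two words at PC look like the MIPS vDSO
   trampoline "li v0, NR; syscall" for syscall number NR.  */
static int
looks_like_sigreturn_trampoline (const uint32_t pc[2], uint32_t nr)
{
  return pc[1] == 0x0000000c            /* syscall */
         && pc[0] == (0x24020000 | nr); /* li v0, nr */
}

int
main (void)
{
  uint32_t vdso[2] = { 0x24020000 | 0x1061, 0x0000000c };
  printf ("trampoline detected: %d\n",
          looks_like_sigreturn_trampoline (vdso, 0x1061));
  return 0;
}
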
+ diff --git a/sysdeps/unix/sysv/linux/msgctl.c b/sysdeps/unix/sysv/linux/msgctl.c
+ index 27879e76cd..fd46aec1a0 100644
+ --- a/sysdeps/unix/sysv/linux/msgctl.c
+ +++ b/sysdeps/unix/sysv/linux/msgctl.c
+ @@ -21,6 +21,7 @@
+  #include <sysdep.h>
+  #include <shlib-compat.h>
+  #include <errno.h>
+ +#include <linux/posix_types.h>  /* For __kernel_mode_t.  */
+  
+  #ifndef DEFAULT_VERSION
+  # ifndef __ASSUME_SYSVIPC_BROKEN_MODE_T
+ @@ -61,7 +62,6 @@ __new_msgctl (int msqid, int cmd, struct msqid_ds *buf)
+  
+    int ret = msgctl_syscall (msqid, cmd, buf);
+  
+ -#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+    if (ret >= 0)
+      {
+        switch (cmd)
+ @@ -69,10 +69,16 @@ __new_msgctl (int msqid, int cmd, struct msqid_ds *buf)
+       case IPC_STAT:
+       case MSG_STAT:
+       case MSG_STAT_ANY:
+ +#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+         buf->msg_perm.mode >>= 16;
+ +#else
+ +       /* Old Linux kernel versions might not clear the mode padding.  */
+ +       if (sizeof ((struct msqid_ds){0}.msg_perm.mode)
+ +           != sizeof (__kernel_mode_t))
+ +         buf->msg_perm.mode &= 0xFFFF;
+ +#endif
+       }
+      }
+ -#endif
+  
+    return ret;
+  }
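
The same mode-sanitizing pattern is applied to semctl and shmctl further below; from the caller's side nothing changes.  A small illustrative sketch, not part of the patch, of the IPC_STAT path whose result the fix cleans up:

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int
main (void)
{
  int id = msgget (IPC_PRIVATE, IPC_CREAT | 0600);
  if (id < 0)
    { perror ("msgget"); return 1; }

  struct msqid_ds ds;
  if (msgctl (id, IPC_STAT, &ds) == 0)
    /* After the change only the low permission bits should be set here,
       whether or not the kernel clears the mode padding.  */
    printf ("queue permissions: %o\n",
            (unsigned int) (ds.msg_perm.mode & 0777));

  msgctl (id, IPC_RMID, NULL);
  return 0;
}
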
+ diff --git a/sysdeps/unix/sysv/linux/nios2/kernel-features.h b/sysdeps/unix/sysv/linux/nios2/kernel-features.h
+ deleted file mode 100644
+ index d68d114981..0000000000
+ --- a/sysdeps/unix/sysv/linux/nios2/kernel-features.h
+ +++ /dev/null
+ @@ -1,22 +0,0 @@
+ -/* Set flags signalling availability of kernel features based on given
+ -   kernel version number.  NIOS2 version.
+ -   Copyright (C) 2019-2020 Free Software Foundation, Inc.
+ -   This file is part of the GNU C Library.
+ -
+ -   The GNU C Library is free software; you can redistribute it and/or
+ -   modify it under the terms of the GNU Lesser General Public
+ -   License as published by the Free Software Foundation; either
+ -   version 2.1 of the License, or (at your option) any later version.
+ -
+ -   The GNU C Library is distributed in the hope that it will be useful,
+ -   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ -   Lesser General Public License for more details.
+ -
+ -   You should have received a copy of the GNU Lesser General Public
+ -   License along with the GNU C Library; if not, see
+ -   <https://www.gnu.org/licenses/>.  */
+ -
+ -#include_next <kernel-features.h>
+ -
+ -#undef __ASSUME_SYSVIPC_DEFAULT_IPC_64
+ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
+ index 725dfafde8..ffc150851e 100644
+ --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
+ @@ -134,47 +134,47 @@
+  # define LOADARGS_0(name, dummy)                                           \
+       r0 = name
+  # define LOADARGS_1(name, __arg1) \
+ -     long int arg1 = (long int) (__arg1);    \
+ +     long int _arg1 = (long int) (__arg1);   \
+    LOADARGS_0(name, 0);                                          \
+       extern void __illegally_sized_syscall_arg1 (void); \
+       if (__builtin_classify_type (__arg1) != 5 && sizeof (__arg1) > 4) \
+         __illegally_sized_syscall_arg1 (); \
+ -     r3 = arg1
+ +     r3 = _arg1
+  # define LOADARGS_2(name, __arg1, __arg2) \
+ -     long int arg2 = (long int) (__arg2); \
+ +     long int _arg2 = (long int) (__arg2); \
+       LOADARGS_1(name, __arg1); \
+       extern void __illegally_sized_syscall_arg2 (void); \
+       if (__builtin_classify_type (__arg2) != 5 && sizeof (__arg2) > 4) \
+         __illegally_sized_syscall_arg2 (); \
+ -     r4 = arg2
+ +     r4 = _arg2
+  # define LOADARGS_3(name, __arg1, __arg2, __arg3) \
+ -     long int arg3 = (long int) (__arg3); \
+ +     long int _arg3 = (long int) (__arg3); \
+       LOADARGS_2(name, __arg1, __arg2); \
+       extern void __illegally_sized_syscall_arg3 (void); \
+       if (__builtin_classify_type (__arg3) != 5 && sizeof (__arg3) > 4) \
+         __illegally_sized_syscall_arg3 (); \
+ -     r5 = arg3
+ +     r5 = _arg3
+  # define LOADARGS_4(name, __arg1, __arg2, __arg3, __arg4) \
+ -     long int arg4 = (long int) (__arg4); \
+ +     long int _arg4 = (long int) (__arg4); \
+       LOADARGS_3(name, __arg1, __arg2, __arg3); \
+       extern void __illegally_sized_syscall_arg4 (void); \
+       if (__builtin_classify_type (__arg4) != 5 && sizeof (__arg4) > 4) \
+         __illegally_sized_syscall_arg4 (); \
+ -     r6 = arg4
+ +     r6 = _arg4
+  # define LOADARGS_5(name, __arg1, __arg2, __arg3, __arg4, __arg5) \
+ -     long int arg5 = (long int) (__arg5); \
+ +     long int _arg5 = (long int) (__arg5); \
+       LOADARGS_4(name, __arg1, __arg2, __arg3, __arg4); \
+       extern void __illegally_sized_syscall_arg5 (void); \
+       if (__builtin_classify_type (__arg5) != 5 && sizeof (__arg5) > 4) \
+         __illegally_sized_syscall_arg5 (); \
+ -     r7 = arg5
+ +     r7 = _arg5
+  # define LOADARGS_6(name, __arg1, __arg2, __arg3, __arg4, __arg5, __arg6) \
+ -     long int arg6 = (long int) (__arg6); \
+ +     long int _arg6 = (long int) (__arg6); \
+       LOADARGS_5(name, __arg1, __arg2, __arg3, __arg4, __arg5); \
+       extern void __illegally_sized_syscall_arg6 (void); \
+       if (__builtin_classify_type (__arg6) != 5 && sizeof (__arg6) > 4) \
+         __illegally_sized_syscall_arg6 (); \
+ -     r8 = arg6
+ +     r8 = _arg6
+  
+  # define ASM_INPUT_0 "0" (r0)
+  # define ASM_INPUT_1 ASM_INPUT_0, "1" (r3)
+ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
+ index ee7f43653d..8a3f1c43e4 100644
+ --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
+ @@ -139,47 +139,47 @@
+  #define LOADARGS_0(name, dummy) \
+       r0 = name
+  #define LOADARGS_1(name, __arg1) \
+ -     long int arg1 = (long int) (__arg1); \
+ +     long int _arg1 = (long int) (__arg1); \
+       LOADARGS_0(name, 0); \
+       extern void __illegally_sized_syscall_arg1 (void); \
+       if (__builtin_classify_type (__arg1) != 5 && sizeof (__arg1) > 8) \
+         __illegally_sized_syscall_arg1 (); \
+ -     r3 = arg1
+ +     r3 = _arg1
+  #define LOADARGS_2(name, __arg1, __arg2) \
+ -     long int arg2 = (long int) (__arg2); \
+ +     long int _arg2 = (long int) (__arg2); \
+       LOADARGS_1(name, __arg1); \
+       extern void __illegally_sized_syscall_arg2 (void); \
+       if (__builtin_classify_type (__arg2) != 5 && sizeof (__arg2) > 8) \
+         __illegally_sized_syscall_arg2 (); \
+ -     r4 = arg2
+ +     r4 = _arg2
+  #define LOADARGS_3(name, __arg1, __arg2, __arg3) \
+ -     long int arg3 = (long int) (__arg3); \
+ +     long int _arg3 = (long int) (__arg3); \
+       LOADARGS_2(name, __arg1, __arg2); \
+       extern void __illegally_sized_syscall_arg3 (void); \
+       if (__builtin_classify_type (__arg3) != 5 && sizeof (__arg3) > 8) \
+         __illegally_sized_syscall_arg3 (); \
+ -     r5 = arg3
+ +     r5 = _arg3
+  #define LOADARGS_4(name, __arg1, __arg2, __arg3, __arg4) \
+ -     long int arg4 = (long int) (__arg4); \
+ +     long int _arg4 = (long int) (__arg4); \
+       LOADARGS_3(name, __arg1, __arg2, __arg3); \
+       extern void __illegally_sized_syscall_arg4 (void); \
+       if (__builtin_classify_type (__arg4) != 5 && sizeof (__arg4) > 8) \
+         __illegally_sized_syscall_arg4 (); \
+ -     r6 = arg4
+ +     r6 = _arg4
+  #define LOADARGS_5(name, __arg1, __arg2, __arg3, __arg4, __arg5) \
+ -     long int arg5 = (long int) (__arg5); \
+ +     long int _arg5 = (long int) (__arg5); \
+       LOADARGS_4(name, __arg1, __arg2, __arg3, __arg4); \
+       extern void __illegally_sized_syscall_arg5 (void); \
+       if (__builtin_classify_type (__arg5) != 5 && sizeof (__arg5) > 8) \
+         __illegally_sized_syscall_arg5 (); \
+ -     r7 = arg5
+ +     r7 = _arg5
+  #define LOADARGS_6(name, __arg1, __arg2, __arg3, __arg4, __arg5, __arg6) \
+ -     long int arg6 = (long int) (__arg6); \
+ +     long int _arg6 = (long int) (__arg6); \
+       LOADARGS_5(name, __arg1, __arg2, __arg3, __arg4, __arg5); \
+       extern void __illegally_sized_syscall_arg6 (void); \
+       if (__builtin_classify_type (__arg6) != 5 && sizeof (__arg6) > 8) \
+         __illegally_sized_syscall_arg6 (); \
+ -     r8 = arg6
+ +     r8 = _arg6
+  
+  #define ASM_INPUT_0 "0" (r0)
+  #define ASM_INPUT_1 ASM_INPUT_0, "1" (r3)
+ diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/prctl.c
+ new file mode 100644
+ index 0000000000..d5725f14cf
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/prctl.c
+ @@ -0,0 +1,42 @@
+ +/* prctl - Linux specific syscall.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <sysdep.h>
+ +#include <stdarg.h>
+ +#include <sys/prctl.h>
+ +
+ +/* Unconditionally read all potential arguments.  This may pass
+ +   garbage values to the kernel, but avoids the need for teaching
+ +   glibc the argument counts of individual options (including ones
+ +   that are added to the kernel in the future).  */
+ +
+ +int
+ +__prctl (int option, ...)
+ +{
+ +  va_list arg;
+ +  va_start (arg, option);
+ +  unsigned long int arg2 = va_arg (arg, unsigned long int);
+ +  unsigned long int arg3 = va_arg (arg, unsigned long int);
+ +  unsigned long int arg4 = va_arg (arg, unsigned long int);
+ +  unsigned long int arg5 = va_arg (arg, unsigned long int);
+ +  va_end (arg);
+ +  return INLINE_SYSCALL_CALL (prctl, option, arg2, arg3, arg4, arg5);
+ +}
+ +
+ +libc_hidden_def (__prctl)
+ +weak_alias (__prctl, prctl)
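
Because the wrapper always forwards four value slots, existing callers that pass fewer arguments keep working unchanged.  A short usage sketch with standard prctl options, not part of the patch:

#include <stdio.h>
#include <sys/prctl.h>

int
main (void)
{
  /* PR_SET_NAME takes one argument; the wrapper's extra slots are
     simply ignored by the kernel for this option.  */
  if (prctl (PR_SET_NAME, "prctl-demo") != 0)
    perror ("prctl (PR_SET_NAME)");

  char name[16];
  if (prctl (PR_GET_NAME, name) == 0)
    printf ("thread name: %s\n", name);
  return 0;
}
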
+ diff --git a/sysdeps/unix/sysv/linux/process_vm_readv.c b/sysdeps/unix/sysv/linux/process_vm_readv.c
+ new file mode 100644
+ index 0000000000..e1377f7e50
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/process_vm_readv.c
+ @@ -0,0 +1,32 @@
+ +/* process_vm_readv - Linux specific syscall.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <unistd.h>
+ +#include <sysdep.h>
+ +#include <errno.h>
+ +#include <sys/uio.h>
+ +
+ +ssize_t
+ +process_vm_readv (pid_t pid, const struct iovec *local_iov,
+ +               unsigned long int liovcnt,
+ +               const struct iovec *remote_iov,
+ +               unsigned long int riovcnt, unsigned long int flags)
+ +{
+ +  return INLINE_SYSCALL_CALL (process_vm_readv, pid, local_iov,
+ +                           liovcnt, remote_iov, riovcnt, flags);
+ +}
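
With the C wrapper in place, process_vm_readv becomes an ordinary libc call.  A self-contained sketch, not part of the patch, that copies a buffer within the calling process, the simplest case that exercises the new wrapper:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int
main (void)
{
  char src[] = "hello from process_vm_readv";
  char dst[sizeof src] = { 0 };

  struct iovec local = { .iov_base = dst, .iov_len = sizeof dst };
  struct iovec remote = { .iov_base = src, .iov_len = sizeof src };

  ssize_t n = process_vm_readv (getpid (), &local, 1, &remote, 1, 0);
  if (n < 0)
    { perror ("process_vm_readv"); return 1; }

  printf ("copied %zd bytes: %s\n", n, dst);
  return 0;
}
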
+ diff --git a/sysdeps/unix/sysv/linux/process_vm_writev.c b/sysdeps/unix/sysv/linux/process_vm_writev.c
+ new file mode 100644
+ index 0000000000..944ab9b7f1
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/process_vm_writev.c
+ @@ -0,0 +1,32 @@
+ +/* process_vm_writev - Linux specific syscall.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <unistd.h>
+ +#include <sysdep.h>
+ +#include <errno.h>
+ +#include <sys/uio.h>
+ +
+ +ssize_t
+ +process_vm_writev (pid_t pid, const struct iovec *local_iov,
+ +                unsigned long int liovcnt,
+ +                const struct iovec *remote_iov,
+ +                unsigned long int riovcnt, unsigned long int flags)
+ +{
+ +  return INLINE_SYSCALL_CALL (process_vm_writev, pid, local_iov,
+ +                           liovcnt, remote_iov, riovcnt, flags);
+ +}
+ diff --git a/sysdeps/unix/sysv/linux/ptsname.c b/sysdeps/unix/sysv/linux/ptsname.c
+ index 81d9d26f1e..3e9be3f0d4 100644
+ --- a/sysdeps/unix/sysv/linux/ptsname.c
+ +++ b/sysdeps/unix/sysv/linux/ptsname.c
+ @@ -21,39 +21,14 @@
+  #include <stdlib.h>
+  #include <string.h>
+  #include <sys/ioctl.h>
+ -#include <sys/stat.h>
+ -#include <sys/sysmacros.h>
+  #include <termios.h>
+  #include <unistd.h>
+  
+  #include <_itoa.h>
+  
+ -/* Check if DEV corresponds to a master pseudo terminal device.  */
+ -#define MASTER_P(Dev)                                                        \
+ -  (__gnu_dev_major ((Dev)) == 2                                              \
+ -   || (__gnu_dev_major ((Dev)) == 4                                  \
+ -       && __gnu_dev_minor ((Dev)) >= 128 && __gnu_dev_minor ((Dev)) < 192) \
+ -   || (__gnu_dev_major ((Dev)) >= 128 && __gnu_dev_major ((Dev)) < 136))
+ -
+ -/* Check if DEV corresponds to a slave pseudo terminal device.  */
+ -#define SLAVE_P(Dev)                                                 \
+ -  (__gnu_dev_major ((Dev)) == 3                                              \
+ -   || (__gnu_dev_major ((Dev)) == 4                                  \
+ -       && __gnu_dev_minor ((Dev)) >= 192 && __gnu_dev_minor ((Dev)) < 256) \
+ -   || (__gnu_dev_major ((Dev)) >= 136 && __gnu_dev_major ((Dev)) < 144))
+ -
+ -/* Note that major number 4 corresponds to the old BSD style pseudo
+ -   terminal devices.  As of Linux 2.1.115 these are no longer
+ -   supported.  They have been replaced by major numbers 2 (masters)
+ -   and 3 (slaves).  */
+ -
+  /* Directory where we can find the slave pty nodes.  */
+  #define _PATH_DEVPTS "/dev/pts/"
+  
+ -/* The are declared in getpt.c.  */
+ -extern const char __libc_ptyname1[] attribute_hidden;
+ -extern const char __libc_ptyname2[] attribute_hidden;
+ -
+  /* Static buffer for `ptsname'.  */
+  static char buffer[sizeof (_PATH_DEVPTS) + 20];
+  
+ @@ -68,19 +43,15 @@ ptsname (int fd)
+  }
+  
+  
+ +/* Store at most BUFLEN characters of the pathname of the slave pseudo
+ +   terminal associated with the master FD is open on in BUF.
+ +   Return 0 on success, otherwise an error number.  */
+  int
+ -__ptsname_internal (int fd, char *buf, size_t buflen, struct stat64 *stp)
+ +__ptsname_r (int fd, char *buf, size_t buflen)
+  {
+    int save_errno = errno;
+    unsigned int ptyno;
+  
+ -  if (!__isatty (fd))
+ -    {
+ -      __set_errno (ENOTTY);
+ -      return ENOTTY;
+ -    }
+ -
+ -#ifdef TIOCGPTN
+    if (__ioctl (fd, TIOCGPTN, &ptyno) == 0)
+      {
+        /* Buffer we use to print the number in.  For a maximum size for
+ @@ -101,67 +72,11 @@ __ptsname_internal (int fd, char *buf, size_t buflen, struct stat64 *stp)
+  
+        memcpy (__stpcpy (buf, devpts), p, &numbuf[sizeof (numbuf)] - p);
+      }
+ -  else if (errno != EINVAL)
+ -    return errno;
+    else
+ -#endif
+ -    {
+ -      char *p;
+ -
+ -      if (buflen < strlen (_PATH_TTY) + 3)
+ -     {
+ -       __set_errno (ERANGE);
+ -       return ERANGE;
+ -     }
+ -
+ -      if (__fxstat64 (_STAT_VER, fd, stp) < 0)
+ -     return errno;
+ -
+ -      /* Check if FD really is a master pseudo terminal.  */
+ -      if (! MASTER_P (stp->st_rdev))
+ -     {
+ -       __set_errno (ENOTTY);
+ -       return ENOTTY;
+ -     }
+ -
+ -      ptyno = __gnu_dev_minor (stp->st_rdev);
+ -
+ -      if (ptyno / 16 >= strlen (__libc_ptyname1))
+ -     {
+ -       __set_errno (ENOTTY);
+ -       return ENOTTY;
+ -     }
+ -
+ -      p = __stpcpy (buf, _PATH_TTY);
+ -      p[0] = __libc_ptyname1[ptyno / 16];
+ -      p[1] = __libc_ptyname2[ptyno % 16];
+ -      p[2] = '\0';
+ -    }
+ -
+ -  if (__xstat64 (_STAT_VER, buf, stp) < 0)
+ +    /* Bad file descriptor, or not a ptmx descriptor.  */
+      return errno;
+  
+ -  /* Check if the name we're about to return really corresponds to a
+ -     slave pseudo terminal.  */
+ -  if (! S_ISCHR (stp->st_mode) || ! SLAVE_P (stp->st_rdev))
+ -    {
+ -      /* This really is a configuration problem.  */
+ -      __set_errno (ENOTTY);
+ -      return ENOTTY;
+ -    }
+ -
+    __set_errno (save_errno);
+    return 0;
+  }
+ -
+ -
+ -/* Store at most BUFLEN characters of the pathname of the slave pseudo
+ -   terminal associated with the master FD is open on in BUF.
+ -   Return 0 on success, otherwise an error number.  */
+ -int
+ -__ptsname_r (int fd, char *buf, size_t buflen)
+ -{
+ -  struct stat64 st;
+ -  return __ptsname_internal (fd, buf, buflen, &st);
+ -}
+  weak_alias (__ptsname_r, ptsname_r)
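
After this simplification the TIOCGPTN ioctl is the only lookup path, but the public interface is unchanged.  A short usage sketch, not part of the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main (void)
{
  int master = posix_openpt (O_RDWR | O_NOCTTY);
  if (master < 0)
    { perror ("posix_openpt"); return 1; }

  char name[64];
  int err = ptsname_r (master, name, sizeof name);
  if (err != 0)
    fprintf (stderr, "ptsname_r: %s\n", strerror (err));
  else
    printf ("slave pty: %s\n", name);

  close (master);
  return 0;
}
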
+ diff --git a/sysdeps/unix/sysv/linux/riscv/sysdep.h b/sysdeps/unix/sysv/linux/riscv/sysdep.h
+ index 201bf9a91b..2bd9b16f32 100644
+ --- a/sysdeps/unix/sysv/linux/riscv/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/riscv/sysdep.h
+ @@ -176,10 +176,11 @@
+  # define internal_syscall1(number, err, arg0)                                \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -193,11 +194,13 @@
+  # define internal_syscall2(number, err, arg0, arg1)                  \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -211,12 +214,15 @@
+  # define internal_syscall3(number, err, arg0, arg1, arg2)                    \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ -     register long int __a2 asm ("a2") = (long int) (arg2);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+ +     register long int __a2 asm ("a2") = _arg2;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -230,13 +236,17 @@
+  # define internal_syscall4(number, err, arg0, arg1, arg2, arg3)        \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ -     register long int __a2 asm ("a2") = (long int) (arg2);          \
+ -     register long int __a3 asm ("a3") = (long int) (arg3);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+ +     register long int __a2 asm ("a2") = _arg2;                      \
+ +     register long int __a3 asm ("a3") = _arg3;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -250,14 +260,19 @@
+  # define internal_syscall5(number, err, arg0, arg1, arg2, arg3, arg4)   \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ -     register long int __a2 asm ("a2") = (long int) (arg2);          \
+ -     register long int __a3 asm ("a3") = (long int) (arg3);          \
+ -     register long int __a4 asm ("a4") = (long int) (arg4);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+ +     register long int __a2 asm ("a2") = _arg2;                      \
+ +     register long int __a3 asm ("a3") = _arg3;                      \
+ +     register long int __a4 asm ("a4") = _arg4;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -271,15 +286,21 @@
+  # define internal_syscall6(number, err, arg0, arg1, arg2, arg3, arg4, arg5) \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     long int _arg5 = (long int) (arg5);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ -     register long int __a2 asm ("a2") = (long int) (arg2);          \
+ -     register long int __a3 asm ("a3") = (long int) (arg3);          \
+ -     register long int __a4 asm ("a4") = (long int) (arg4);          \
+ -     register long int __a5 asm ("a5") = (long int) (arg5);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+ +     register long int __a2 asm ("a2") = _arg2;                      \
+ +     register long int __a3 asm ("a3") = _arg3;                      \
+ +     register long int __a4 asm ("a4") = _arg4;                      \
+ +     register long int __a5 asm ("a5") = _arg5;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ @@ -294,16 +315,23 @@
+  # define internal_syscall7(number, err, arg0, arg1, arg2, arg3, arg4, arg5, arg6) \
+  ({                                                                   \
+       long int _sys_result;                                           \
+ +     long int _arg0 = (long int) (arg0);                             \
+ +     long int _arg1 = (long int) (arg1);                             \
+ +     long int _arg2 = (long int) (arg2);                             \
+ +     long int _arg3 = (long int) (arg3);                             \
+ +     long int _arg4 = (long int) (arg4);                             \
+ +     long int _arg5 = (long int) (arg5);                             \
+ +     long int _arg6 = (long int) (arg6);                             \
+                                                                       \
+       {                                                               \
+       register long int __a7 asm ("a7") = number;                     \
+ -     register long int __a0 asm ("a0") = (long int) (arg0);          \
+ -     register long int __a1 asm ("a1") = (long int) (arg1);          \
+ -     register long int __a2 asm ("a2") = (long int) (arg2);          \
+ -     register long int __a3 asm ("a3") = (long int) (arg3);          \
+ -     register long int __a4 asm ("a4") = (long int) (arg4);          \
+ -     register long int __a5 asm ("a5") = (long int) (arg5);          \
+ -     register long int __a6 asm ("a6") = (long int) (arg6);          \
+ +     register long int __a0 asm ("a0") = _arg0;                      \
+ +     register long int __a1 asm ("a1") = _arg1;                      \
+ +     register long int __a2 asm ("a2") = _arg2;                      \
+ +     register long int __a3 asm ("a3") = _arg3;                      \
+ +     register long int __a4 asm ("a4") = _arg4;                      \
+ +     register long int __a5 asm ("a5") = _arg5;                      \
+ +     register long int __a6 asm ("a6") = _arg6;                      \
+       __asm__ volatile (                                              \
+       "scall\n\t"                                                     \
+       : "+r" (__a0)                                                   \
+ diff --git a/sysdeps/unix/sysv/linux/semctl.c b/sysdeps/unix/sysv/linux/semctl.c
+ index 0c3eb0932f..30571af49f 100644
+ --- a/sysdeps/unix/sysv/linux/semctl.c
+ +++ b/sysdeps/unix/sysv/linux/semctl.c
+ @@ -22,6 +22,7 @@
+  #include <sysdep.h>
+  #include <shlib-compat.h>
+  #include <errno.h>
+ +#include <linux/posix_types.h>  /* For __kernel_mode_t.  */
+  
+  /* Define a `union semun' suitable for Linux here.  */
+  union semun
+ @@ -92,7 +93,6 @@ __new_semctl (int semid, int semnum, int cmd, ...)
+  
+    int ret = semctl_syscall (semid, semnum, cmd, arg);
+  
+ -#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+    if (ret >= 0)
+      {
+        switch (cmd)
+ @@ -100,10 +100,16 @@ __new_semctl (int semid, int semnum, int cmd, ...)
+          case IPC_STAT:
+          case SEM_STAT:
+          case SEM_STAT_ANY:
+ +#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+            arg.buf->sem_perm.mode >>= 16;
+ +#else
+ +       /* Old Linux kernel versions might not clear the mode padding.  */
+ +       if (sizeof ((struct semid_ds){0}.sem_perm.mode)
+ +           != sizeof (__kernel_mode_t))
+ +         arg.buf->sem_perm.mode &= 0xFFFF;
+ +#endif
+       }
+      }
+ -#endif
+  
+    return ret;
+  }
+ diff --git a/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies b/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies
+ new file mode 100644
+ index 0000000000..7eeaf15a5a
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/sh/be/sh4/fpu/Implies
+ @@ -0,0 +1 @@
+ +unix/sysv/linux/sh/sh4/fpu
+ diff --git a/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies b/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies
+ new file mode 100644
+ index 0000000000..7eeaf15a5a
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/sh/le/sh4/fpu/Implies
+ @@ -0,0 +1 @@
+ +unix/sysv/linux/sh/sh4/fpu
+ diff --git a/sysdeps/unix/sysv/linux/shmctl.c b/sysdeps/unix/sysv/linux/shmctl.c
+ index 39fa861e17..f41b359b8b 100644
+ --- a/sysdeps/unix/sysv/linux/shmctl.c
+ +++ b/sysdeps/unix/sysv/linux/shmctl.c
+ @@ -22,6 +22,7 @@
+  #include <sysdep.h>
+  #include <shlib-compat.h>
+  #include <errno.h>
+ +#include <linux/posix_types.h>  /* For __kernel_mode_t.  */
+  
+  #ifndef DEFAULT_VERSION
+  # ifndef __ASSUME_SYSVIPC_BROKEN_MODE_T
+ @@ -63,7 +64,6 @@ __new_shmctl (int shmid, int cmd, struct shmid_ds *buf)
+  
+    int ret = shmctl_syscall (shmid, cmd, buf);
+  
+ -#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+    if (ret >= 0)
+      {
+        switch (cmd)
+ @@ -71,10 +71,16 @@ __new_shmctl (int shmid, int cmd, struct shmid_ds *buf)
+          case IPC_STAT:
+          case SHM_STAT:
+          case SHM_STAT_ANY:
+ +#ifdef __ASSUME_SYSVIPC_BROKEN_MODE_T
+            buf->shm_perm.mode >>= 16;
+ +#else
+ +       /* Old Linux kernel versions might not clear the mode padding.  */
+ +       if (sizeof ((struct shmid_ds){0}.shm_perm.mode)
+ +           != sizeof (__kernel_mode_t))
+ +         buf->shm_perm.mode &= 0xFFFF;
+ +#endif
+       }
+      }
+ -#endif
+  
+    return ret;
+  }
+ diff --git a/sysdeps/unix/sysv/linux/sparc/Makefile b/sysdeps/unix/sysv/linux/sparc/Makefile
+ index b0d182a439..1475039677 100644
+ --- a/sysdeps/unix/sysv/linux/sparc/Makefile
+ +++ b/sysdeps/unix/sysv/linux/sparc/Makefile
+ @@ -11,8 +11,12 @@ ifeq ($(subdir),sysvipc)
+  sysdep_routines += getshmlba
+  endif
+  
+ +ifeq ($(subdir),signal)
+ +sysdep_routines += sigreturn_stub
+ +endif
+ +
+  ifeq ($(subdir),nptl)
+  # pull in __syscall_error routine
+ -libpthread-routines += sysdep
+ -libpthread-shared-only-routines += sysdep
+ +libpthread-routines += sysdep sigreturn_stub
+ +libpthread-shared-only-routines += sysdep sigreturn_stub
+  endif
+ diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/sigaction.c b/sysdeps/unix/sysv/linux/sparc/sparc32/sigaction.c
+ index 6b2f664226..938aa7aa8c 100644
+ --- a/sysdeps/unix/sysv/linux/sparc/sparc32/sigaction.c
+ +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/sigaction.c
+ @@ -24,8 +24,8 @@
+  #include <kernel_sigaction.h>
+  #include <sysdep.h>
+  
+ -static void __rt_sigreturn_stub (void);
+ -static void __sigreturn_stub (void);
+ +void __rt_sigreturn_stub (void);
+ +void __sigreturn_stub (void);
+  
+  #define STUB(act, sigsetsize) \
+    (act) ? ((unsigned long)((act->sa_flags & SA_SIGINFO)      \
+ @@ -35,25 +35,3 @@ static void __sigreturn_stub (void);
+    (sigsetsize)
+  
+  #include <sysdeps/unix/sysv/linux/sigaction.c>
+ -
+ -static
+ -inhibit_stack_protector
+ -void
+ -__rt_sigreturn_stub (void)
+ -{
+ -  __asm__ ("mov %0, %%g1\n\t"
+ -        "ta  0x10\n\t"
+ -        : /* no outputs */
+ -        : "i" (__NR_rt_sigreturn));
+ -}
+ -
+ -static
+ -inhibit_stack_protector
+ -void
+ -__sigreturn_stub (void)
+ -{
+ -  __asm__ ("mov %0, %%g1\n\t"
+ -        "ta  0x10\n\t"
+ -        : /* no outputs */
+ -        : "i" (__NR_sigreturn));
+ -}
+ diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/sigreturn_stub.S b/sysdeps/unix/sysv/linux/sparc/sparc32/sigreturn_stub.S
+ new file mode 100644
+ index 0000000000..727cc94737
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/sigreturn_stub.S
+ @@ -0,0 +1,34 @@
+ +/* Sigreturn stub function used on sa_restore field.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <sysdep.h>
+ +
+ +/* These functions must not change the register window or the stack
+ +   pointer [1].
+ +
+ +   [1] https://lkml.org/lkml/2016/5/27/465  */
+ +
+ +ENTRY (__rt_sigreturn_stub)
+ +     mov     __NR_rt_sigreturn, %g1
+ +     ta      0x10
+ +END (__rt_sigreturn_stub)
+ +
+ +ENTRY (__sigreturn_stub)
+ +     mov     __NR_sigreturn, %g1
+ +     ta      0x10
+ +END (__sigreturn_stub)
+ diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/sigaction.c b/sysdeps/unix/sysv/linux/sparc/sparc64/sigaction.c
+ index 9c0dc2a630..4e26172321 100644
+ --- a/sysdeps/unix/sysv/linux/sparc/sparc64/sigaction.c
+ +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/sigaction.c
+ @@ -22,21 +22,11 @@
+  #include <syscall.h>
+  #include <sysdep.h>
+  
+ -static void __rt_sigreturn_stub (void);
+ +/* Defined in sigreturn_stub.S.  */
+ +void __rt_sigreturn_stub (void);
+  
+  #define STUB(act, sigsetsize) \
+    (((unsigned long) &__rt_sigreturn_stub) - 8),      \
+    (sigsetsize)
+  
+  #include <sysdeps/unix/sysv/linux/sigaction.c>
+ -
+ -static
+ -inhibit_stack_protector
+ -void
+ -__rt_sigreturn_stub (void)
+ -{
+ -  __asm__ ("mov %0, %%g1\n\t"
+ -        "ta  0x6d\n\t"
+ -        : /* no outputs */
+ -        : "i" (__NR_rt_sigreturn));
+ -}
+ diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/sigreturn_stub.S b/sysdeps/unix/sysv/linux/sparc/sparc64/sigreturn_stub.S
+ new file mode 100644
+ index 0000000000..add4766831
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/sigreturn_stub.S
+ @@ -0,0 +1,29 @@
+ +/* Sigreturn stub function used on sa_restore field.
+ +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <sysdep.h>
+ +
+ +/* This function must not change the register window or the stack
+ +   pointer [1].
+ +
+ +   [1] https://lkml.org/lkml/2016/5/27/465  */
+ +
+ +ENTRY (__rt_sigreturn_stub)
+ +     mov     __NR_rt_sigreturn, %g1
+ +     ta      0x6d
+ +END (__rt_sigreturn_stub)
+ diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
+ index 36e087d8f4..3d89814003 100644
+ --- a/sysdeps/unix/sysv/linux/syscall-names.list
+ +++ b/sysdeps/unix/sysv/linux/syscall-names.list
+ @@ -21,8 +21,8 @@
+  # This file can list all potential system calls.  The names are only
+  # used if the installed kernel headers also provide them.
+  
+ -# The list of system calls is current as of Linux 5.4.
+ -kernel 5.4
+ +# The list of system calls is current as of Linux 5.5.
+ +kernel 5.5
+  
+  FAST_atomic_update
+  FAST_cmpxchg
+ diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list
+ index 5f1352ad43..52e6dafc86 100644
+ --- a/sysdeps/unix/sysv/linux/syscalls.list
+ +++ b/sysdeps/unix/sysv/linux/syscalls.list
+ @@ -28,25 +28,24 @@ inotify_add_watch EXTRA   inotify_add_watch       i:isi   inotify_add_watch
+  inotify_init EXTRA   inotify_init    i:      inotify_init
+  inotify_init1        EXTRA   inotify_init1   i:I     inotify_init1
+  inotify_rm_watch     EXTRA   inotify_rm_watch        i:ii    inotify_rm_watch
+ -ioperm               -       ioperm          i:iii   ioperm
+ +ioperm               -       ioperm          i:UUi   ioperm
+  iopl         -       iopl            i:i     iopl
+  klogctl              EXTRA   syslog          i:isi   klogctl
+  lchown               -       lchown          i:sii   __lchown        lchown
+ -mincore              -       mincore         i:anV   mincore
+ -mlock                -       mlock           i:bn    mlock
+ +mincore              -       mincore         i:aUV   mincore
+ +mlock                -       mlock           i:bU    mlock
+  mlockall     -       mlockall        i:i     mlockall
+ -mount                EXTRA   mount           i:sssip __mount mount
+ -mremap               EXTRA   mremap          b:ainip __mremap        mremap
+ -munlock              -       munlock         i:ai    munlock
+ +mount                EXTRA   mount           i:sssUp __mount mount
+ +mremap               EXTRA   mremap          b:aUUip __mremap        mremap
+ +munlock              -       munlock         i:aU    munlock
+  munlockall   -       munlockall      i:      munlockall
+  nfsservctl   EXTRA   nfsservctl      i:ipp   __compat_nfsservctl     nfsservctl@GLIBC_2.0:GLIBC_2.28
+  pipe         -       pipe            i:f     __pipe          pipe
+  pipe2                -       pipe2           i:fi    __pipe2         pipe2
+  pivot_root   EXTRA   pivot_root      i:ss    pivot_root
+ -prctl                EXTRA   prctl           i:iiiii __prctl         prctl
+  query_module EXTRA   query_module    i:sipip __compat_query_module   query_module@GLIBC_2.0:GLIBC_2.23
+  quotactl     EXTRA   quotactl        i:isip  quotactl
+ -remap_file_pages -   remap_file_pages i:piiii        __remap_file_pages remap_file_pages
+ +remap_file_pages -   remap_file_pages i:pUiUi        __remap_file_pages remap_file_pages
+  sched_getp   -       sched_getparam  i:ip    __sched_getparam        sched_getparam
+  sched_gets   -       sched_getscheduler      i:i     __sched_getscheduler    sched_getscheduler
+  sched_primax -       sched_get_priority_max  i:i     __sched_get_priority_max        sched_get_priority_max
+ @@ -55,8 +54,8 @@ sched_rr_gi -       sched_rr_get_interval   i:ip    __sched_rr_get_interval sched_rr_get_in
+  sched_setp   -       sched_setparam  i:ip    __sched_setparam        sched_setparam
+  sched_sets   -       sched_setscheduler      i:iip   __sched_setscheduler    sched_setscheduler
+  sched_yield  -       sched_yield     i:      __sched_yield   sched_yield
+ -sendfile     -       sendfile        i:iipi  sendfile
+ -sendfile64   -       sendfile64      i:iipi  sendfile64
+ +sendfile     -       sendfile        i:iipU  sendfile
+ +sendfile64   -       sendfile64      i:iipU  sendfile64
+  setfsgid     EXTRA   setfsgid        i:i     setfsgid
+  setfsuid     EXTRA   setfsuid        i:i     setfsuid
+  setpgid              -       setpgid         i:ii    __setpgid       setpgid
+ @@ -73,19 +72,19 @@ chown             -       chown           i:sii   __libc_chown    __chown chown
+  fchownat     -       fchownat        i:isiii fchownat
+  linkat               -       linkat          i:isisi linkat
+  mkdirat              -       mkdirat         i:isi   mkdirat
+ -readlinkat   -       readlinkat      i:issi  readlinkat
+ +readlinkat   -       readlinkat      i:issU  readlinkat
+  symlinkat    -       symlinkat       i:sis   symlinkat
+  unlinkat     -       unlinkat        i:isi   unlinkat
+  
+ -setxattr     -       setxattr        i:sspii setxattr
+ -lsetxattr    -       lsetxattr       i:sspii lsetxattr
+ -fsetxattr    -       fsetxattr       i:ispii fsetxattr
+ -getxattr     -       getxattr        i:sspi  getxattr
+ -lgetxattr    -       lgetxattr       i:sspi  lgetxattr
+ -fgetxattr    -       fgetxattr       i:ispi  fgetxattr
+ -listxattr    -       listxattr       i:ssi   listxattr
+ -llistxattr   -       llistxattr      i:ssi   llistxattr
+ -flistxattr   -       flistxattr      i:isi   flistxattr
+ +setxattr     -       setxattr        i:sspUi setxattr
+ +lsetxattr    -       lsetxattr       i:sspUi lsetxattr
+ +fsetxattr    -       fsetxattr       i:ispUi fsetxattr
+ +getxattr     -       getxattr        i:sspU  getxattr
+ +lgetxattr    -       lgetxattr       i:sspU  lgetxattr
+ +fgetxattr    -       fgetxattr       i:ispU  fgetxattr
+ +listxattr    -       listxattr       i:ssU   listxattr
+ +llistxattr   -       llistxattr      i:ssU   llistxattr
+ +flistxattr   -       flistxattr      i:isU   flistxattr
+  removexattr  -       removexattr     i:ss    removexattr
+  lremovexattr -       lremovexattr    i:ss    lremovexattr
+  fremovexattr -       fremovexattr    i:is    fremovexattr
+ @@ -102,8 +101,6 @@ name_to_handle_at EXTRA   name_to_handle_at i:isppi name_to_handle_at
+  
+  setns                EXTRA   setns           i:ii    setns
+  
+ -process_vm_readv EXTRA       process_vm_readv i:ipipii process_vm_readv
+ -process_vm_writev EXTRA      process_vm_writev i:ipipii process_vm_writev
+  memfd_create    EXTRA        memfd_create    i:si    memfd_create
+  pkey_alloc   EXTRA   pkey_alloc      i:ii    pkey_alloc
+  pkey_free    EXTRA   pkey_free       i:i     pkey_free
+ diff --git a/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c b/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c
+ new file mode 100644
+ index 0000000000..55362f6060
+ --- /dev/null
+ +++ b/sysdeps/unix/sysv/linux/tst-getcwd-smallbuff.c
+ @@ -0,0 +1,259 @@
+ +/* Verify that getcwd returns ERANGE for size 1 byte and does not underflow
+ +   buffer when the CWD is too long and is also a mount target of /.  See bug
+ +   #28769 or CVE-2021-3999 for more context.
+ +   Copyright The GNU Toolchain Authors.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <errno.h>
+ +#include <fcntl.h>
+ +#include <intprops.h>
+ +#include <limits.h>
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +#include <string.h>
+ +#include <sys/mount.h>
+ +#include <sys/stat.h>
+ +#include <sys/types.h>
+ +#include <sys/wait.h>
+ +
+ +#include <sys/socket.h>
+ +#include <sys/un.h>
+ +#include <support/check.h>
+ +#include <support/temp_file.h>
+ +#include <support/test-driver.h>
+ +#include <support/xsched.h>
+ +#include <support/xunistd.h>
+ +
+ +static char *base;
+ +#define BASENAME "tst-getcwd-smallbuff"
+ +#define MOUNT_NAME "mpoint"
+ +static int sockfd[2];
+ +
+ +static void
+ +do_cleanup (void)
+ +{
+ +  support_chdir_toolong_temp_directory (base);
+ +  TEST_VERIFY_EXIT (rmdir (MOUNT_NAME) == 0);
+ +  free (base);
+ +}
+ +
+ +static void
+ +send_fd (const int sock, const int fd)
+ +{
+ +  struct msghdr msg = {0};
+ +  union
+ +    {
+ +      struct cmsghdr hdr;
+ +      char buf[CMSG_SPACE (sizeof (int))];
+ +    } cmsgbuf = {0};
+ +  struct cmsghdr *cmsg;
+ +  struct iovec vec;
+ +  char ch = 'A';
+ +  ssize_t n;
+ +
+ +  msg.msg_control = &cmsgbuf.buf;
+ +  msg.msg_controllen = sizeof (cmsgbuf.buf);
+ +
+ +  cmsg = CMSG_FIRSTHDR (&msg);
+ +  cmsg->cmsg_len = CMSG_LEN (sizeof (int));
+ +  cmsg->cmsg_level = SOL_SOCKET;
+ +  cmsg->cmsg_type = SCM_RIGHTS;
+ +  memcpy (CMSG_DATA (cmsg), &fd, sizeof (fd));
+ +
+ +  vec.iov_base = &ch;
+ +  vec.iov_len = 1;
+ +  msg.msg_iov = &vec;
+ +  msg.msg_iovlen = 1;
+ +
+ +  while ((n = sendmsg (sock, &msg, 0)) == -1 && errno == EINTR);
+ +
+ +  TEST_VERIFY_EXIT (n == 1);
+ +}
+ +
+ +static int
+ +recv_fd (const int sock)
+ +{
+ +  struct msghdr msg = {0};
+ +  union
+ +    {
+ +      struct cmsghdr hdr;
+ +      char buf[CMSG_SPACE(sizeof(int))];
+ +    } cmsgbuf = {0};
+ +  struct cmsghdr *cmsg;
+ +  struct iovec vec;
+ +  ssize_t n;
+ +  char ch = '\0';
+ +  int fd = -1;
+ +
+ +  vec.iov_base = &ch;
+ +  vec.iov_len = 1;
+ +  msg.msg_iov = &vec;
+ +  msg.msg_iovlen = 1;
+ +
+ +  msg.msg_control = &cmsgbuf.buf;
+ +  msg.msg_controllen = sizeof (cmsgbuf.buf);
+ +
+ +  while ((n = recvmsg (sock, &msg, 0)) == -1 && errno == EINTR);
+ +  if (n != 1 || ch != 'A')
+ +    return -1;
+ +
+ +  cmsg = CMSG_FIRSTHDR (&msg);
+ +  if (cmsg == NULL)
+ +    return -1;
+ +  if (cmsg->cmsg_type != SCM_RIGHTS)
+ +    return -1;
+ +  memcpy (&fd, CMSG_DATA (cmsg), sizeof (fd));
+ +  if (fd < 0)
+ +    return -1;
+ +  return fd;
+ +}
+ +
+ +static int
+ +child_func (void * const arg)
+ +{
+ +  xclose (sockfd[0]);
+ +  const int sock = sockfd[1];
+ +  char ch;
+ +
+ +  TEST_VERIFY_EXIT (read (sock, &ch, 1) == 1);
+ +  TEST_VERIFY_EXIT (ch == '1');
+ +
+ +  if (mount ("/", MOUNT_NAME, NULL, MS_BIND | MS_REC, NULL))
+ +    FAIL_EXIT1 ("mount failed: %m\n");
+ +  const int fd = xopen ("mpoint",
+ +                     O_RDONLY | O_PATH | O_DIRECTORY | O_NOFOLLOW, 0);
+ +
+ +  send_fd (sock, fd);
+ +  xclose (fd);
+ +
+ +  TEST_VERIFY_EXIT (read (sock, &ch, 1) == 1);
+ +  TEST_VERIFY_EXIT (ch == 'a');
+ +
+ +  xclose (sock);
+ +  return 0;
+ +}
+ +
+ +static void
+ +update_map (char * const mapping, const char * const map_file)
+ +{
+ +  const size_t map_len = strlen (mapping);
+ +
+ +  const int fd = xopen (map_file, O_WRONLY, 0);
+ +  xwrite (fd, mapping, map_len);
+ +  xclose (fd);
+ +}
+ +
+ +static void
+ +proc_setgroups_write (const long child_pid, const char * const str)
+ +{
+ +  const size_t str_len = strlen(str);
+ +
+ +  char setgroups_path[sizeof ("/proc//setgroups") + INT_STRLEN_BOUND (long)];
+ +
+ +  snprintf (setgroups_path, sizeof (setgroups_path),
+ +         "/proc/%ld/setgroups", child_pid);
+ +
+ +  const int fd = open (setgroups_path, O_WRONLY);
+ +
+ +  if (fd < 0)
+ +    {
+ +      TEST_VERIFY_EXIT (errno == ENOENT);
+ +      FAIL_UNSUPPORTED ("/proc/%ld/setgroups not found\n", child_pid);
+ +    }
+ +
+ +  xwrite (fd, str, str_len);
+ +  xclose(fd);
+ +}
+ +
+ +static char child_stack[1024 * 1024];
+ +
+ +int
+ +do_test (void)
+ +{
+ +  base = support_create_and_chdir_toolong_temp_directory (BASENAME);
+ +
+ +  xmkdir (MOUNT_NAME, S_IRWXU);
+ +  atexit (do_cleanup);
+ +
+ +  /* Check whether user namespaces are supported.  */
+ +  {
+ +    pid_t pid = xfork ();
+ +    if (pid == 0)
+ +      {
+ +     if (unshare (CLONE_NEWUSER | CLONE_NEWNS) != 0)
+ +       _exit (EXIT_UNSUPPORTED);
+ +     else
+ +       _exit (0);
+ +      }
+ +    int status;
+ +    xwaitpid (pid, &status, 0);
+ +    TEST_VERIFY_EXIT (WIFEXITED (status));
+ +    if (WEXITSTATUS (status) != 0)
+ +      return WEXITSTATUS (status);
+ +  }
+ +
+ +  TEST_VERIFY_EXIT (socketpair (AF_UNIX, SOCK_STREAM, 0, sockfd) == 0);
+ +  pid_t child_pid = xclone (child_func, NULL, child_stack,
+ +                         sizeof (child_stack),
+ +                         CLONE_NEWUSER | CLONE_NEWNS | SIGCHLD);
+ +
+ +  xclose (sockfd[1]);
+ +  const int sock = sockfd[0];
+ +
+ +  char map_path[sizeof ("/proc//uid_map") + INT_STRLEN_BOUND (long)];
+ +  char map_buf[sizeof ("0  1") + INT_STRLEN_BOUND (long)];
+ +
+ +  snprintf (map_path, sizeof (map_path), "/proc/%ld/uid_map",
+ +         (long) child_pid);
+ +  snprintf (map_buf, sizeof (map_buf), "0 %ld 1", (long) getuid());
+ +  update_map (map_buf, map_path);
+ +
+ +  proc_setgroups_write ((long) child_pid, "deny");
+ +  snprintf (map_path, sizeof (map_path), "/proc/%ld/gid_map",
+ +         (long) child_pid);
+ +  snprintf (map_buf, sizeof (map_buf), "0 %ld 1", (long) getgid());
+ +  update_map (map_buf, map_path);
+ +
+ +  TEST_VERIFY_EXIT (send (sock, "1", 1, MSG_NOSIGNAL) == 1);
+ +  const int fd = recv_fd (sock);
+ +  TEST_VERIFY_EXIT (fd >= 0);
+ +  TEST_VERIFY_EXIT (fchdir (fd) == 0);
+ +
+ +  static char buf[2 * 10 + 1];
+ +  memset (buf, 'A', sizeof (buf));
+ +
+ +  /* Finally, call getcwd and check if it resulted in a buffer underflow.  */
+ +  char * cwd = getcwd (buf + sizeof (buf) / 2, 1);
+ +  TEST_VERIFY (cwd == NULL);
+ +  TEST_VERIFY (errno == ERANGE);
+ +
+ +  for (int i = 0; i < sizeof (buf); i++)
+ +    if (buf[i] != 'A')
+ +      {
+ +     printf ("buf[%d] = %02x\n", i, (unsigned int) buf[i]);
+ +     support_record_failure ();
+ +      }
+ +
+ +  TEST_VERIFY_EXIT (send (sock, "a", 1, MSG_NOSIGNAL) == 1);
+ +  xclose (sock);
+ +  TEST_VERIFY_EXIT (xwaitpid (child_pid, NULL, 0) == child_pid);
+ +
+ +  return 0;
+ +}
+ +
+ +#define CLEANUP_HANDLER do_cleanup
+ +#include <support/test-driver.c>
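
The test above reproduces the exact CVE-2021-3999 conditions (an over-long working directory bind-mounted over /).  The basic contract it relies on, getcwd failing with ERANGE instead of writing past a too-small buffer, can be sanity-checked with a much weaker standalone sketch, not part of the patch:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main (void)
{
  char buf[1];
  /* Even "/" needs two bytes including the terminating NUL, so this
     must fail with ERANGE rather than touching memory outside BUF.  */
  if (getcwd (buf, sizeof buf) == NULL && errno == ERANGE)
    puts ("getcwd reports ERANGE for a 1-byte buffer");
  return 0;
}
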
+ diff --git a/sysdeps/unix/sysv/linux/x86_64/sysdep.h b/sysdeps/unix/sysv/linux/x86_64/sysdep.h
+ index c2eb37e575..c7f740a1df 100644
+ --- a/sysdeps/unix/sysv/linux/x86_64/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/x86_64/sysdep.h
+ @@ -61,13 +61,31 @@
+  #  define SYSCALL_ERROR_LABEL syscall_error
+  # endif
+  
+ +/* PSEUDO and T_PSEUDO macros have 2 extra arguments for unsigned long
+ +   int arguments.  */
+ +# define PSEUDOS_HAVE_ULONG_INDICES 1
+ +
+ +# ifndef SYSCALL_ULONG_ARG_1
+ +#  define SYSCALL_ULONG_ARG_1 0
+ +#  define SYSCALL_ULONG_ARG_2 0
+ +# endif
+ +
+  # undef      PSEUDO
+ -# define PSEUDO(name, syscall_name, args)                                  \
+ -  .text;                                                                   \
+ -  ENTRY (name)                                                                     \
+ -    DO_CALL (syscall_name, args);                                          \
+ -    cmpq $-4095, %rax;                                                             \
+ +# if SYSCALL_ULONG_ARG_1
+ +#  define PSEUDO(name, syscall_name, args, ulong_arg_1, ulong_arg_2) \
+ +  .text;                                                           \
+ +  ENTRY (name)                                                             \
+ +    DO_CALL (syscall_name, args, ulong_arg_1, ulong_arg_2);        \
+ +    cmpq $-4095, %rax;                                                     \
+      jae SYSCALL_ERROR_LABEL
+ +# else
+ +#  define PSEUDO(name, syscall_name, args) \
+ +  .text;                                                           \
+ +  ENTRY (name)                                                             \
+ +    DO_CALL (syscall_name, args, 0, 0);                                    \
+ +    cmpq $-4095, %rax;                                                     \
+ +    jae SYSCALL_ERROR_LABEL
+ +# endif
+  
+  # undef      PSEUDO_END
+  # define PSEUDO_END(name)                                                  \
+ @@ -75,10 +93,17 @@
+    END (name)
+  
+  # undef      PSEUDO_NOERRNO
+ -# define PSEUDO_NOERRNO(name, syscall_name, args) \
+ -  .text;                                                                   \
+ -  ENTRY (name)                                                                     \
+ -    DO_CALL (syscall_name, args)
+ +# if SYSCALL_ULONG_ARG_1
+ +#  define PSEUDO_NOERRNO(name, syscall_name, args, ulong_arg_1, ulong_arg_2) \
+ +  .text;                                                           \
+ +  ENTRY (name)                                                             \
+ +    DO_CALL (syscall_name, args, ulong_arg_1, ulong_arg_2)
+ +# else
+ +#  define PSEUDO_NOERRNO(name, syscall_name, args) \
+ +  .text;                                                           \
+ +  ENTRY (name)                                                             \
+ +    DO_CALL (syscall_name, args, 0, 0)
+ +# endif
+  
+  # undef      PSEUDO_END_NOERRNO
+  # define PSEUDO_END_NOERRNO(name) \
+ @@ -87,11 +112,19 @@
+  # define ret_NOERRNO ret
+  
+  # undef      PSEUDO_ERRVAL
+ -# define PSEUDO_ERRVAL(name, syscall_name, args) \
+ -  .text;                                                                   \
+ -  ENTRY (name)                                                                     \
+ -    DO_CALL (syscall_name, args);                                          \
+ +# if SYSCALL_ULONG_ARG_1
+ +#  define PSEUDO_ERRVAL(name, syscall_name, args, ulong_arg_1, ulong_arg_2) \
+ +  .text;                                                     \
+ +  ENTRY (name)                                                       \
+ +    DO_CALL (syscall_name, args, ulong_arg_1, ulong_arg_2);  \
+ +    negq %rax
+ +# else
+ +#  define PSEUDO_ERRVAL(name, syscall_name, args) \
+ +  .text;                                                     \
+ +  ENTRY (name)                                                       \
+ +    DO_CALL (syscall_name, args, 0, 0);                              \
+      negq %rax
+ +# endif
+  
+  # undef      PSEUDO_END_ERRVAL
+  # define PSEUDO_END_ERRVAL(name) \
+ @@ -163,8 +196,10 @@
+      Syscalls of more than 6 arguments are not supported.  */
+  
+  # undef      DO_CALL
+ -# define DO_CALL(syscall_name, args)         \
+ +# define DO_CALL(syscall_name, args, ulong_arg_1, ulong_arg_2) \
+      DOARGS_##args                            \
+ +    ZERO_EXTEND_##ulong_arg_1                        \
+ +    ZERO_EXTEND_##ulong_arg_2                        \
+      movl $SYS_ify (syscall_name), %eax;              \
+      syscall;
+  
+ @@ -176,6 +211,14 @@
+  # define DOARGS_5 DOARGS_4
+  # define DOARGS_6 DOARGS_5
+  
+ +# define ZERO_EXTEND_0 /* nothing */
+ +# define ZERO_EXTEND_1 /* nothing */
+ +# define ZERO_EXTEND_2 /* nothing */
+ +# define ZERO_EXTEND_3 /* nothing */
+ +# define ZERO_EXTEND_4 /* nothing */
+ +# define ZERO_EXTEND_5 /* nothing */
+ +# define ZERO_EXTEND_6 /* nothing */
+ +
+  #else        /* !__ASSEMBLER__ */
+  /* Define a macro which expands inline into the wrapper code for a system
+     call.  */
+ @@ -210,12 +253,15 @@
+  /* Registers clobbered by syscall.  */
+  # define REGISTERS_CLOBBERED_BY_SYSCALL "cc", "r11", "cx"
+  
+ -/* Create a variable 'name' based on type 'X' to avoid explicit types.
+ -   This is mainly used set use 64-bits arguments in x32.   */
+ -#define TYPEFY(X, name) __typeof__ ((X) - (X)) name
+ -/* Explicit cast the argument to avoid integer from pointer warning on
+ -   x32.  */
+ -#define ARGIFY(X) ((__typeof__ ((X) - (X))) (X))
+ +/* NB: This also works when X is an array.  For an array X,  type of
+ +   (X) - (X) is ptrdiff_t, which is signed, since size of ptrdiff_t
+ +   == size of pointer, cast is a NOP.   */
+ +#define TYPEFY1(X) __typeof__ ((X) - (X))
+ +/* Explicit cast the argument.  */
+ +#define ARGIFY(X) ((TYPEFY1 (X)) (X))
+ +/* Create a variable 'name' based on type of variable 'X' to avoid
+ +   explicit types.  */
+ +#define TYPEFY(X, name) __typeof__ (ARGIFY (X)) name
+  
+  #undef INTERNAL_SYSCALL
+  #define INTERNAL_SYSCALL(name, err, nr, args...)                     \
+ diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h b/sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h
+ index 5bf9eed80b..62e6f8fe11 100644
+ --- a/sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h
+ +++ b/sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h
+ @@ -26,4 +26,39 @@
+  #undef LO_HI_LONG
+  #define LO_HI_LONG(val) (val)
+  
+ +#ifdef __ASSEMBLER__
+ +/* Zero-extend 32-bit unsigned long int arguments to 64 bits.  */
+ +# undef ZERO_EXTEND_1
+ +# define ZERO_EXTEND_1 movl %edi, %edi;
+ +# undef ZERO_EXTEND_2
+ +# define ZERO_EXTEND_2 movl %esi, %esi;
+ +# undef ZERO_EXTEND_3
+ +# define ZERO_EXTEND_3 movl %edx, %edx;
+ +# if SYSCALL_ULONG_ARG_1 == 4 || SYSCALL_ULONG_ARG_2 == 4
+ +#  undef DOARGS_4
+ +#  define DOARGS_4 movl %ecx, %r10d;
+ +# else
+ +#  undef ZERO_EXTEND_4
+ +#  define ZERO_EXTEND_4 movl %r10d, %r10d;
+ +# endif
+ +# undef ZERO_EXTEND_5
+ +# define ZERO_EXTEND_5 movl %r8d, %r8d;
+ +# undef ZERO_EXTEND_6
+ +# define ZERO_EXTEND_6 movl %r9d, %r9d;
+ +#else /* !__ASSEMBLER__ */
+ +# undef ARGIFY
+ +/* Enforce zero-extension for pointers and array system call arguments.
+ +   For integer types, extend to int64_t (the full register) using a
+ +   regular cast, resulting in zero or sign extension based on the
+ +   signedness of the original type.  */
+ +# define ARGIFY(X) \
+ + ({                                                                  \
+ +    _Pragma ("GCC diagnostic push");                                 \
+ +    _Pragma ("GCC diagnostic ignored \"-Wpointer-to-int-cast\"");    \
+ +    (__builtin_classify_type (X) == 5                                        \
+ +     ? (uintptr_t) (X) : (int64_t) (X));                             \
+ +    _Pragma ("GCC diagnostic pop");                                  \
+ +  })
+ +#endif       /* __ASSEMBLER__ */
+ +
+  #endif /* linux/x86_64/x32/sysdep.h */
+ diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+ index 95182a508c..b7aec5df2b 100644
+ --- a/sysdeps/x86/Makefile
+ +++ b/sysdeps/x86/Makefile
+ @@ -12,6 +12,42 @@ endif
+  ifeq ($(subdir),setjmp)
+  gen-as-const-headers += jmp_buf-ssp.sym
+  sysdep_routines += __longjmp_cancel
+ +ifneq ($(enable-cet),no)
+ +ifneq ($(have-tunables),no)
+ +tests += tst-setjmp-cet
+ +tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on
+ +endif
+ +endif
+ +endif
+ +
+ +ifeq ($(subdir),string)
+ +sysdep_routines += cacheinfo
+ +
+ +tests += \
+ +  tst-memchr-rtm \
+ +  tst-memcmp-rtm \
+ +  tst-memmove-rtm \
+ +  tst-memrchr-rtm \
+ +  tst-memset-rtm \
+ +  tst-strchr-rtm \
+ +  tst-strcpy-rtm \
+ +  tst-strlen-rtm \
+ +  tst-strncmp-rtm \
+ +  tst-strrchr-rtm \
+ +  tst-wcsncmp-rtm \
+ +# tests
+ +
+ +CFLAGS-tst-memchr-rtm.c += -mrtm
+ +CFLAGS-tst-memcmp-rtm.c += -mrtm
+ +CFLAGS-tst-memmove-rtm.c += -mrtm
+ +CFLAGS-tst-memrchr-rtm.c += -mrtm
+ +CFLAGS-tst-memset-rtm.c += -mrtm
+ +CFLAGS-tst-strchr-rtm.c += -mrtm
+ +CFLAGS-tst-strcpy-rtm.c += -mrtm
+ +CFLAGS-tst-strlen-rtm.c += -mrtm
+ +CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
+ +CFLAGS-tst-strrchr-rtm.c += -mrtm
+ +CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
+  endif
+  
+  ifeq ($(enable-cet),yes)
+ diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
+ index e3e8ef27bb..39c13b7195 100644
+ --- a/sysdeps/x86/cacheinfo.c
+ +++ b/sysdeps/x86/cacheinfo.c
+ @@ -722,7 +722,7 @@ intel_bug_no_cache_info:
+             threads = 1 << ((ecx >> 12) & 0x0f);
+           }
+  
+ -       if (threads == 0)
+ +       if (threads == 0 || cpu_features->basic.family >= 0x17)
+           {
+             /* If APIC ID width is not available, use logical
+                processor count.  */
+ @@ -737,8 +737,22 @@ intel_bug_no_cache_info:
+         if (threads > 0)
+           shared /= threads;
+  
+ -       /* Account for exclusive L2 and L3 caches.  */
+ -       shared += core;
+ +       /* Get shared cache per ccx for Zen architectures.  */
+ +       if (cpu_features->basic.family >= 0x17)
+ +         {
+ +           unsigned int eax;
+ +
+ +           /* Get number of threads share the L3 cache in CCX.  */
+ +           __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+ +
+ +           unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+ +           shared *= threads_per_ccx;
+ +         }
+ +       else
+ +         {
+ +           /* Account for exclusive L2 and L3 caches.  */
+ +           shared += core;
+ +            }
+       }
+  
+  #ifndef DISABLE_PREFETCHW
+ @@ -778,14 +792,20 @@ intel_bug_no_cache_info:
+        __x86_shared_cache_size = shared;
+      }
+  
+ -  /* The large memcpy micro benchmark in glibc shows that 6 times of
+ -     shared cache size is the approximate value above which non-temporal
+ -     store becomes faster on a 8-core processor.  This is the 3/4 of the
+ -     total shared cache size.  */
+ +  /* The default setting for the non_temporal threshold is 3/4 of one
+ +     thread's share of the chip's cache. For most Intel and AMD processors
+ +     with an initial release date between 2017 and 2020, a thread's typical
+ +     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
+ +     threshold leaves 125 KBytes to 500 KBytes of the thread's data
+ +     in cache after a maximum temporal copy, which will maintain
+ +     in cache a reasonable portion of the thread's stack and other
+ +     active data. If the threshold is set higher than one thread's
+ +     share of the cache, it has a substantial risk of negatively
+ +     impacting the performance of other threads running on the chip. */
+    __x86_shared_non_temporal_threshold
+      = (cpu_features->non_temporal_threshold != 0
+         ? cpu_features->non_temporal_threshold
+ -       : __x86_shared_cache_size * threads * 3 / 4);
+ +       : __x86_shared_cache_size * 3 / 4);
+  }
+  
+  #endif
+ diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+ index 81a170a819..e1c22e3e58 100644
+ --- a/sysdeps/x86/cpu-features.c
+ +++ b/sysdeps/x86/cpu-features.c
+ @@ -333,6 +333,9 @@ init_cpu_features (struct cpu_features *cpu_features)
+  
+        get_extended_indices (cpu_features);
+  
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+ +     cpu_features->cpuid[index_cpu_RTM].reg_RTM &= ~bit_cpu_RTM;
+ +
+        if (family == 0x06)
+       {
+         model += extended_model;
+ @@ -394,11 +397,42 @@ init_cpu_features (struct cpu_features *cpu_features)
+             break;
+           }
+  
+ -      /* Disable TSX on some Haswell processors to avoid TSX on kernels that
+ -         weren't updated with the latest microcode package (which disables
+ -         broken feature by default).  */
+ +      /* Disable TSX on some processors to avoid TSX on kernels that
+ +         weren't updated with the latest microcode package (which
+ +         disables broken feature by default).  */
+        switch (model)
+           {
+ +         case 0x55:
+ +           if (stepping <= 5)
+ +             goto disable_tsx;
+ +           break;
+ +         case 0x8e:
+ +           /* NB: Although the errata documents that for model == 0x8e,
+ +              only 0xb stepping or lower are impacted, the intention of
+ +              the errata was to disable TSX on all client processors on
+ +              all steppings.  Include 0xc stepping which is an Intel
+ +              Core i7-8665U, a client mobile processor.  */
+ +         case 0x9e:
+ +           if (stepping > 0xc)
+ +             break;
+ +           /* Fall through.  */
+ +         case 0x4e:
+ +         case 0x5e:
+ +           {
+ +             /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+ +                processors listed in:
+ +
+ +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+ +              */
+ +disable_tsx:
+ +             cpu_features->cpuid[index_cpu_HLE].reg_HLE
+ +               &= ~bit_cpu_HLE;
+ +             cpu_features->cpuid[index_cpu_RTM].reg_RTM
+ +               &= ~bit_cpu_RTM;
+ +             cpu_features->cpuid[index_cpu_RTM_ALWAYS_ABORT].reg_RTM_ALWAYS_ABORT
+ +               |= bit_cpu_RTM_ALWAYS_ABORT;
+ +           }
+ +           break;
+           case 0x3f:
+             /* Xeon E7 v3 with stepping >= 4 has working TSX.  */
+             if (stepping >= 4)
+ @@ -424,8 +458,24 @@ init_cpu_features (struct cpu_features *cpu_features)
+       cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+         |= bit_arch_Prefer_No_VZEROUPPER;
+        else
+ -     cpu_features->feature[index_arch_Prefer_No_AVX512]
+ -       |= bit_arch_Prefer_No_AVX512;
+ +     {
+ +       cpu_features->feature[index_arch_Prefer_No_AVX512]
+ +         |= bit_arch_Prefer_No_AVX512;
+ +
+ +       /* Avoid RTM abort triggered by VZEROUPPER inside a
+ +          transactionally executing RTM region.  */
+ +       if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +         cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+ +           |= bit_arch_Prefer_No_VZEROUPPER;
+ +
+ +       /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+ +          requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+ +          requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+ +          AVX2 strcmp is faster than EVEX strcmp.  */
+ +       if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+ +         cpu_features->feature[index_arch_Prefer_AVX2_STRCMP]
+ +           |= bit_arch_Prefer_AVX2_STRCMP;
+ +     }
+      }
+    /* This spells out "AuthenticAMD" or "HygonGenuine".  */
+    else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+ diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
+ index aea83e6e31..9fb97907b5 100644
+ --- a/sysdeps/x86/cpu-features.h
+ +++ b/sysdeps/x86/cpu-features.h
+ @@ -499,6 +499,7 @@ extern const struct cpu_features *__get_cpu_features (void)
+  #define bit_cpu_AVX512_4VNNIW        (1u << 2)
+  #define bit_cpu_AVX512_4FMAPS        (1u << 3)
+  #define bit_cpu_FSRM         (1u << 4)
+ +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
+  #define bit_cpu_PCONFIG              (1u << 18)
+  #define bit_cpu_IBT          (1u << 20)
+  #define bit_cpu_IBRS_IBPB    (1u << 26)
+ @@ -667,6 +668,7 @@ extern const struct cpu_features *__get_cpu_features (void)
+  #define index_cpu_AVX512_4VNNIW COMMON_CPUID_INDEX_7
+  #define index_cpu_AVX512_4FMAPS      COMMON_CPUID_INDEX_7
+  #define index_cpu_FSRM               COMMON_CPUID_INDEX_7
+ +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
+  #define index_cpu_PCONFIG    COMMON_CPUID_INDEX_7
+  #define index_cpu_IBT                COMMON_CPUID_INDEX_7
+  #define index_cpu_IBRS_IBPB  COMMON_CPUID_INDEX_7
+ @@ -835,6 +837,7 @@ extern const struct cpu_features *__get_cpu_features (void)
+  #define reg_AVX512_4VNNIW    edx
+  #define reg_AVX512_4FMAPS    edx
+  #define reg_FSRM             edx
+ +#define reg_RTM_ALWAYS_ABORT edx
+  #define reg_PCONFIG          edx
+  #define reg_IBT                      edx
+  #define reg_IBRS_IBPB                edx
+ @@ -897,6 +900,7 @@ extern const struct cpu_features *__get_cpu_features (void)
+  #define bit_arch_Prefer_FSRM                 (1u << 13)
+  #define bit_arch_Prefer_No_AVX512            (1u << 14)
+  #define bit_arch_MathVec_Prefer_No_AVX512    (1u << 15)
+ +#define bit_arch_Prefer_AVX2_STRCMP          (1u << 16)
+  
+  #define index_arch_Fast_Rep_String           FEATURE_INDEX_2
+  #define index_arch_Fast_Copy_Backward                FEATURE_INDEX_2
+ @@ -914,6 +918,7 @@ extern const struct cpu_features *__get_cpu_features (void)
+  #define index_arch_Prefer_No_AVX512          FEATURE_INDEX_2
+  #define index_arch_MathVec_Prefer_No_AVX512  FEATURE_INDEX_2
+  #define index_arch_Prefer_FSRM                       FEATURE_INDEX_2
+ +#define index_arch_Prefer_AVX2_STRCMP                FEATURE_INDEX_2
+  
+  /* XCR0 Feature flags.  */
+  #define bit_XMM_state                (1u << 1)
+ diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+ index 861bd7bcaa..cb83ecc3b2 100644
+ --- a/sysdeps/x86/cpu-tunables.c
+ +++ b/sysdeps/x86/cpu-tunables.c
+ @@ -282,6 +282,9 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+             CHECK_GLIBC_IFUNC_ARCH_BOTH (n, cpu_features,
+                                          Fast_Copy_Backward, disable,
+                                          18);
+ +           CHECK_GLIBC_IFUNC_ARCH_NEED_ARCH_BOTH
+ +             (n, cpu_features, Prefer_AVX2_STRCMP, AVX2_Usable,
+ +              disable, 18);
+           }
+         break;
+       case 19:
+ diff --git a/sysdeps/x86/dl-cet.c b/sysdeps/x86/dl-cet.c
+ index ca3b5849bc..8ffaf94a00 100644
+ --- a/sysdeps/x86/dl-cet.c
+ +++ b/sysdeps/x86/dl-cet.c
+ @@ -105,7 +105,11 @@ dl_cet_check (struct link_map *m, const char *program)
+    /* No legacy object check if both IBT and SHSTK are always on.  */
+    if (enable_ibt_type == CET_ALWAYS_ON
+        && enable_shstk_type == CET_ALWAYS_ON)
+ -    return;
+ +    {
+ +      THREAD_SETMEM (THREAD_SELF, header.feature_1,
+ +                  GL(dl_x86_feature_1)[0]);
+ +      return;
+ +    }
+  
+    /* Check if IBT is enabled by kernel.  */
+    bool ibt_enabled
+ diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
+ index 0f55987ae5..bbb5cd356d 100644
+ --- a/sysdeps/x86/tst-get-cpu-features.c
+ +++ b/sysdeps/x86/tst-get-cpu-features.c
+ @@ -176,6 +176,7 @@ do_test (void)
+    CHECK_CPU_FEATURE (AVX512_4VNNIW);
+    CHECK_CPU_FEATURE (AVX512_4FMAPS);
+    CHECK_CPU_FEATURE (FSRM);
+ +  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
+    CHECK_CPU_FEATURE (PCONFIG);
+    CHECK_CPU_FEATURE (IBT);
+    CHECK_CPU_FEATURE (IBRS_IBPB);
+ diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
+ new file mode 100644
+ index 0000000000..e47494011e
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-memchr-rtm.c
+ @@ -0,0 +1,54 @@
+ +/* Test case for memchr inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  string1[100] = 'c';
+ +  string1[STRING_SIZE - 100] = 'c';
+ +  char *p = memchr (string1, 'c', STRING_SIZE);
+ +  if (p == &string1[100])
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  char *p = memchr (string1, 'c', STRING_SIZE);
+ +  if (p == &string1[100])
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("memchr", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
+ new file mode 100644
+ index 0000000000..e4c8a623bb
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-memcmp-rtm.c
+ @@ -0,0 +1,52 @@
+ +/* Test case for memcmp inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +char string2[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  memset (string2, 'a', STRING_SIZE);
+ +  if (memcmp (string1, string2, STRING_SIZE) == 0)
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  if (memcmp (string1, string2, STRING_SIZE) == 0)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("memcmp", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
+ new file mode 100644
+ index 0000000000..4bf97ef1e3
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-memmove-rtm.c
+ @@ -0,0 +1,53 @@
+ +/* Test case for memmove inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +char string2[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  if (memmove (string2, string1, STRING_SIZE) == string2
+ +      && memcmp (string2, string1, STRING_SIZE) == 0)
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  if (memmove (string2, string1, STRING_SIZE) == string2
+ +      && memcmp (string2, string1, STRING_SIZE) == 0)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("memmove", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
+ new file mode 100644
+ index 0000000000..a57a5a8eb9
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-memrchr-rtm.c
+ @@ -0,0 +1,54 @@
+ +/* Test case for memrchr inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  string1[100] = 'c';
+ +  string1[STRING_SIZE - 100] = 'c';
+ +  char *p = memrchr (string1, 'c', STRING_SIZE);
+ +  if (p == &string1[STRING_SIZE - 100])
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  char *p = memrchr (string1, 'c', STRING_SIZE);
+ +  if (p == &string1[STRING_SIZE - 100])
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("memrchr", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
+ new file mode 100644
+ index 0000000000..bf343a4dad
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-memset-rtm.c
+ @@ -0,0 +1,45 @@
+ +/* Test case for memset inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  return EXIT_SUCCESS;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE);
+ +  return 0;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("memset", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-setjmp-cet.c b/sysdeps/x86/tst-setjmp-cet.c
+ new file mode 100644
+ index 0000000000..42c795d2a8
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-setjmp-cet.c
+ @@ -0,0 +1 @@
+ +#include <setjmp/tst-setjmp.c>
+ diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
+ new file mode 100644
+ index 0000000000..a82e29c072
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-strchr-rtm.c
+ @@ -0,0 +1,54 @@
+ +/* Test case for strchr inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE - 1);
+ +  string1[100] = 'c';
+ +  string1[STRING_SIZE - 100] = 'c';
+ +  char *p = strchr (string1, 'c');
+ +  if (p == &string1[100])
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  char *p = strchr (string1, 'c');
+ +  if (p == &string1[100])
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("strchr", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
+ new file mode 100644
+ index 0000000000..2b2a583fb4
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-strcpy-rtm.c
+ @@ -0,0 +1,53 @@
+ +/* Test case for strcpy inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +char string2[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE - 1);
+ +  if (strcpy (string2, string1) == string2
+ +      && strcmp (string2, string1) == 0)
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  if (strcpy (string2, string1) == string2
+ +      && strcmp (string2, string1) == 0)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("strcpy", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
+ new file mode 100644
+ index 0000000000..6ed9eca017
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-string-rtm.h
+ @@ -0,0 +1,72 @@
+ +/* Test string function in a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <string.h>
+ +#include <x86intrin.h>
+ +#include <cpu-features.h>
+ +#include <support/check.h>
+ +#include <support/test-driver.h>
+ +
+ +static int
+ +do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+ +        int (*function) (void))
+ +{
+ +  if (!CPU_FEATURE_USABLE (RTM))
+ +    return EXIT_UNSUPPORTED;
+ +
+ +  int status = prepare ();
+ +  if (status != EXIT_SUCCESS)
+ +    return status;
+ +
+ +  unsigned int i;
+ +  unsigned int naborts = 0;
+ +  unsigned int failed = 0;
+ +  for (i = 0; i < loop; i++)
+ +    {
+ +      failed |= function ();
+ +      if (_xbegin() == _XBEGIN_STARTED)
+ +     {
+ +       failed |= function ();
+ +       _xend();
+ +     }
+ +      else
+ +     {
+ +       failed |= function ();
+ +       ++naborts;
+ +     }
+ +    }
+ +
+ +  if (failed)
+ +    FAIL_EXIT1 ("%s() failed", name);
+ +
+ +  if (naborts)
+ +    {
+ +      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+ +      TSX.  */
+ +      double rate = 100 * ((double) naborts) / ((double) loop);
+ +      if (rate > 5)
+ +     FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+ +                 rate, naborts, loop);
+ +    }
+ +
+ +  return EXIT_SUCCESS;
+ +}
+ +
+ +static int do_test (void);
+ +
+ +#include <support/test-driver.c>
+ diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
+ new file mode 100644
+ index 0000000000..0dcf14db87
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-strlen-rtm.c
+ @@ -0,0 +1,53 @@
+ +/* Test case for strlen inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE - 1);
+ +  string1[STRING_SIZE - 100] = '\0';
+ +  size_t len = strlen (string1);
+ +  if (len == STRING_SIZE - 100)
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  size_t len = strlen (string1);
+ +  if (len == STRING_SIZE - 100)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("strlen", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+ new file mode 100644
+ index 0000000000..aef9866cf2
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-strncmp-rtm.c
+ @@ -0,0 +1,81 @@
+ +/* Test case for strncmp inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <stdint.h>
+ +#include <tst-string-rtm.h>
+ +
+ +#ifdef WIDE
+ +# define CHAR wchar_t
+ +# define MEMSET wmemset
+ +# define STRNCMP wcsncmp
+ +# define TEST_NAME "wcsncmp"
+ +#else /* !WIDE */
+ +# define CHAR char
+ +# define MEMSET memset
+ +# define STRNCMP strncmp
+ +# define TEST_NAME "strncmp"
+ +#endif /* !WIDE */
+ +
+ +
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +CHAR string1[STRING_SIZE];
+ +CHAR string2[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  MEMSET (string1, 'a', STRING_SIZE - 1);
+ +  MEMSET (string2, 'a', STRING_SIZE - 1);
+ +  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function_overflow (void)
+ +{
+ +  if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+ +  if (status != EXIT_SUCCESS)
+ +    return status;
+ +  status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+ +  return status;
+ +}
+ diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
+ new file mode 100644
+ index 0000000000..e32bfaf5f5
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-strrchr-rtm.c
+ @@ -0,0 +1,53 @@
+ +/* Test case for strrchr inside a transactionally executing RTM region.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <tst-string-rtm.h>
+ +
+ +#define LOOP 3000
+ +#define STRING_SIZE 1024
+ +char string1[STRING_SIZE];
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +prepare (void)
+ +{
+ +  memset (string1, 'a', STRING_SIZE - 1);
+ +  string1[STRING_SIZE - 100] = 'c';
+ +  char *p = strrchr (string1, 'c');
+ +  if (p == &string1[STRING_SIZE - 100])
+ +    return EXIT_SUCCESS;
+ +  else
+ +    return EXIT_FAILURE;
+ +}
+ +
+ +__attribute__ ((noinline, noclone))
+ +static int
+ +function (void)
+ +{
+ +  char *p = strrchr (string1, 'c');
+ +  if (p == &string1[STRING_SIZE - 100])
+ +    return 0;
+ +  else
+ +    return 1;
+ +}
+ +
+ +static int
+ +do_test (void)
+ +{
+ +  return do_test_1 ("strrchr", LOOP, prepare, function);
+ +}
+ diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
+ new file mode 100644
+ index 0000000000..bad3b86378
+ --- /dev/null
+ +++ b/sysdeps/x86/tst-wcsncmp-rtm.c
+ @@ -0,0 +1,21 @@
+ +/* Test case for wcsncmp inside a transactionally executing RTM region.
+ +   Copyright (C) 2022 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#define WIDE 1
+ +#include <wchar.h>
+ +#include "tst-strncmp-rtm.c"
+ diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
+ index d51cf03ac9..b1951adce9 100644
+ --- a/sysdeps/x86_64/Makefile
+ +++ b/sysdeps/x86_64/Makefile
+ @@ -20,6 +20,8 @@ endif
+  ifeq ($(subdir),string)
+  sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii
+  gen-as-const-headers += locale-defines.sym
+ +tests += \
+ +  tst-rsi-strlen
+  endif
+  
+  ifeq ($(subdir),elf)
+ @@ -150,6 +152,11 @@ ifeq ($(subdir),csu)
+  gen-as-const-headers += tlsdesc.sym rtld-offsets.sym
+  endif
+  
+ +ifeq ($(subdir),wcsmbs)
+ +tests += \
+ +  tst-rsi-wcslen
+ +endif
+ +
+  $(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os
+       $(make-target-directory)
+       rm -f $@
+ diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
 -old mode 100644
 -new mode 100755
+ index 84f82c2406..fc1840e23f
+ --- a/sysdeps/x86_64/configure
+ +++ b/sysdeps/x86_64/configure
+ @@ -107,39 +107,6 @@ if test x"$build_mathvec" = xnotset; then
+    build_mathvec=yes
+  fi
+  
+ -if test "$static_pie" = yes; then
+ -  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for linker static PIE support" >&5
+ -$as_echo_n "checking for linker static PIE support... " >&6; }
+ -if ${libc_cv_ld_static_pie+:} false; then :
+ -  $as_echo_n "(cached) " >&6
+ -else
+ -  cat > conftest.s <<\EOF
+ -     .text
+ -     .global _start
+ -     .weak foo
+ -_start:
+ -     leaq    foo(%rip), %rax
+ -EOF
+ -  libc_cv_pie_option="-Wl,-pie"
+ -  if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&5'
+ -  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ -  (eval $ac_try) 2>&5
+ -  ac_status=$?
+ -  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ -  test $ac_status = 0; }; }; then
+ -    libc_cv_ld_static_pie=yes
+ -  else
+ -    libc_cv_ld_static_pie=no
+ -  fi
+ -rm -f conftest*
+ -fi
+ -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_ld_static_pie" >&5
+ -$as_echo "$libc_cv_ld_static_pie" >&6; }
+ -  if test "$libc_cv_ld_static_pie" != yes; then
+ -    as_fn_error $? "linker support for static PIE needed" "$LINENO" 5
+ -  fi
+ -fi
+ -
+  $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
+  
+  
+ diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
+ index cdaba0c075..611a7d9ba3 100644
+ --- a/sysdeps/x86_64/configure.ac
+ +++ b/sysdeps/x86_64/configure.ac
+ @@ -53,31 +53,6 @@ if test x"$build_mathvec" = xnotset; then
+    build_mathvec=yes
+  fi
+  
+ -dnl Check if linker supports static PIE with the fix for
+ -dnl
+ -dnl https://sourceware.org/bugzilla/show_bug.cgi?id=21782
+ -dnl
+ -if test "$static_pie" = yes; then
+ -  AC_CACHE_CHECK(for linker static PIE support, libc_cv_ld_static_pie, [dnl
+ -cat > conftest.s <<\EOF
+ -     .text
+ -     .global _start
+ -     .weak foo
+ -_start:
+ -     leaq    foo(%rip), %rax
+ -EOF
+ -  libc_cv_pie_option="-Wl,-pie"
+ -  if AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostartfiles -nostdlib $no_ssp $libc_cv_pie_option -o conftest conftest.s 1>&AS_MESSAGE_LOG_FD); then
+ -    libc_cv_ld_static_pie=yes
+ -  else
+ -    libc_cv_ld_static_pie=no
+ -  fi
+ -rm -f conftest*])
+ -  if test "$libc_cv_ld_static_pie" != yes; then
+ -    AC_MSG_ERROR([linker support for static PIE needed])
+ -  fi
+ -fi
+ -
+  dnl It is always possible to access static and hidden symbols in an
+  dnl position independent way.
+  AC_DEFINE(PI_STATIC_AND_HIDDEN)
+ diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
+ index 8e9baffeb4..74029871d8 100644
+ --- a/sysdeps/x86_64/dl-machine.h
+ +++ b/sysdeps/x86_64/dl-machine.h
+ @@ -315,16 +315,22 @@ elf_machine_rela (struct link_map *map, const ElfW(Rela) *reloc,
+       {
+  # ifndef RTLD_BOOTSTRAP
+         if (sym_map != map
+ -           && sym_map->l_type != lt_executable
+             && !sym_map->l_relocated)
+           {
+             const char *strtab
+               = (const char *) D_PTR (map, l_info[DT_STRTAB]);
+ -           _dl_error_printf ("\
+ +           if (sym_map->l_type == lt_executable)
+ +             _dl_fatal_printf ("\
+ +%s: IFUNC symbol '%s' referenced in '%s' is defined in the executable \
+ +and creates an unsatisfiable circular dependency.\n",
+ +                               RTLD_PROGNAME, strtab + refsym->st_name,
+ +                               map->l_name);
+ +           else
+ +             _dl_error_printf ("\
+  %s: Relink `%s' with `%s' for IFUNC symbol `%s'\n",
+ -                             RTLD_PROGNAME, map->l_name,
+ -                             sym_map->l_name,
+ -                             strtab + refsym->st_name);
+ +                               RTLD_PROGNAME, map->l_name,
+ +                               sym_map->l_name,
+ +                               strtab + refsym->st_name);
+           }
+  # endif
+         value = ((ElfW(Addr) (*) (void)) value) ();
+ diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+ index a5c879d2af..070e5ef90b 100644
+ --- a/sysdeps/x86_64/memchr.S
+ +++ b/sysdeps/x86_64/memchr.S
+ @@ -21,9 +21,11 @@
+  #ifdef USE_AS_WMEMCHR
+  # define MEMCHR              wmemchr
+  # define PCMPEQ              pcmpeqd
+ +# define CHAR_PER_VEC        4
+  #else
+  # define MEMCHR              memchr
+  # define PCMPEQ              pcmpeqb
+ +# define CHAR_PER_VEC        16
+  #endif
+  
+  /* fast SSE2 version with using pmaxub and 64 byte loop */
+ @@ -33,15 +35,14 @@ ENTRY(MEMCHR)
+       movd    %esi, %xmm1
+       mov     %edi, %ecx
+  
+ +#ifdef __ILP32__
+ +     /* Clear the upper 32 bits.  */
+ +     movl    %edx, %edx
+ +#endif
+  #ifdef USE_AS_WMEMCHR
+       test    %RDX_LP, %RDX_LP
+       jz      L(return_null)
+ -     shl     $2, %RDX_LP
+  #else
+ -# ifdef __ILP32__
+ -     /* Clear the upper 32 bits.  */
+ -     movl    %edx, %edx
+ -# endif
+       punpcklbw %xmm1, %xmm1
+       test    %RDX_LP, %RDX_LP
+       jz      L(return_null)
+ @@ -60,13 +61,16 @@ ENTRY(MEMCHR)
+       test    %eax, %eax
+  
+       jnz     L(matches_1)
+ -     sub     $16, %rdx
+ +     sub     $CHAR_PER_VEC, %rdx
+       jbe     L(return_null)
+       add     $16, %rdi
+       and     $15, %ecx
+       and     $-16, %rdi
+ +#ifdef USE_AS_WMEMCHR
+ +     shr     $2, %ecx
+ +#endif
+       add     %rcx, %rdx
+ -     sub     $64, %rdx
+ +     sub     $(CHAR_PER_VEC * 4), %rdx
+       jbe     L(exit_loop)
+       jmp     L(loop_prolog)
+  
+ @@ -77,16 +81,21 @@ L(crosscache):
+       movdqa  (%rdi), %xmm0
+  
+       PCMPEQ  %xmm1, %xmm0
+ -/* Check if there is a match.  */
+ +     /* Check if there is a match.  */
+       pmovmskb %xmm0, %eax
+ -/* Remove the leading bytes.  */
+ +     /* Remove the leading bytes.  */
+       sar     %cl, %eax
+       test    %eax, %eax
+       je      L(unaligned_no_match)
+ -/* Check which byte is a match.  */
+ +     /* Check which byte is a match.  */
+       bsf     %eax, %eax
+ -
+ +#ifdef USE_AS_WMEMCHR
+ +     mov     %eax, %esi
+ +     shr     $2, %esi
+ +     sub     %rsi, %rdx
+ +#else
+       sub     %rax, %rdx
+ +#endif
+       jbe     L(return_null)
+       add     %rdi, %rax
+       add     %rcx, %rax
+ @@ -94,15 +103,18 @@ L(crosscache):
+  
+       .p2align 4
+  L(unaligned_no_match):
+ -        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+ +     /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+          "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+          possible addition overflow.  */
+       neg     %rcx
+       add     $16, %rcx
+ +#ifdef USE_AS_WMEMCHR
+ +     shr     $2, %ecx
+ +#endif
+       sub     %rcx, %rdx
+       jbe     L(return_null)
+       add     $16, %rdi
+ -     sub     $64, %rdx
+ +     sub     $(CHAR_PER_VEC * 4), %rdx
+       jbe     L(exit_loop)
+  
+       .p2align 4
+ @@ -135,7 +147,7 @@ L(loop_prolog):
+       test    $0x3f, %rdi
+       jz      L(align64_loop)
+  
+ -     sub     $64, %rdx
+ +     sub     $(CHAR_PER_VEC * 4), %rdx
+       jbe     L(exit_loop)
+  
+       movdqa  (%rdi), %xmm0
+ @@ -167,11 +179,14 @@ L(loop_prolog):
+       mov     %rdi, %rcx
+       and     $-64, %rdi
+       and     $63, %ecx
+ +#ifdef USE_AS_WMEMCHR
+ +     shr     $2, %ecx
+ +#endif
+       add     %rcx, %rdx
+  
+       .p2align 4
+  L(align64_loop):
+ -     sub     $64, %rdx
+ +     sub     $(CHAR_PER_VEC * 4), %rdx
+       jbe     L(exit_loop)
+       movdqa  (%rdi), %xmm0
+       movdqa  16(%rdi), %xmm2
+ @@ -218,7 +233,7 @@ L(align64_loop):
+  
+       .p2align 4
+  L(exit_loop):
+ -     add     $32, %edx
+ +     add     $(CHAR_PER_VEC * 2), %edx
+       jle     L(exit_loop_32)
+  
+       movdqa  (%rdi), %xmm0
+ @@ -238,7 +253,7 @@ L(exit_loop):
+       pmovmskb %xmm3, %eax
+       test    %eax, %eax
+       jnz     L(matches32_1)
+ -     sub     $16, %edx
+ +     sub     $CHAR_PER_VEC, %edx
+       jle     L(return_null)
+  
+       PCMPEQ  48(%rdi), %xmm1
+ @@ -250,13 +265,13 @@ L(exit_loop):
+  
+       .p2align 4
+  L(exit_loop_32):
+ -     add     $32, %edx
+ +     add     $(CHAR_PER_VEC * 2), %edx
+       movdqa  (%rdi), %xmm0
+       PCMPEQ  %xmm1, %xmm0
+       pmovmskb %xmm0, %eax
+       test    %eax, %eax
+       jnz     L(matches_1)
+ -     sub     $16, %edx
+ +     sub     $CHAR_PER_VEC, %edx
+       jbe     L(return_null)
+  
+       PCMPEQ  16(%rdi), %xmm1
+ @@ -293,7 +308,13 @@ L(matches32):
+       .p2align 4
+  L(matches_1):
+       bsf     %eax, %eax
+ +#ifdef USE_AS_WMEMCHR
+ +     mov     %eax, %esi
+ +     shr     $2, %esi
+ +     sub     %rsi, %rdx
+ +#else
+       sub     %rax, %rdx
+ +#endif
+       jbe     L(return_null)
+       add     %rdi, %rax
+       ret
+ @@ -301,7 +322,13 @@ L(matches_1):
+       .p2align 4
+  L(matches16_1):
+       bsf     %eax, %eax
+ +#ifdef USE_AS_WMEMCHR
+ +     mov     %eax, %esi
+ +     shr     $2, %esi
+ +     sub     %rsi, %rdx
+ +#else
+       sub     %rax, %rdx
+ +#endif
+       jbe     L(return_null)
+       lea     16(%rdi, %rax), %rax
+       ret
+ @@ -309,7 +336,13 @@ L(matches16_1):
+       .p2align 4
+  L(matches32_1):
+       bsf     %eax, %eax
+ +#ifdef USE_AS_WMEMCHR
+ +     mov     %eax, %esi
+ +     shr     $2, %esi
+ +     sub     %rsi, %rdx
+ +#else
+       sub     %rax, %rdx
+ +#endif
+       jbe     L(return_null)
+       lea     32(%rdi, %rax), %rax
+       ret
+ @@ -317,7 +350,13 @@ L(matches32_1):
+       .p2align 4
+  L(matches48_1):
+       bsf     %eax, %eax
+ +#ifdef USE_AS_WMEMCHR
+ +     mov     %eax, %esi
+ +     shr     $2, %esi
+ +     sub     %rsi, %rdx
+ +#else
+       sub     %rax, %rdx
+ +#endif
+       jbe     L(return_null)
+       lea     48(%rdi, %rax), %rax
+       ret
+ diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+ index 395e432c09..da1446d731 100644
+ --- a/sysdeps/x86_64/multiarch/Makefile
+ +++ b/sysdeps/x86_64/multiarch/Makefile
+ @@ -43,7 +43,45 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+                  memmove-avx512-unaligned-erms \
+                  memset-sse2-unaligned-erms \
+                  memset-avx2-unaligned-erms \
+ -                memset-avx512-unaligned-erms
+ +                memset-avx512-unaligned-erms \
+ +                memchr-avx2-rtm \
+ +                memcmp-avx2-movbe-rtm \
+ +                memmove-avx-unaligned-erms-rtm \
+ +                memrchr-avx2-rtm \
+ +                memset-avx2-unaligned-erms-rtm \
+ +                rawmemchr-avx2-rtm \
+ +                strchr-avx2-rtm \
+ +                strcmp-avx2-rtm \
+ +                strchrnul-avx2-rtm \
+ +                stpcpy-avx2-rtm \
+ +                stpncpy-avx2-rtm \
+ +                strcat-avx2-rtm \
+ +                strcpy-avx2-rtm \
+ +                strlen-avx2-rtm \
+ +                strncat-avx2-rtm \
+ +                strncmp-avx2-rtm \
+ +                strncpy-avx2-rtm \
+ +                strnlen-avx2-rtm \
+ +                strrchr-avx2-rtm \
+ +                memchr-evex \
+ +                memcmp-evex-movbe \
+ +                memmove-evex-unaligned-erms \
+ +                memrchr-evex \
+ +                memset-evex-unaligned-erms \
+ +                rawmemchr-evex \
+ +                stpcpy-evex \
+ +                stpncpy-evex \
+ +                strcat-evex \
+ +                strchr-evex \
+ +                strchrnul-evex \
+ +                strcmp-evex \
+ +                strcpy-evex \
+ +                strlen-evex \
+ +                strncat-evex \
+ +                strncmp-evex \
+ +                strncpy-evex \
+ +                strnlen-evex \
+ +                strrchr-evex
+  CFLAGS-varshift.c += -msse4
+  CFLAGS-strcspn-c.c += -msse4
+  CFLAGS-strpbrk-c.c += -msse4
+ @@ -59,8 +97,24 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+                  wcscpy-ssse3 wcscpy-c \
+                  wcschr-sse2 wcschr-avx2 \
+                  wcsrchr-sse2 wcsrchr-avx2 \
+ -                wcsnlen-sse4_1 wcsnlen-c \
+ -                wcslen-sse2 wcslen-avx2 wcsnlen-avx2
+ +                wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ +                wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ +                wcschr-avx2-rtm \
+ +                wcscmp-avx2-rtm \
+ +                wcslen-avx2-rtm \
+ +                wcsncmp-avx2-rtm \
+ +                wcsnlen-avx2-rtm \
+ +                wcsrchr-avx2-rtm \
+ +                wmemchr-avx2-rtm \
+ +                wmemcmp-avx2-movbe-rtm \
+ +                wcschr-evex \
+ +                wcscmp-evex \
+ +                wcslen-evex \
+ +                wcsncmp-evex \
+ +                wcsnlen-evex \
+ +                wcsrchr-evex \
+ +                wmemchr-evex \
+ +                wmemcmp-evex-movbe
+  endif
+  
+  ifeq ($(subdir),debug)
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+ index 69f30398ae..925e5b61eb 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+ @@ -21,16 +21,28 @@
+  
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+  
+  static inline void *
+  IFUNC_SELECTOR (void)
+  {
+    const struct cpu_features* cpu_features = __get_cpu_features ();
+  
+ -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ -      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +      && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ -    return OPTIMIZE (avx2);
+ +    {
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     return OPTIMIZE (evex);
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     return OPTIMIZE (avx2);
+ +    }
+  
+    return OPTIMIZE (sse2);
+  }
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ index ce7eb1eecf..e712b148f5 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ @@ -41,8 +41,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/memchr.c.  */
+    IFUNC_IMPL (i, name, memchr,
+             IFUNC_IMPL_ADD (array, i, memchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __memchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, memchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __memchr_evex)
+             IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+ @@ -51,6 +62,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             (HAS_ARCH_FEATURE (AVX2_Usable)
+                              && HAS_CPU_FEATURE (MOVBE)),
+                             __memcmp_avx2_movbe)
+ +           IFUNC_IMPL_ADD (array, i, memcmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (MOVBE)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memcmp_avx2_movbe_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memcmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (MOVBE)),
+ +                           __memcmp_evex_movbe)
+             IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
+                             __memcmp_sse4_1)
+             IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
+ @@ -64,10 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_chk_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_chk_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+ @@ -75,6 +96,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_chk_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memmove_chk_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memmove_chk_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memmove_chk_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memmove_chk_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memmove_chk,
+                             HAS_CPU_FEATURE (SSSE3),
+                             __memmove_chk_ssse3_back)
+ @@ -97,14 +132,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memmove_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, memmove,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memmove_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memmove,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memmove_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memmove,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memmove_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, memmove,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memmove_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memmove,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memmove_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, memmove,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, memmove,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memmove_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
+                             __memmove_ssse3_back)
+ @@ -119,8 +168,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/memrchr.c.  */
+    IFUNC_IMPL (i, name, memrchr,
+             IFUNC_IMPL_ADD (array, i, memrchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __memrchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, memrchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memrchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memrchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __memrchr_evex)
+ +
+             IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+  
+  #ifdef SHARED
+ @@ -139,10 +200,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_chk_avx2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memset_chk_avx2_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memset_chk,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memset_chk_avx2_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memset_chk,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __memset_chk_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, __memset_chk,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __memset_chk_evex_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, __memset_chk,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+                             __memset_chk_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+                             __memset_chk_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memset_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+ @@ -164,10 +243,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __memset_avx2_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memset,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memset_avx2_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memset,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memset_avx2_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memset,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __memset_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, memset,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __memset_evex_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, memset,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+                             __memset_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memset,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+                             __memset_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, memset,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+ @@ -177,22 +274,55 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
+    IFUNC_IMPL (i, name, rawmemchr,
+             IFUNC_IMPL_ADD (array, i, rawmemchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __rawmemchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, rawmemchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __rawmemchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, rawmemchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __rawmemchr_evex)
+             IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+    IFUNC_IMPL (i, name, strlen,
+             IFUNC_IMPL_ADD (array, i, strlen,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __strlen_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strlen,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strlen_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strlen,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strlen_evex)
+             IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+    IFUNC_IMPL (i, name, strnlen,
+             IFUNC_IMPL_ADD (array, i, strnlen,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __strnlen_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strnlen,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strnlen_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strnlen,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strnlen_evex)
+             IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
+ @@ -201,6 +331,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             __stpncpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __stpncpy_avx2)
+ +           IFUNC_IMPL_ADD (array, i, stpncpy,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __stpncpy_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, stpncpy,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __stpncpy_evex)
+             IFUNC_IMPL_ADD (array, i, stpncpy, 1,
+                             __stpncpy_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+ @@ -211,6 +349,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             __stpcpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __stpcpy_avx2)
+ +           IFUNC_IMPL_ADD (array, i, stpcpy,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __stpcpy_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, stpcpy,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __stpcpy_evex)
+             IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
+  
+ @@ -245,6 +391,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    IFUNC_IMPL (i, name, strcat,
+             IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strcat_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strcat,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strcat_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strcat,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __strcat_evex)
+             IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
+                             __strcat_ssse3)
+             IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
+ @@ -253,23 +407,56 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/strchr.c.  */
+    IFUNC_IMPL (i, name, strchr,
+             IFUNC_IMPL_ADD (array, i, strchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __strchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strchr_evex)
+             IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
+             IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
+    IFUNC_IMPL (i, name, strchrnul,
+             IFUNC_IMPL_ADD (array, i, strchrnul,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __strchrnul_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strchrnul,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strchrnul_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strchrnul,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strchrnul_evex)
+             IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/strrchr.c.  */
+    IFUNC_IMPL (i, name, strrchr,
+             IFUNC_IMPL_ADD (array, i, strrchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __strrchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strrchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strrchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strrchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strrchr_evex)
+             IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/strcmp.c.  */
+ @@ -277,6 +464,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, strcmp,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strcmp_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strcmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strcmp_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strcmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __strcmp_evex)
+             IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
+                             __strcmp_sse42)
+             IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
+ @@ -288,6 +484,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    IFUNC_IMPL (i, name, strcpy,
+             IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strcpy_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strcpy,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strcpy_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strcpy,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __strcpy_evex)
+             IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
+                             __strcpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
+ @@ -331,6 +535,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    IFUNC_IMPL (i, name, strncat,
+             IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strncat_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strncat,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strncat_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strncat,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __strncat_evex)
+             IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
+                             __strncat_ssse3)
+             IFUNC_IMPL_ADD (array, i, strncat, 1,
+ @@ -341,6 +553,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    IFUNC_IMPL (i, name, strncpy,
+             IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strncpy_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strncpy,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strncpy_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strncpy,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __strncpy_evex)
+             IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
+                             __strncpy_ssse3)
+             IFUNC_IMPL_ADD (array, i, strncpy, 1,
+ @@ -368,29 +588,73 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
+    IFUNC_IMPL (i, name, wcschr,
+             IFUNC_IMPL_ADD (array, i, wcschr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcschr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcschr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcschr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcschr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcschr_evex)
+             IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wcsrchr.c.  */
+    IFUNC_IMPL (i, name, wcsrchr,
+             IFUNC_IMPL_ADD (array, i, wcsrchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcsrchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcsrchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcsrchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcsrchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcsrchr_evex)
+             IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wcscmp.c.  */
+    IFUNC_IMPL (i, name, wcscmp,
+             IFUNC_IMPL_ADD (array, i, wcscmp,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcscmp_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcscmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcscmp_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcscmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcscmp_evex)
+             IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wcsncmp.c.  */
+    IFUNC_IMPL (i, name, wcsncmp,
+             IFUNC_IMPL_ADD (array, i, wcsncmp,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcsncmp_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcsncmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcsncmp_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcsncmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcsncmp_evex)
+             IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wcscpy.c.  */
+ @@ -402,15 +666,40 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
+    IFUNC_IMPL (i, name, wcslen,
+             IFUNC_IMPL_ADD (array, i, wcslen,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcslen_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcslen,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcslen_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcslen,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcslen_evex)
+ +           IFUNC_IMPL_ADD (array, i, wcslen,
+ +                           CPU_FEATURE_USABLE (SSE4_1),
+ +                           __wcslen_sse4_1)
+             IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+    IFUNC_IMPL (i, name, wcsnlen,
+             IFUNC_IMPL_ADD (array, i, wcsnlen,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wcsnlen_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wcsnlen,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wcsnlen_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wcsnlen,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wcsnlen_evex)
+             IFUNC_IMPL_ADD (array, i, wcsnlen,
+                             HAS_CPU_FEATURE (SSE4_1),
+                             __wcsnlen_sse4_1)
+ @@ -419,8 +708,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+    /* Support sysdeps/x86_64/multiarch/wmemchr.c.  */
+    IFUNC_IMPL (i, name, wmemchr,
+             IFUNC_IMPL_ADD (array, i, wmemchr,
+ -                           HAS_ARCH_FEATURE (AVX2_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+                             __wmemchr_avx2)
+ +           IFUNC_IMPL_ADD (array, i, wmemchr,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wmemchr_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wmemchr,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (BMI2)),
+ +                           __wmemchr_evex)
+             IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+  
+    /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+ @@ -429,6 +729,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             (HAS_ARCH_FEATURE (AVX2_Usable)
+                              && HAS_CPU_FEATURE (MOVBE)),
+                             __wmemcmp_avx2_movbe)
+ +           IFUNC_IMPL_ADD (array, i, wmemcmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (MOVBE)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wmemcmp_avx2_movbe_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wmemcmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)
+ +                            && HAS_CPU_FEATURE (MOVBE)),
+ +                           __wmemcmp_evex_movbe)
+             IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
+                             __wmemcmp_sse4_1)
+             IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
+ @@ -443,7 +753,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __wmemset_avx2_unaligned)
+             IFUNC_IMPL_ADD (array, i, wmemset,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __wmemset_avx2_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, wmemset,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __wmemset_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, wmemset,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __wmemset_avx512_unaligned))
+  
+  #ifdef SHARED
+ @@ -453,10 +770,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_chk_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_chk_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+ @@ -464,6 +781,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_chk_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memcpy_chk_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memcpy_chk_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memcpy_chk_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memcpy_chk_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+                             HAS_CPU_FEATURE (SSSE3),
+                             __memcpy_chk_ssse3_back)
+ @@ -486,6 +817,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, memcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __memcpy_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, memcpy,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memcpy_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memcpy,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __memcpy_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, memcpy,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memcpy_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, memcpy,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __memcpy_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+                             __memcpy_ssse3_back)
+             IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
+ @@ -494,10 +839,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __memcpy_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, memcpy,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __memcpy_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+             IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ @@ -511,10 +856,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_chk_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_chk_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_chk_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+ @@ -522,6 +867,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_chk_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __mempcpy_chk_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __mempcpy_chk_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __mempcpy_chk_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __mempcpy_chk_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+                             HAS_CPU_FEATURE (SSSE3),
+                             __mempcpy_chk_ssse3_back)
+ @@ -542,10 +901,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __mempcpy_avx512_no_vzeroupper)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_avx512_unaligned)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+ -                           HAS_ARCH_FEATURE (AVX512F_Usable),
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+                             __mempcpy_avx512_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+ @@ -553,6 +912,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, mempcpy,
+                             HAS_ARCH_FEATURE (AVX_Usable),
+                             __mempcpy_avx_unaligned_erms)
+ +           IFUNC_IMPL_ADD (array, i, mempcpy,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __mempcpy_avx_unaligned_rtm)
+ +           IFUNC_IMPL_ADD (array, i, mempcpy,
+ +                           (HAS_ARCH_FEATURE (AVX_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __mempcpy_avx_unaligned_erms_rtm)
+ +           IFUNC_IMPL_ADD (array, i, mempcpy,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __mempcpy_evex_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, mempcpy,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __mempcpy_evex_unaligned_erms)
+             IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+                             __mempcpy_ssse3_back)
+             IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
+ @@ -568,6 +941,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, strncmp,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __strncmp_avx2)
+ +           IFUNC_IMPL_ADD (array, i, strncmp,
+ +                           (HAS_ARCH_FEATURE (AVX2_Usable)
+ +                            && HAS_CPU_FEATURE (RTM)),
+ +                           __strncmp_avx2_rtm)
+ +           IFUNC_IMPL_ADD (array, i, strncmp,
+ +                           (HAS_ARCH_FEATURE (AVX512VL_Usable)
+ +                            && HAS_ARCH_FEATURE (AVX512BW_Usable)),
+ +                           __strncmp_evex)
+             IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
+                             __strncmp_sse42)
+             IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
+ @@ -582,6 +963,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+                             HAS_ARCH_FEATURE (AVX2_Usable),
+                             __wmemset_chk_avx2_unaligned)
+ +           IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ +                           HAS_ARCH_FEATURE (AVX512VL_Usable),
+ +                           __wmemset_chk_evex_unaligned)
+             IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+                             HAS_ARCH_FEATURE (AVX512F_Usable),
+                             __wmemset_chk_avx512_unaligned))
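
The entries added above do not affect runtime dispatch by themselves: __libc_ifunc_impl_list is what the glibc string/wcsmbs tests and benchtests walk in order to exercise every implementation whose feature predicate holds on the running machine, so each new *_avx2_rtm and *_evex variant has to be registered here with the same preconditions the selectors enforce. A tiny sketch of how one of the new predicates reads, with hypothetical booleans standing in for HAS_ARCH_FEATURE/HAS_CPU_FEATURE:

    #include <stdbool.h>

    /* Hypothetical mirror of the new __memchr_evex entry above: the
       implementation is only advertised when AVX512VL, AVX512BW and BMI2
       are all usable.  */
    bool
    memchr_evex_advertised (bool avx512vl, bool avx512bw, bool bmi2)
    {
      return avx512vl && avx512bw && bmi2;
    }
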
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+ index c14db39cf4..ebbb0c01cf 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+ @@ -23,17 +23,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+  
+  static inline void *
+  IFUNC_SELECTOR (void)
+  {
+    const struct cpu_features* cpu_features = __get_cpu_features ();
+  
+ -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ -      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+        && CPU_FEATURES_CPU_P (cpu_features, MOVBE)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ -    return OPTIMIZE (avx2_movbe);
+ +    {
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     return OPTIMIZE (evex_movbe);
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_movbe_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     return OPTIMIZE (avx2_movbe);
+ +    }
+  
+    if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
+      return OPTIMIZE (sse4_1);
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+ index 81673d2019..dfc5a28487 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+ @@ -29,6 +29,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+    attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ +  attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+    attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+ @@ -48,21 +56,42 @@ IFUNC_SELECTOR (void)
+    if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+      {
+ -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ -     return OPTIMIZE (avx512_no_vzeroupper);
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ +     {
+ +     if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx512_unaligned_erms);
+  
+ -      if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ -     return OPTIMIZE (avx512_unaligned_erms);
+ +       return OPTIMIZE (avx512_unaligned);
+ +     }
+  
+ -      return OPTIMIZE (avx512_unaligned);
+ +      return OPTIMIZE (avx512_no_vzeroupper);
+      }
+  
+    if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+      {
+ -      if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ -     return OPTIMIZE (avx_unaligned_erms);
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (evex_unaligned_erms);
+ +
+ +       return OPTIMIZE (evex_unaligned);
+ +     }
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx_unaligned_erms_rtm);
+ +
+ +       return OPTIMIZE (avx_unaligned_rtm);
+ +     }
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx_unaligned_erms);
+  
+ -      return OPTIMIZE (avx_unaligned);
+ +       return OPTIMIZE (avx_unaligned);
+ +     }
+      }
+  
+    if (!CPU_FEATURES_CPU_P (cpu_features, SSSE3)
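
For memmove (and the analogous memset change below) the selector additionally branches on ERMS, and the AVX512 block is reorganised: the 512-bit unaligned variants now require AVX512VL, with avx512_no_vzeroupper kept only for AVX512F machines that lack it, while the AVX tier gains EVEX and RTM sub-cases. A standalone sketch of the new order, again with hypothetical booleans rather than the real macros; the SSSE3/ERMS/SSE2 fallbacks in the unchanged tail are collapsed into one placeholder return:

    #include <stdbool.h>

    /* Hypothetical model of the reworked memmove selection above.  */
    const char *
    select_memmove (bool avx512f, bool prefer_no_avx512, bool avx512vl,
                    bool erms, bool avx_fast_unaligned, bool rtm,
                    bool prefer_no_vzeroupper)
    {
      if (avx512f && !prefer_no_avx512)
        {
          if (avx512vl)
            return erms ? "avx512_unaligned_erms" : "avx512_unaligned";
          return "avx512_no_vzeroupper";
        }

      if (avx_fast_unaligned)
        {
          if (avx512vl)
            return erms ? "evex_unaligned_erms" : "evex_unaligned";
          if (rtm)
            return erms ? "avx_unaligned_erms_rtm" : "avx_unaligned_rtm";
          if (!prefer_no_vzeroupper)
            return erms ? "avx_unaligned_erms" : "avx_unaligned";
        }

      return "ssse3/erms/sse2 fallbacks (unchanged)";
    }
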
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+ index d690293385..48fdb24b02 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+ @@ -27,6 +27,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+    attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ +  attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+    attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+ @@ -45,21 +53,44 @@ IFUNC_SELECTOR (void)
+    if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+      {
+ -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ -     return OPTIMIZE (avx512_no_vzeroupper);
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx512_unaligned_erms);
+  
+ -      if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ -     return OPTIMIZE (avx512_unaligned_erms);
+ +       return OPTIMIZE (avx512_unaligned);
+ +     }
+  
+ -      return OPTIMIZE (avx512_unaligned);
+ +      return OPTIMIZE (avx512_no_vzeroupper);
+      }
+  
+    if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+      {
+ -      if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ -     return OPTIMIZE (avx2_unaligned_erms);
+ -      else
+ -     return OPTIMIZE (avx2_unaligned);
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (evex_unaligned_erms);
+ +
+ +       return OPTIMIZE (evex_unaligned);
+ +     }
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx2_unaligned_erms_rtm);
+ +
+ +       return OPTIMIZE (avx2_unaligned_rtm);
+ +     }
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     {
+ +       if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ +         return OPTIMIZE (avx2_unaligned_erms);
+ +
+ +       return OPTIMIZE (avx2_unaligned);
+ +     }
+      }
+  
+    if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+ index ae4f451803..f38a3b7501 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+ @@ -25,16 +25,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
+    attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+  
+  static inline void *
+  IFUNC_SELECTOR (void)
+  {
+    const struct cpu_features* cpu_features = __get_cpu_features ();
+  
+ -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ -      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ -    return OPTIMIZE (avx2);
+ +    {
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     return OPTIMIZE (evex);
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     return OPTIMIZE (avx2);
+ +    }
+  
+    if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+      return OPTIMIZE (sse2_unaligned);
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ new file mode 100644
+ index 0000000000..564cc8cbec
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ @@ -0,0 +1,52 @@
+ +/* Common definition for ifunc selections for wcslen and wcsnlen
+ +   All versions must be listed in ifunc-impl-list.c.
+ +   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#include <init-arch.h>
+ +
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+ +
+ +static inline void *
+ +IFUNC_SELECTOR (void)
+ +{
+ +  const struct cpu_features* cpu_features = __get_cpu_features ();
+ +
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +      && CPU_FEATURES_CPU_P (cpu_features, BMI2)
+ +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ +    {
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
+ +     return OPTIMIZE (evex);
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     return OPTIMIZE (avx2);
+ +    }
+ +
+ +  if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1))
+ +    return OPTIMIZE (sse4_1);
+ +
+ +  return OPTIMIZE (sse2);
+ +}
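
ifunc-wcslen.h is a new header shared by the wcslen and wcsnlen wrappers; REDIRECT_NAME, OPTIMIZE and IFUNC_SELECTOR are token-pasting macros built from a SYMBOL_NAME that the including .c file is expected to define, so the same selector body yields __wcslen_* or __wcsnlen_* names as appropriate. A self-contained toy demonstrating that naming convention (the macros below are simplified stand-ins, not the real init-arch.h definitions):

    #include <stdio.h>

    /* Simplified stand-ins for the OPTIMIZE-style name pasting.  */
    #define SYMBOL_NAME wcsnlen
    #define PASTE(sym, variant)    __ ## sym ## _ ## variant
    #define EXPAND(sym, variant)   PASTE (sym, variant)
    #define OPTIMIZE_NAME(variant) EXPAND (SYMBOL_NAME, variant)

    #define STR_(x) #x
    #define STR(x)  STR_ (x)

    int
    main (void)
    {
      puts (STR (OPTIMIZE_NAME (evex)));      /* prints "__wcsnlen_evex" */
      puts (STR (OPTIMIZE_NAME (avx2_rtm)));  /* prints "__wcsnlen_avx2_rtm" */
      return 0;
    }
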
+ diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+ index 583f6310a1..0ce29a229d 100644
+ --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+ +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+ @@ -20,6 +20,9 @@
+  
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
+ +  attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+  
+  static inline void *
+ @@ -27,14 +30,21 @@ IFUNC_SELECTOR (void)
+  {
+    const struct cpu_features* cpu_features = __get_cpu_features ();
+  
+ -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ -      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+      {
+ -      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
+ -       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ -     return OPTIMIZE (avx512_unaligned);
+ -      else
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
+ +     {
+ +       if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ +         return OPTIMIZE (avx512_unaligned);
+ +
+ +       return OPTIMIZE (evex_unaligned);
+ +     }
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_unaligned_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+       return OPTIMIZE (avx2_unaligned);
+      }
+  
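
wmemset differs slightly from the other selectors: when AVX512VL is usable it still prefers the full-width avx512_unaligned variant and only falls back to the 256-bit EVEX one when the CPU sets Prefer_No_AVX512 (typically to avoid AVX-512 frequency penalties). A compact model of that choice, with hypothetical booleans:

    #include <stdbool.h>

    /* Hypothetical model of the wmemset choice above.  */
    const char *
    select_wmemset (bool avx2, bool avx_fast_unaligned, bool avx512vl,
                    bool prefer_no_avx512, bool rtm, bool prefer_no_vzeroupper)
    {
      if (avx2 && avx_fast_unaligned)
        {
          if (avx512vl)
            return prefer_no_avx512 ? "evex_unaligned" : "avx512_unaligned";
          if (rtm)
            return "avx2_unaligned_rtm";
          if (!prefer_no_vzeroupper)
            return "avx2_unaligned";
        }
      return "sse2_unaligned";
    }
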
+ diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..87b076c7c4
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef MEMCHR
+ +# define MEMCHR __memchr_avx2_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "memchr-avx2.S"
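
memchr-avx2-rtm.S shows the pattern used by all the new *_rtm variants: rename the entry point, place the code in its own .avx.rtm text section, swap the return macro for ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST, and reuse the existing AVX2 body via #include. The upstream rationale is that VZEROUPPER aborts an RTM transaction while VZEROALL does not, so the RTM-safe return checks XTEST and picks the safe instruction. A rough C-level model of that return path (a hypothetical helper, not the real macro, which is an assembly macro defined elsewhere in glibc; building it needs -mavx -mrtm, and running _xtest needs RTM-capable hardware):

    #include <immintrin.h>

    /* Hypothetical model of the RTM-safe "zero upper YMM state" step.  */
    static inline void
    zero_upper_rtm_safe (void)
    {
      if (_xtest ())            /* currently inside an RTM/HLE transaction */
        _mm256_zeroall ();      /* VZEROALL is transaction-safe */
      else
        _mm256_zeroupper ();    /* the usual VZEROUPPER on the non-RTM path */
    }
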
+ diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+ index e5a9abd211..0987616a1b 100644
+ --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+ +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+ @@ -26,319 +26,407 @@
+  
+  # ifdef USE_AS_WMEMCHR
+  #  define VPCMPEQ    vpcmpeqd
+ +#  define VPBROADCAST        vpbroadcastd
+ +#  define CHAR_SIZE  4
+  # else
+  #  define VPCMPEQ    vpcmpeqb
+ +#  define VPBROADCAST        vpbroadcastb
+ +#  define CHAR_SIZE  1
+ +# endif
+ +
+ +# ifdef USE_AS_RAWMEMCHR
+ +#  define ERAW_PTR_REG       ecx
+ +#  define RRAW_PTR_REG       rcx
+ +#  define ALGN_PTR_REG       rdi
+ +# else
+ +#  define ERAW_PTR_REG       edi
+ +#  define RRAW_PTR_REG       rdi
+ +#  define ALGN_PTR_REG       rcx
+  # endif
+  
+  # ifndef VZEROUPPER
+  #  define VZEROUPPER vzeroupper
+  # endif
+  
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+  # define VEC_SIZE 32
+ +# define PAGE_SIZE 4096
+ +# define CHAR_PER_VEC        (VEC_SIZE / CHAR_SIZE)
+  
+ -     .section .text.avx,"ax",@progbits
+ +     .section SECTION(.text),"ax",@progbits
+  ENTRY (MEMCHR)
+  # ifndef USE_AS_RAWMEMCHR
+       /* Check for zero length.  */
+ +#  ifdef __ILP32__
+ +     /* Clear upper bits.  */
+ +     and     %RDX_LP, %RDX_LP
+ +#  else
+       test    %RDX_LP, %RDX_LP
+ +#  endif
+       jz      L(null)
+  # endif
+ -     movl    %edi, %ecx
+ -     /* Broadcast CHAR to YMM0.  */
+ +     /* Broadcast CHAR to YMMMATCH.  */
+       vmovd   %esi, %xmm0
+ -# ifdef USE_AS_WMEMCHR
+ -     shl     $2, %RDX_LP
+ -     vpbroadcastd %xmm0, %ymm0
+ -# else
+ -#  ifdef __ILP32__
+ -     /* Clear the upper 32 bits.  */
+ -     movl    %edx, %edx
+ -#  endif
+ -     vpbroadcastb %xmm0, %ymm0
+ -# endif
+ +     VPBROADCAST %xmm0, %ymm0
+       /* Check if we may cross page boundary with one vector load.  */
+ -     andl    $(2 * VEC_SIZE - 1), %ecx
+ -     cmpl    $VEC_SIZE, %ecx
+ -     ja      L(cros_page_boundary)
+ +     movl    %edi, %eax
+ +     andl    $(PAGE_SIZE - 1), %eax
+ +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+ +     ja      L(cross_page_boundary)
+  
+       /* Check the first VEC_SIZE bytes.  */
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ +     VPCMPEQ (%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ -
+  # ifndef USE_AS_RAWMEMCHR
+ -     jnz     L(first_vec_x0_check)
+ -     /* Adjust length and check the end of data.  */
+ -     subq    $VEC_SIZE, %rdx
+ -     jbe     L(zero)
+ -# else
+ -     jnz     L(first_vec_x0)
+ +     /* If length < CHAR_PER_VEC handle special.  */
+ +     cmpq    $CHAR_PER_VEC, %rdx
+ +     jbe     L(first_vec_x0)
+  # endif
+ -
+ -     /* Align data for aligned loads in the loop.  */
+ -     addq    $VEC_SIZE, %rdi
+ -     andl    $(VEC_SIZE - 1), %ecx
+ -     andq    $-VEC_SIZE, %rdi
+ +     testl   %eax, %eax
+ +     jz      L(aligned_more)
+ +     tzcntl  %eax, %eax
+ +     addq    %rdi, %rax
+ +     VZEROUPPER_RETURN
+  
+  # ifndef USE_AS_RAWMEMCHR
+ -     /* Adjust length.  */
+ -     addq    %rcx, %rdx
+ +     .p2align 5
+ +L(first_vec_x0):
+ +     /* Check if first match was before length.  */
+ +     tzcntl  %eax, %eax
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply length by 4 to get byte count.  */
+ +     sall    $2, %edx
+ +#  endif
+ +     xorl    %ecx, %ecx
+ +     cmpl    %eax, %edx
+ +     leaq    (%rdi, %rax), %rax
+ +     cmovle  %rcx, %rax
+ +     VZEROUPPER_RETURN
+  
+ -     subq    $(VEC_SIZE * 4), %rdx
+ -     jbe     L(last_4x_vec_or_less)
+ +L(null):
+ +     xorl    %eax, %eax
+ +     ret
+  # endif
+ -     jmp     L(more_4x_vec)
+ -
+       .p2align 4
+ -L(cros_page_boundary):
+ -     andl    $(VEC_SIZE - 1), %ecx
+ -     andq    $-VEC_SIZE, %rdi
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ +L(cross_page_boundary):
+ +     /* Save pointer before aligning as its original value is
+ +        necessary for computing the return address if a byte is found or
+ +        adjusting length if it is not and this is memchr.  */
+ +     movq    %rdi, %rcx
+ +     /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ +        and rdi for rawmemchr.  */
+ +     orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
+ +     VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Calculate length until end of page (length checked for a
+ +        match).  */
+ +     leaq    1(%ALGN_PTR_REG), %rsi
+ +     subq    %RRAW_PTR_REG, %rsi
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Divide bytes by 4 to get wchar_t count.  */
+ +     shrl    $2, %esi
+ +#  endif
+ +# endif
+       /* Remove the leading bytes.  */
+ -     sarl    %cl, %eax
+ -     testl   %eax, %eax
+ -     jz      L(aligned_more)
+ -     tzcntl  %eax, %eax
+ +     sarxl   %ERAW_PTR_REG, %eax, %eax
+  # ifndef USE_AS_RAWMEMCHR
+       /* Check the end of data.  */
+ -     cmpq    %rax, %rdx
+ -     jbe     L(zero)
+ +     cmpq    %rsi, %rdx
+ +     jbe     L(first_vec_x0)
+  # endif
+ +     testl   %eax, %eax
+ +     jz      L(cross_page_continue)
+ +     tzcntl  %eax, %eax
+ +     addq    %RRAW_PTR_REG, %rax
+ +L(return_vzeroupper):
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+ +
+ +     .p2align 4
+ +L(first_vec_x1):
+ +     tzcntl  %eax, %eax
+ +     incq    %rdi
+       addq    %rdi, %rax
+ -     addq    %rcx, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+ -L(aligned_more):
+ -# ifndef USE_AS_RAWMEMCHR
+ -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+ -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+ -        overflow.  */
+ -     negq    %rcx
+ -     addq    $VEC_SIZE, %rcx
+ +L(first_vec_x2):
+ +     tzcntl  %eax, %eax
+ +     addq    $(VEC_SIZE + 1), %rdi
+ +     addq    %rdi, %rax
+ +     VZEROUPPER_RETURN
+  
+ -     /* Check the end of data.  */
+ -     subq    %rcx, %rdx
+ -     jbe     L(zero)
+ -# endif
+ +     .p2align 4
+ +L(first_vec_x3):
+ +     tzcntl  %eax, %eax
+ +     addq    $(VEC_SIZE * 2 + 1), %rdi
+ +     addq    %rdi, %rax
+ +     VZEROUPPER_RETURN
+  
+ -     addq    $VEC_SIZE, %rdi
+  
+ -# ifndef USE_AS_RAWMEMCHR
+ -     subq    $(VEC_SIZE * 4), %rdx
+ -     jbe     L(last_4x_vec_or_less)
+ -# endif
+ +     .p2align 4
+ +L(first_vec_x4):
+ +     tzcntl  %eax, %eax
+ +     addq    $(VEC_SIZE * 3 + 1), %rdi
+ +     addq    %rdi, %rax
+ +     VZEROUPPER_RETURN
+  
+ -L(more_4x_vec):
+ +     .p2align 4
+ +L(aligned_more):
+       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+          since data is only aligned to VEC_SIZE.  */
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ -     vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(first_vec_x0)
+  
+ -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ +# ifndef USE_AS_RAWMEMCHR
+ +L(cross_page_continue):
+ +     /* Align data to VEC_SIZE - 1.  */
+ +     xorl    %ecx, %ecx
+ +     subl    %edi, %ecx
+ +     orq     $(VEC_SIZE - 1), %rdi
+ +     /* esi is for adjusting length to see if near the end.  */
+ +     leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
+ +     sarl    $2, %esi
+ +#  endif
+ +# else
+ +     orq     $(VEC_SIZE - 1), %rdi
+ +L(cross_page_continue):
+ +# endif
+ +     /* Load first VEC regardless.  */
+ +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Adjust length. If near end handle specially.  */
+ +     subq    %rsi, %rdx
+ +     jbe     L(last_4x_vec_or_less)
+ +# endif
+       testl   %eax, %eax
+       jnz     L(first_vec_x1)
+  
+ -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(first_vec_x2)
+  
+ -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+       jnz     L(first_vec_x3)
+  
+ -     addq    $(VEC_SIZE * 4), %rdi
+ -
+ -# ifndef USE_AS_RAWMEMCHR
+ -     subq    $(VEC_SIZE * 4), %rdx
+ -     jbe     L(last_4x_vec_or_less)
+ -# endif
+ -
+ -     /* Align data to 4 * VEC_SIZE.  */
+ -     movq    %rdi, %rcx
+ -     andl    $(4 * VEC_SIZE - 1), %ecx
+ -     andq    $-(4 * VEC_SIZE), %rdi
+ +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ +     vpmovmskb %ymm1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x4)
+  
+  # ifndef USE_AS_RAWMEMCHR
+ -     /* Adjust length.  */
+ +     /* Check if at last VEC_SIZE * 4 length.  */
+ +     subq    $(CHAR_PER_VEC * 4), %rdx
+ +     jbe     L(last_4x_vec_or_less_cmpeq)
+ +     /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ +        length.  */
+ +     incq    %rdi
+ +     movl    %edi, %ecx
+ +     orq     $(VEC_SIZE * 4 - 1), %rdi
+ +     andl    $(VEC_SIZE * 4 - 1), %ecx
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
+ +     sarl    $2, %ecx
+ +#  endif
+       addq    %rcx, %rdx
+ +# else
+ +     /* Align data to VEC_SIZE * 4 - 1 for loop.  */
+ +     incq    %rdi
+ +     orq     $(VEC_SIZE * 4 - 1), %rdi
+  # endif
+  
+ +     /* Compare 4 * VEC at a time forward.  */
+       .p2align 4
+  L(loop_4x_vec):
+ -     /* Compare 4 * VEC at a time forward.  */
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+ -
+ +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
+       vpor    %ymm1, %ymm2, %ymm5
+       vpor    %ymm3, %ymm4, %ymm6
+       vpor    %ymm5, %ymm6, %ymm5
+  
+ -     vpmovmskb %ymm5, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(4x_vec_end)
+ -
+ -     addq    $(VEC_SIZE * 4), %rdi
+ -
+ +     vpmovmskb %ymm5, %ecx
+  # ifdef USE_AS_RAWMEMCHR
+ -     jmp     L(loop_4x_vec)
+ +     subq    $-(VEC_SIZE * 4), %rdi
+ +     testl   %ecx, %ecx
+ +     jz      L(loop_4x_vec)
+  # else
+ -     subq    $(VEC_SIZE * 4), %rdx
+ -     ja      L(loop_4x_vec)
+ +     testl   %ecx, %ecx
+ +     jnz     L(loop_4x_vec_end)
+  
+ -L(last_4x_vec_or_less):
+ -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
+ -     addl    $(VEC_SIZE * 2), %edx
+ -     jle     L(last_2x_vec)
+ +     subq    $-(VEC_SIZE * 4), %rdi
+  
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ -     vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(first_vec_x0)
+ +     subq    $(CHAR_PER_VEC * 4), %rdx
+ +     ja      L(loop_4x_vec)
+  
+ -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ +     /* Fall through into less than 4 remaining vectors of length
+ +        case.  */
+ +     VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+ +     .p2align 4
+ +L(last_4x_vec_or_less):
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply length by 4 to get byte count.  */
+ +     sall    $2, %edx
+ +#  endif
+ +     /* Check if first VEC contained match.  */
+       testl   %eax, %eax
+ -     jnz     L(first_vec_x1)
+ +     jnz     L(first_vec_x1_check)
+  
+ -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ -     vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ +     /* If remaining length > VEC_SIZE * 2.  */
+ +     addl    $(VEC_SIZE * 2), %edx
+ +     jg      L(last_4x_vec)
+  
+ -     jnz     L(first_vec_x2_check)
+ -     subl    $VEC_SIZE, %edx
+ -     jle     L(zero)
+ +L(last_2x_vec):
+ +     /* If remaining length < VEC_SIZE.  */
+ +     addl    $VEC_SIZE, %edx
+ +     jle     L(zero_end)
+  
+ -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ +     /* Check VEC2 and compare any match with remaining length.  */
+ +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+       vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ -
+ -     jnz     L(first_vec_x3_check)
+ -     xorl    %eax, %eax
+ -     VZEROUPPER
+ -     ret
+ +     tzcntl  %eax, %eax
+ +     cmpl    %eax, %edx
+ +     jbe     L(set_zero_end)
+ +     addq    $(VEC_SIZE + 1), %rdi
+ +     addq    %rdi, %rax
+ +L(zero_end):
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+ -L(last_2x_vec):
+ -     addl    $(VEC_SIZE * 2), %edx
+ -     VPCMPEQ (%rdi), %ymm0, %ymm1
+ +L(loop_4x_vec_end):
+ +# endif
+ +     /* rawmemchr will fall through into this if match was found in
+ +        loop.  */
+ +
+       vpmovmskb %ymm1, %eax
+       testl   %eax, %eax
+ +     jnz     L(last_vec_x1_return)
+  
+ -     jnz     L(first_vec_x0_check)
+ -     subl    $VEC_SIZE, %edx
+ -     jle     L(zero)
+ -
+ -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ -     vpmovmskb %ymm1, %eax
+ +     vpmovmskb %ymm2, %eax
+       testl   %eax, %eax
+ -     jnz     L(first_vec_x1_check)
+ -     xorl    %eax, %eax
+ -     VZEROUPPER
+ -     ret
+ +     jnz     L(last_vec_x2_return)
+  
+ -     .p2align 4
+ -L(first_vec_x0_check):
+ -     tzcntl  %eax, %eax
+ -     /* Check the end of data.  */
+ -     cmpq    %rax, %rdx
+ -     jbe     L(zero)
+ +     vpmovmskb %ymm3, %eax
+ +     /* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+ +     salq    $32, %rcx
+ +     orq     %rcx, %rax
+ +     tzcntq  %rax, %rax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     subq    $(VEC_SIZE * 2 - 1), %rdi
+ +# else
+ +     subq    $-(VEC_SIZE * 2 + 1), %rdi
+ +# endif
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+ +# ifndef USE_AS_RAWMEMCHR
+  
+       .p2align 4
+  L(first_vec_x1_check):
+       tzcntl  %eax, %eax
+ -     /* Check the end of data.  */
+ -     cmpq    %rax, %rdx
+ -     jbe     L(zero)
+ -     addq    $VEC_SIZE, %rax
+ +     /* Adjust length.  */
+ +     subl    $-(VEC_SIZE * 4), %edx
+ +     /* Check if match within remaining length.  */
+ +     cmpl    %eax, %edx
+ +     jbe     L(set_zero_end)
+ +     incq    %rdi
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+ +     .p2align 4
+ +L(set_zero_end):
+ +     xorl    %eax, %eax
+ +     VZEROUPPER_RETURN
+ +# endif
+  
+       .p2align 4
+ -L(first_vec_x2_check):
+ +L(last_vec_x1_return):
+       tzcntl  %eax, %eax
+ -     /* Check the end of data.  */
+ -     cmpq    %rax, %rdx
+ -     jbe     L(zero)
+ -     addq    $(VEC_SIZE * 2), %rax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     subq    $(VEC_SIZE * 4 - 1), %rdi
+ +# else
+ +     incq    %rdi
+ +# endif
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+ -L(first_vec_x3_check):
+ +L(last_vec_x2_return):
+       tzcntl  %eax, %eax
+ -     /* Check the end of data.  */
+ -     cmpq    %rax, %rdx
+ -     jbe     L(zero)
+ -     addq    $(VEC_SIZE * 3), %rax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     subq    $(VEC_SIZE * 3 - 1), %rdi
+ +# else
+ +     subq    $-(VEC_SIZE + 1), %rdi
+ +# endif
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+ +# ifndef USE_AS_RAWMEMCHR
+       .p2align 4
+ -L(zero):
+ -     VZEROUPPER
+ -L(null):
+ -     xorl    %eax, %eax
+ -     ret
+ -# endif
+ +L(last_4x_vec_or_less_cmpeq):
+ +     VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ +     vpmovmskb %ymm1, %eax
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply length by 4 to get byte count.  */
+ +     sall    $2, %edx
+ +#  endif
+ +     subq    $-(VEC_SIZE * 4), %rdi
+ +     /* Check first VEC regardless.  */
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1_check)
+  
+ +     /* If remaining length <= CHAR_PER_VEC * 2.  */
+ +     addl    $(VEC_SIZE * 2), %edx
+ +     jle     L(last_2x_vec)
+       .p2align 4
+ -L(first_vec_x0):
+ -     tzcntl  %eax, %eax
+ -     addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +L(last_4x_vec):
+ +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ +     vpmovmskb %ymm1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2_return)
+  
+ -     .p2align 4
+ -L(first_vec_x1):
+ -     tzcntl  %eax, %eax
+ -     addq    $VEC_SIZE, %rax
+ -     addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ +     vpmovmskb %ymm1, %eax
+  
+ -     .p2align 4
+ -L(first_vec_x2):
+ +     /* Create mask for possible matches within remaining length.  */
+ +     movq    $-1, %rcx
+ +     bzhiq   %rdx, %rcx, %rcx
+ +
+ +     /* Test matches in data against length match.  */
+ +     andl    %ecx, %eax
+ +     jnz     L(last_vec_x3)
+ +
+ +     /* If remaining length <= VEC_SIZE * 3 (note this is after the
+ +        remaining length was found to be > VEC_SIZE * 2).  */
+ +     subl    $VEC_SIZE, %edx
+ +     jbe     L(zero_end2)
+ +
+ +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ +     vpmovmskb %ymm1, %eax
+ +     /* Shift remaining length mask for last VEC.  */
+ +     shrq    $32, %rcx
+ +     andl    %ecx, %eax
+ +     jz      L(zero_end2)
+       tzcntl  %eax, %eax
+ -     addq    $(VEC_SIZE * 2), %rax
+ +     addq    $(VEC_SIZE * 3 + 1), %rdi
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +L(zero_end2):
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+ -L(4x_vec_end):
+ -     vpmovmskb %ymm1, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(first_vec_x0)
+ -     vpmovmskb %ymm2, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(first_vec_x1)
+ -     vpmovmskb %ymm3, %eax
+ -     testl   %eax, %eax
+ -     jnz     L(first_vec_x2)
+ -     vpmovmskb %ymm4, %eax
+ -     testl   %eax, %eax
+ -L(first_vec_x3):
+ +L(last_vec_x3):
+       tzcntl  %eax, %eax
+ -     addq    $(VEC_SIZE * 3), %rax
+ +     subq    $-(VEC_SIZE * 2 + 1), %rdi
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+ +# endif
+  
+  END (MEMCHR)
+  #endif
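Not part of the patch: the rewritten memchr-avx2.S entry path above replaces the old alignment juggling with a page-offset test before the first unaligned 32-byte load. A minimal C sketch of that test, assuming 4 KiB pages and VEC_SIZE of 32:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* A VEC_SIZE load at p stays within p's page unless p sits in the last
   VEC_SIZE - 1 bytes of the page; this mirrors the andl/cmpl/ja sequence.  */
static bool
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

int
main (void)
{
  printf ("%d %d\n",
          may_cross_page ((void *) (uintptr_t) 4064),
          may_cross_page ((void *) (uintptr_t) 4065));
  return 0;   /* prints "0 1": offset 4065 leaves only 31 bytes in the page */
}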
+ diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+ new file mode 100644
+ index 0000000000..f3fdad4fda
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+ @@ -0,0 +1,478 @@
+ +/* memchr/wmemchr optimized with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#if IS_IN (libc)
+ +
+ +# include <sysdep.h>
+ +
+ +# ifndef MEMCHR
+ +#  define MEMCHR     __memchr_evex
+ +# endif
+ +
+ +# ifdef USE_AS_WMEMCHR
+ +#  define VPBROADCAST        vpbroadcastd
+ +#  define VPMINU     vpminud
+ +#  define VPCMP      vpcmpd
+ +#  define VPCMPEQ    vpcmpeqd
+ +#  define CHAR_SIZE  4
+ +# else
+ +#  define VPBROADCAST        vpbroadcastb
+ +#  define VPMINU     vpminub
+ +#  define VPCMP      vpcmpb
+ +#  define VPCMPEQ    vpcmpeqb
+ +#  define CHAR_SIZE  1
+ +# endif
+ +
+ +# ifdef USE_AS_RAWMEMCHR
+ +#  define RAW_PTR_REG        rcx
+ +#  define ALGN_PTR_REG       rdi
+ +# else
+ +#  define RAW_PTR_REG        rdi
+ +#  define ALGN_PTR_REG       rcx
+ +# endif
+ +
+ +# define XMMZERO     xmm23
+ +# define YMMZERO     ymm23
+ +# define XMMMATCH    xmm16
+ +# define YMMMATCH    ymm16
+ +# define YMM1                ymm17
+ +# define YMM2                ymm18
+ +# define YMM3                ymm19
+ +# define YMM4                ymm20
+ +# define YMM5                ymm21
+ +# define YMM6                ymm22
+ +
+ +# define VEC_SIZE 32
+ +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ +# define PAGE_SIZE 4096
+ +
+ +     .section .text.evex,"ax",@progbits
+ +ENTRY (MEMCHR)
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Check for zero length.  */
+ +     test    %RDX_LP, %RDX_LP
+ +     jz      L(zero)
+ +
+ +#  ifdef __ILP32__
+ +     /* Clear the upper 32 bits.  */
+ +     movl    %edx, %edx
+ +#  endif
+ +# endif
+ +     /* Broadcast CHAR to YMMMATCH.  */
+ +     VPBROADCAST %esi, %YMMMATCH
+ +     /* Check if we may cross page boundary with one vector load.  */
+ +     movl    %edi, %eax
+ +     andl    $(PAGE_SIZE - 1), %eax
+ +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+ +     ja      L(cross_page_boundary)
+ +
+ +     /* Check the first VEC_SIZE bytes.  */
+ +     VPCMP   $0, (%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* If length < CHAR_PER_VEC handle special.  */
+ +     cmpq    $CHAR_PER_VEC, %rdx
+ +     jbe     L(first_vec_x0)
+ +# endif
+ +     testl   %eax, %eax
+ +     jz      L(aligned_more)
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
+ +# else
+ +     addq    %rdi, %rax
+ +# endif
+ +     ret
+ +
+ +# ifndef USE_AS_RAWMEMCHR
+ +L(zero):
+ +     xorl    %eax, %eax
+ +     ret
+ +
+ +     .p2align 5
+ +L(first_vec_x0):
+ +     /* Check if first match was before length.  */
+ +     tzcntl  %eax, %eax
+ +     xorl    %ecx, %ecx
+ +     cmpl    %eax, %edx
+ +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
+ +     cmovle  %rcx, %rax
+ +     ret
+ +# else
+ +     /* NB: first_vec_x0 is 17 bytes which will leave
+ +        cross_page_boundary (which is relatively cold) close enough
+ +        to ideal alignment. So only realign L(cross_page_boundary) if
+ +        rawmemchr.  */
+ +     .p2align 4
+ +# endif
+ +L(cross_page_boundary):
+ +     /* Save pointer before aligning as its original value is
+ +        necessary for computing the return address if a byte is found or
+ +        adjusting length if it is not and this is memchr.  */
+ +     movq    %rdi, %rcx
+ +     /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ +        for rawmemchr.  */
+ +     andq    $-VEC_SIZE, %ALGN_PTR_REG
+ +     VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ +     kmovd   %k0, %r8d
+ +# ifdef USE_AS_WMEMCHR
+ +     /* NB: Divide shift count by 4 since each bit in K0 represents 4
+ +        bytes.  */
+ +     sarl    $2, %eax
+ +# endif
+ +# ifndef USE_AS_RAWMEMCHR
+ +     movl    $(PAGE_SIZE / CHAR_SIZE), %esi
+ +     subl    %eax, %esi
+ +# endif
+ +# ifdef USE_AS_WMEMCHR
+ +     andl    $(CHAR_PER_VEC - 1), %eax
+ +# endif
+ +     /* Remove the leading bytes.  */
+ +     sarxl   %eax, %r8d, %eax
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Check the end of data.  */
+ +     cmpq    %rsi, %rdx
+ +     jbe     L(first_vec_x0)
+ +# endif
+ +     testl   %eax, %eax
+ +     jz      L(cross_page_continue)
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+ +# else
+ +     addq    %RAW_PTR_REG, %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x1):
+ +     tzcntl  %eax, %eax
+ +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x2):
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x3):
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x4):
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +
+ +     .p2align 5
+ +L(aligned_more):
+ +     /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ +        since data is only aligned to VEC_SIZE.  */
+ +
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Align data to VEC_SIZE.  */
+ +L(cross_page_continue):
+ +     xorl    %ecx, %ecx
+ +     subl    %edi, %ecx
+ +     andq    $-VEC_SIZE, %rdi
+ +     /* esi is for adjusting length to see if near the end.  */
+ +     leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
+ +     sarl    $2, %esi
+ +#  endif
+ +# else
+ +     andq    $-VEC_SIZE, %rdi
+ +L(cross_page_continue):
+ +# endif
+ +     /* Load first VEC regardless.  */
+ +     VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Adjust length. If near end handle specially.  */
+ +     subq    %rsi, %rdx
+ +     jbe     L(last_4x_vec_or_less)
+ +# endif
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1)
+ +
+ +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x2)
+ +
+ +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x3)
+ +
+ +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x4)
+ +
+ +
+ +# ifndef USE_AS_RAWMEMCHR
+ +     /* Check if at last CHAR_PER_VEC * 4 length.  */
+ +     subq    $(CHAR_PER_VEC * 4), %rdx
+ +     jbe     L(last_4x_vec_or_less_cmpeq)
+ +     addq    $VEC_SIZE, %rdi
+ +
+ +     /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ +      */
+ +#  ifdef USE_AS_WMEMCHR
+ +     movl    %edi, %ecx
+ +     andq    $-(4 * VEC_SIZE), %rdi
+ +     andl    $(VEC_SIZE * 4 - 1), %ecx
+ +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
+ +     sarl    $2, %ecx
+ +     addq    %rcx, %rdx
+ +#  else
+ +     addq    %rdi, %rdx
+ +     andq    $-(4 * VEC_SIZE), %rdi
+ +     subq    %rdi, %rdx
+ +#  endif
+ +# else
+ +     addq    $VEC_SIZE, %rdi
+ +     andq    $-(4 * VEC_SIZE), %rdi
+ +# endif
+ +
+ +     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
+ +
+ +     /* Compare 4 * VEC at a time forward.  */
+ +     .p2align 4
+ +L(loop_4x_vec):
+ +     /* It would be possible to save some instructions using 4x VPCMP
+ +        but a bottleneck on port 5 makes it not worth it.  */
+ +     VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ +     /* xor will set bytes match esi to zero.  */
+ +     vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ +     vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ +     VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ +     /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+ +     VPMINU  %YMM2, %YMM3, %YMM3{%k1}{z}
+ +     VPCMP   $0, %YMM3, %YMMZERO, %k2
+ +# ifdef USE_AS_RAWMEMCHR
+ +     subq    $-(VEC_SIZE * 4), %rdi
+ +     kortestd %k2, %k3
+ +     jz      L(loop_4x_vec)
+ +# else
+ +     kortestd %k2, %k3
+ +     jnz     L(loop_4x_vec_end)
+ +
+ +     subq    $-(VEC_SIZE * 4), %rdi
+ +
+ +     subq    $(CHAR_PER_VEC * 4), %rdx
+ +     ja      L(loop_4x_vec)
+ +
+ +     /* Fall through into less than 4 remaining vectors of length case.
+ +      */
+ +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     addq    $(VEC_SIZE * 3), %rdi
+ +     .p2align 4
+ +L(last_4x_vec_or_less):
+ +     /* Check if first VEC contained match.  */
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1_check)
+ +
+ +     /* If remaining length > CHAR_PER_VEC * 2.  */
+ +     addl    $(CHAR_PER_VEC * 2), %edx
+ +     jg      L(last_4x_vec)
+ +
+ +L(last_2x_vec):
+ +     /* If remaining length < CHAR_PER_VEC.  */
+ +     addl    $CHAR_PER_VEC, %edx
+ +     jle     L(zero_end)
+ +
+ +     /* Check VEC2 and compare any match with remaining length.  */
+ +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     tzcntl  %eax, %eax
+ +     cmpl    %eax, %edx
+ +     jbe     L(set_zero_end)
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ +L(zero_end):
+ +     ret
+ +
+ +
+ +     .p2align 4
+ +L(first_vec_x1_check):
+ +     tzcntl  %eax, %eax
+ +     /* Adjust length.  */
+ +     subl    $-(CHAR_PER_VEC * 4), %edx
+ +     /* Check if match within remaining length.  */
+ +     cmpl    %eax, %edx
+ +     jbe     L(set_zero_end)
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +L(set_zero_end):
+ +     xorl    %eax, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(loop_4x_vec_end):
+ +# endif
+ +     /* rawmemchr will fall through into this if match was found in
+ +        loop.  */
+ +
+ +     /* k1 has the NOT of the matches with VEC1.  */
+ +     kmovd   %k1, %eax
+ +# ifdef USE_AS_WMEMCHR
+ +     subl    $((1 << CHAR_PER_VEC) - 1), %eax
+ +# else
+ +     incl    %eax
+ +# endif
+ +     jnz     L(last_vec_x1_return)
+ +
+ +     VPCMP   $0, %YMM2, %YMMZERO, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2_return)
+ +
+ +     kmovd   %k2, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x3_return)
+ +
+ +     kmovd   %k3, %eax
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ +# else
+ +     leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x1_return):
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_RAWMEMCHR
+ +#  ifdef USE_AS_WMEMCHR
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
+ +#  else
+ +     addq    %rdi, %rax
+ +#  endif
+ +# else
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x2_return):
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ +# else
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x3_return):
+ +     tzcntl  %eax, %eax
+ +# ifdef USE_AS_RAWMEMCHR
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ +# else
+ +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+ +     leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+ +# endif
+ +     ret
+ +
+ +
+ +# ifndef USE_AS_RAWMEMCHR
+ +L(last_4x_vec_or_less_cmpeq):
+ +     VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     subq    $-(VEC_SIZE * 4), %rdi
+ +     /* Check first VEC regardless.  */
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1_check)
+ +
+ +     /* If remaining length <= CHAR_PER_VEC * 2.  */
+ +     addl    $(CHAR_PER_VEC * 2), %edx
+ +     jle     L(last_2x_vec)
+ +
+ +     .p2align 4
+ +L(last_4x_vec):
+ +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2)
+ +
+ +
+ +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     /* Create mask for possible matches within remaining length.  */
+ +#  ifdef USE_AS_WMEMCHR
+ +     movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ +     bzhil   %edx, %ecx, %ecx
+ +#  else
+ +     movq    $-1, %rcx
+ +     bzhiq   %rdx, %rcx, %rcx
+ +#  endif
+ +     /* Test matches in data against length match.  */
+ +     andl    %ecx, %eax
+ +     jnz     L(last_vec_x3)
+ +
+ +     /* If remaining length <= CHAR_PER_VEC * 3 (note this is after the
+ +        remaining length was found to be > CHAR_PER_VEC * 2).  */
+ +     subl    $CHAR_PER_VEC, %edx
+ +     jbe     L(zero_end2)
+ +
+ +
+ +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ +     kmovd   %k0, %eax
+ +     /* Shift remaining length mask for last VEC.  */
+ +#  ifdef USE_AS_WMEMCHR
+ +     shrl    $CHAR_PER_VEC, %ecx
+ +#  else
+ +     shrq    $CHAR_PER_VEC, %rcx
+ +#  endif
+ +     andl    %ecx, %eax
+ +     jz      L(zero_end2)
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ +L(zero_end2):
+ +     ret
+ +
+ +L(last_vec_x2):
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x3):
+ +     tzcntl  %eax, %eax
+ +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ +     ret
+ +# endif
+ +
+ +END (MEMCHR)
+ +#endif
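Not part of the patch: both new memchr implementations above clip the per-vector match bitmask to the number of characters still permitted by the length argument (the bzhiq/bzhil instructions). A small C sketch of that clipping, with names chosen here only for illustration:

#include <stdint.h>

/* 'matches' is the vpmovmskb/kmovd bitmask, one bit per character; only the
   lowest 'remaining' bits may count as real matches.  A zero result means
   "no match within the length"; otherwise the caller takes the lowest set
   bit (tzcnt) as the match offset.  */
uint64_t
bound_matches (uint64_t matches, unsigned int remaining)
{
  uint64_t allowed = remaining >= 64 ? ~UINT64_C (0)
                                     : (UINT64_C (1) << remaining) - 1;
  return matches & allowed;
}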
+ diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+ new file mode 100644
+ index 0000000000..cf4eff5d4a
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef MEMCMP
+ +# define MEMCMP __memcmp_avx2_movbe_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "memcmp-avx2-movbe.S"
+ diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+ index 67fc575b59..87f9478eaf 100644
+ --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+ +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+ @@ -47,6 +47,10 @@
+  #  define VZEROUPPER vzeroupper
+  # endif
+  
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+  # define VEC_SIZE 32
+  # define VEC_MASK ((1 << VEC_SIZE) - 1)
+  
+ @@ -55,7 +59,7 @@
+             memcmp has to use UNSIGNED comparison for elements.
+  */
+  
+ -     .section .text.avx,"ax",@progbits
+ +     .section SECTION(.text),"ax",@progbits
+  ENTRY (MEMCMP)
+  # ifdef USE_AS_WMEMCMP
+       shl     $2, %RDX_LP
+ @@ -123,8 +127,8 @@ ENTRY (MEMCMP)
+       vptest  %ymm0, %ymm5
+       jnc     L(4x_vec_end)
+       xorl    %eax, %eax
+ -     VZEROUPPER
+ -     ret
+ +L(return_vzeroupper):
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+  
+       .p2align 4
+  L(last_2x_vec):
+ @@ -144,8 +148,7 @@ L(last_vec):
+       vpmovmskb %ymm2, %eax
+       subl    $VEC_MASK, %eax
+       jnz     L(first_vec)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(first_vec):
+ @@ -164,8 +167,7 @@ L(wmemcmp_return):
+       movzbl  (%rsi, %rcx), %edx
+       sub     %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  # ifdef USE_AS_WMEMCMP
+       .p2align 4
+ @@ -367,8 +369,7 @@ L(last_4x_vec):
+       vpmovmskb %ymm2, %eax
+       subl    $VEC_MASK, %eax
+       jnz     L(first_vec)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(4x_vec_end):
+ @@ -394,8 +395,7 @@ L(4x_vec_end):
+       movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %edx
+       sub     %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(first_vec_x1):
+ @@ -410,8 +410,7 @@ L(first_vec_x1):
+       movzbl  VEC_SIZE(%rsi, %rcx), %edx
+       sub     %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(first_vec_x2):
+ @@ -426,7 +425,6 @@ L(first_vec_x2):
+       movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %edx
+       sub     %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  END (MEMCMP)
+  #endif
+ diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ new file mode 100644
+ index 0000000000..9c093972e1
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ @@ -0,0 +1,440 @@
+ +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#if IS_IN (libc)
+ +
+ +/* memcmp/wmemcmp is implemented as:
+ +   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+ +      to avoid branches.
+ +   2. Use overlapping compare to avoid branch.
+ +   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ +      bytes for wmemcmp.
+ +   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+ +   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ +      area.
+ +   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ +   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ +   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+ +
+ +# include <sysdep.h>
+ +
+ +# ifndef MEMCMP
+ +#  define MEMCMP     __memcmp_evex_movbe
+ +# endif
+ +
+ +# define VMOVU               vmovdqu64
+ +
+ +# ifdef USE_AS_WMEMCMP
+ +#  define VPCMPEQ    vpcmpeqd
+ +# else
+ +#  define VPCMPEQ    vpcmpeqb
+ +# endif
+ +
+ +# define XMM1                xmm17
+ +# define XMM2                xmm18
+ +# define YMM1                ymm17
+ +# define YMM2                ymm18
+ +# define YMM3                ymm19
+ +# define YMM4                ymm20
+ +# define YMM5                ymm21
+ +# define YMM6                ymm22
+ +
+ +# define VEC_SIZE 32
+ +# ifdef USE_AS_WMEMCMP
+ +#  define VEC_MASK 0xff
+ +#  define XMM_MASK 0xf
+ +# else
+ +#  define VEC_MASK 0xffffffff
+ +#  define XMM_MASK 0xffff
+ +# endif
+ +
+ +/* Warning!
+ +           wmemcmp has to use SIGNED comparison for elements.
+ +           memcmp has to use UNSIGNED comparison for elements.
+ +*/
+ +
+ +     .section .text.evex,"ax",@progbits
+ +ENTRY (MEMCMP)
+ +# ifdef USE_AS_WMEMCMP
+ +     shl     $2, %RDX_LP
+ +# elif defined __ILP32__
+ +     /* Clear the upper 32 bits.  */
+ +     movl    %edx, %edx
+ +# endif
+ +     cmp     $VEC_SIZE, %RDX_LP
+ +     jb      L(less_vec)
+ +
+ +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k1
+ +     kmovd   %k1, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     cmpq    $(VEC_SIZE * 2), %rdx
+ +     jbe     L(last_vec)
+ +
+ +     /* More than 2 * VEC.  */
+ +     cmpq    $(VEC_SIZE * 8), %rdx
+ +     ja      L(more_8x_vec)
+ +     cmpq    $(VEC_SIZE * 4), %rdx
+ +     jb      L(last_4x_vec)
+ +
+ +     /* From 4 * VEC to 8 * VEC, inclusively. */
+ +     VMOVU   (%rsi), %YMM1
+ +     VPCMPEQ (%rdi), %YMM1, %k1
+ +
+ +     VMOVU   VEC_SIZE(%rsi), %YMM2
+ +     VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ +
+ +     VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
+ +     VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ +
+ +     VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
+ +     VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ +
+ +     kandd   %k1, %k2, %k5
+ +     kandd   %k3, %k4, %k6
+ +     kandd   %k5, %k6, %k6
+ +
+ +     kmovd   %k6, %eax
+ +     cmpl    $VEC_MASK, %eax
+ +     jne     L(4x_vec_end)
+ +
+ +     leaq    -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ +     leaq    -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ +     VMOVU   (%rsi), %YMM1
+ +     VPCMPEQ (%rdi), %YMM1, %k1
+ +
+ +     VMOVU   VEC_SIZE(%rsi), %YMM2
+ +     VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ +     kandd   %k1, %k2, %k5
+ +
+ +     VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
+ +     VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ +     kandd   %k3, %k5, %k5
+ +
+ +     VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
+ +     VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ +     kandd   %k4, %k5, %k5
+ +
+ +     kmovd   %k5, %eax
+ +     cmpl    $VEC_MASK, %eax
+ +     jne     L(4x_vec_end)
+ +     xorl    %eax, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_2x_vec):
+ +     /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +L(last_vec):
+ +     /* Use overlapping loads to avoid branches.  */
+ +     leaq    -VEC_SIZE(%rdi, %rdx), %rdi
+ +     leaq    -VEC_SIZE(%rsi, %rdx), %rsi
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec):
+ +     /* A byte or int32 is different within 16 or 32 bytes.  */
+ +     tzcntl  %eax, %ecx
+ +# ifdef USE_AS_WMEMCMP
+ +     xorl    %eax, %eax
+ +     movl    (%rdi, %rcx, 4), %edx
+ +     cmpl    (%rsi, %rcx, 4), %edx
+ +L(wmemcmp_return):
+ +     setl    %al
+ +     negl    %eax
+ +     orl     $1, %eax
+ +# else
+ +     movzbl  (%rdi, %rcx), %eax
+ +     movzbl  (%rsi, %rcx), %edx
+ +     sub     %edx, %eax
+ +# endif
+ +     ret
+ +
+ +# ifdef USE_AS_WMEMCMP
+ +     .p2align 4
+ +L(4):
+ +     xorl    %eax, %eax
+ +     movl    (%rdi), %edx
+ +     cmpl    (%rsi), %edx
+ +     jne     L(wmemcmp_return)
+ +     ret
+ +# else
+ +     .p2align 4
+ +L(between_4_7):
+ +     /* Load as big endian with overlapping movbe to avoid branches.  */
+ +     movbe   (%rdi), %eax
+ +     movbe   (%rsi), %ecx
+ +     shlq    $32, %rax
+ +     shlq    $32, %rcx
+ +     movbe   -4(%rdi, %rdx), %edi
+ +     movbe   -4(%rsi, %rdx), %esi
+ +     orq     %rdi, %rax
+ +     orq     %rsi, %rcx
+ +     subq    %rcx, %rax
+ +     je      L(exit)
+ +     sbbl    %eax, %eax
+ +     orl     $1, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(exit):
+ +     ret
+ +
+ +     .p2align 4
+ +L(between_2_3):
+ +     /* Load as big endian to avoid branches.  */
+ +     movzwl  (%rdi), %eax
+ +     movzwl  (%rsi), %ecx
+ +     shll    $8, %eax
+ +     shll    $8, %ecx
+ +     bswap   %eax
+ +     bswap   %ecx
+ +     movb    -1(%rdi, %rdx), %al
+ +     movb    -1(%rsi, %rdx), %cl
+ +     /* Subtraction is okay because the upper 8 bits are zero.  */
+ +     subl    %ecx, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(1):
+ +     movzbl  (%rdi), %eax
+ +     movzbl  (%rsi), %ecx
+ +     subl    %ecx, %eax
+ +     ret
+ +# endif
+ +
+ +     .p2align 4
+ +L(zero):
+ +     xorl    %eax, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(less_vec):
+ +# ifdef USE_AS_WMEMCMP
+ +     /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+ +     cmpb    $4, %dl
+ +     je      L(4)
+ +     jb      L(zero)
+ +# else
+ +     cmpb    $1, %dl
+ +     je      L(1)
+ +     jb      L(zero)
+ +     cmpb    $4, %dl
+ +     jb      L(between_2_3)
+ +     cmpb    $8, %dl
+ +     jb      L(between_4_7)
+ +# endif
+ +     cmpb    $16, %dl
+ +     jae     L(between_16_31)
+ +     /* It is between 8 and 15 bytes.  */
+ +     vmovq   (%rdi), %XMM1
+ +     vmovq   (%rsi), %XMM2
+ +     VPCMPEQ %XMM1, %XMM2, %k2
+ +     kmovw   %k2, %eax
+ +     subl    $XMM_MASK, %eax
+ +     jnz     L(first_vec)
+ +     /* Use overlapping loads to avoid branches.  */
+ +     leaq    -8(%rdi, %rdx), %rdi
+ +     leaq    -8(%rsi, %rdx), %rsi
+ +     vmovq   (%rdi), %XMM1
+ +     vmovq   (%rsi), %XMM2
+ +     VPCMPEQ %XMM1, %XMM2, %k2
+ +     kmovw   %k2, %eax
+ +     subl    $XMM_MASK, %eax
+ +     jnz     L(first_vec)
+ +     ret
+ +
+ +     .p2align 4
+ +L(between_16_31):
+ +     /* From 16 to 31 bytes.  No branch when size == 16.  */
+ +     VMOVU   (%rsi), %XMM2
+ +     VPCMPEQ (%rdi), %XMM2, %k2
+ +     kmovw   %k2, %eax
+ +     subl    $XMM_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     /* Use overlapping loads to avoid branches.  */
+ +     leaq    -16(%rdi, %rdx), %rdi
+ +     leaq    -16(%rsi, %rdx), %rsi
+ +     VMOVU   (%rsi), %XMM2
+ +     VPCMPEQ (%rdi), %XMM2, %k2
+ +     kmovw   %k2, %eax
+ +     subl    $XMM_MASK, %eax
+ +     jnz     L(first_vec)
+ +     ret
+ +
+ +     .p2align 4
+ +L(more_8x_vec):
+ +     /* More than 8 * VEC.  Check the first VEC.  */
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     /* Align the first memory area for aligned loads in the loop.
+ +        Compute how much the first memory area is misaligned.  */
+ +     movq    %rdi, %rcx
+ +     andl    $(VEC_SIZE - 1), %ecx
+ +     /* Get the negative of offset for alignment.  */
+ +     subq    $VEC_SIZE, %rcx
+ +     /* Adjust the second memory area.  */
+ +     subq    %rcx, %rsi
+ +     /* Adjust the first memory area which should be aligned now.  */
+ +     subq    %rcx, %rdi
+ +     /* Adjust length.  */
+ +     addq    %rcx, %rdx
+ +
+ +L(loop_4x_vec):
+ +     /* Compare 4 * VEC at a time forward.  */
+ +     VMOVU   (%rsi), %YMM1
+ +     VPCMPEQ (%rdi), %YMM1, %k1
+ +
+ +     VMOVU   VEC_SIZE(%rsi), %YMM2
+ +     VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ +     kandd   %k2, %k1, %k5
+ +
+ +     VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
+ +     VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ +     kandd   %k3, %k5, %k5
+ +
+ +     VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
+ +     VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ +     kandd   %k4, %k5, %k5
+ +
+ +     kmovd   %k5, %eax
+ +     cmpl    $VEC_MASK, %eax
+ +     jne     L(4x_vec_end)
+ +
+ +     addq    $(VEC_SIZE * 4), %rdi
+ +     addq    $(VEC_SIZE * 4), %rsi
+ +
+ +     subq    $(VEC_SIZE * 4), %rdx
+ +     cmpq    $(VEC_SIZE * 4), %rdx
+ +     jae     L(loop_4x_vec)
+ +
+ +     /* Less than 4 * VEC.  */
+ +     cmpq    $VEC_SIZE, %rdx
+ +     jbe     L(last_vec)
+ +     cmpq    $(VEC_SIZE * 2), %rdx
+ +     jbe     L(last_2x_vec)
+ +
+ +L(last_4x_vec):
+ +     /* From 2 * VEC to 4 * VEC. */
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     addq    $VEC_SIZE, %rdi
+ +     addq    $VEC_SIZE, %rsi
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     /* Use overlapping loads to avoid branches.  */
+ +     leaq    -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ +     leaq    -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +
+ +     addq    $VEC_SIZE, %rdi
+ +     addq    $VEC_SIZE, %rsi
+ +     VMOVU   (%rsi), %YMM2
+ +     VPCMPEQ (%rdi), %YMM2, %k2
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +     ret
+ +
+ +     .p2align 4
+ +L(4x_vec_end):
+ +     kmovd   %k1, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec)
+ +     kmovd   %k2, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec_x1)
+ +     kmovd   %k3, %eax
+ +     subl    $VEC_MASK, %eax
+ +     jnz     L(first_vec_x2)
+ +     kmovd   %k4, %eax
+ +     subl    $VEC_MASK, %eax
+ +     tzcntl  %eax, %ecx
+ +# ifdef USE_AS_WMEMCMP
+ +     xorl    %eax, %eax
+ +     movl    (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+ +     cmpl    (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+ +     jmp     L(wmemcmp_return)
+ +# else
+ +     movzbl  (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ +     movzbl  (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ +     sub     %edx, %eax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x1):
+ +     tzcntl  %eax, %ecx
+ +# ifdef USE_AS_WMEMCMP
+ +     xorl    %eax, %eax
+ +     movl    VEC_SIZE(%rdi, %rcx, 4), %edx
+ +     cmpl    VEC_SIZE(%rsi, %rcx, 4), %edx
+ +     jmp     L(wmemcmp_return)
+ +# else
+ +     movzbl  VEC_SIZE(%rdi, %rcx), %eax
+ +     movzbl  VEC_SIZE(%rsi, %rcx), %edx
+ +     sub     %edx, %eax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x2):
+ +     tzcntl  %eax, %ecx
+ +# ifdef USE_AS_WMEMCMP
+ +     xorl    %eax, %eax
+ +     movl    (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+ +     cmpl    (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+ +     jmp     L(wmemcmp_return)
+ +# else
+ +     movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ +     movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ +     sub     %edx, %eax
+ +# endif
+ +     ret
+ +END (MEMCMP)
+ +#endif
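Not part of the patch: step 1 of the header comment above (sizes 2 to 7 handled branch-free with movbe/bswap and overlapping loads) can be illustrated in C roughly as follows. The helper is a sketch of the 4..7 byte case only, not the glibc code itself, and uses the GCC/Clang __builtin_bswap32 builtin in place of movbe.

#include <stdint.h>
#include <string.h>

/* Two overlapping 4-byte loads cover any length 4..7; converting them to
   big-endian lets one unsigned comparison order the buffers the way memcmp
   would.  */
int
cmp_4_to_7 (const unsigned char *a, const unsigned char *b, size_t n)
{
  uint32_t a_lo, a_hi, b_lo, b_hi;
  memcpy (&a_hi, a, 4);          memcpy (&b_hi, b, 4);
  memcpy (&a_lo, a + n - 4, 4);  memcpy (&b_lo, b + n - 4, 4);
  uint64_t av = ((uint64_t) __builtin_bswap32 (a_hi) << 32) | __builtin_bswap32 (a_lo);
  uint64_t bv = ((uint64_t) __builtin_bswap32 (b_hi) << 32) | __builtin_bswap32 (b_lo);
  return (av > bv) - (av < bv);
}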
+ diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+ new file mode 100644
+ index 0000000000..1ec1962e86
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+ @@ -0,0 +1,17 @@
+ +#if IS_IN (libc)
+ +# define VEC_SIZE    32
+ +# define VEC(i)              ymm##i
+ +# define VMOVNT              vmovntdq
+ +# define VMOVU               vmovdqu
+ +# define VMOVA               vmovdqa
+ +
+ +# define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +# define VZEROUPPER_RETURN jmp        L(return)
+ +
+ +# define SECTION(p)          p##.avx.rtm
+ +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm
+ +
+ +# include "memmove-vec-unaligned-erms.S"
+ +#endif
+ diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+ index aac1515cf6..7dad1ad74c 100644
+ --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+ +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+ @@ -1,11 +1,25 @@
+  #if IS_IN (libc)
+  # define VEC_SIZE    64
+ -# define VEC(i)              zmm##i
+ +# define XMM0                xmm16
+ +# define XMM1                xmm17
+ +# define YMM0                ymm16
+ +# define YMM1                ymm17
+ +# define VEC0                zmm16
+ +# define VEC1                zmm17
+ +# define VEC2                zmm18
+ +# define VEC3                zmm19
+ +# define VEC4                zmm20
+ +# define VEC5                zmm21
+ +# define VEC6                zmm22
+ +# define VEC7                zmm23
+ +# define VEC8                zmm24
+ +# define VEC(i)              VEC##i
+  # define VMOVNT              vmovntdq
+  # define VMOVU               vmovdqu64
+  # define VMOVA               vmovdqa64
+ +# define VZEROUPPER
+  
+ -# define SECTION(p)          p##.avx512
+ +# define SECTION(p)          p##.evex512
+  # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
+  
+  # include "memmove-vec-unaligned-erms.S"
+ diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+ new file mode 100644
+ index 0000000000..b879007e89
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+ @@ -0,0 +1,26 @@
+ +#if IS_IN (libc)
+ +# define VEC_SIZE    32
+ +# define XMM0                xmm16
+ +# define XMM1                xmm17
+ +# define YMM0                ymm16
+ +# define YMM1                ymm17
+ +# define VEC0                ymm16
+ +# define VEC1                ymm17
+ +# define VEC2                ymm18
+ +# define VEC3                ymm19
+ +# define VEC4                ymm20
+ +# define VEC5                ymm21
+ +# define VEC6                ymm22
+ +# define VEC7                ymm23
+ +# define VEC8                ymm24
+ +# define VEC(i)              VEC##i
+ +# define VMOVNT              vmovntdq
+ +# define VMOVU               vmovdqu64
+ +# define VMOVA               vmovdqa64
+ +# define VZEROUPPER
+ +
+ +# define SECTION(p)          p##.evex
+ +# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+ +
+ +# include "memmove-vec-unaligned-erms.S"
+ +#endif
+ diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ index c763b7d871..d13d23d6ce 100644
+ --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ @@ -48,6 +48,14 @@
+  # define MEMMOVE_CHK_SYMBOL(p,s)     MEMMOVE_SYMBOL(p, s)
+  #endif
+  
+ +#ifndef XMM0
+ +# define XMM0                                xmm0
+ +#endif
+ +
+ +#ifndef YMM0
+ +# define YMM0                                ymm0
+ +#endif
+ +
+  #ifndef VZEROUPPER
+  # if VEC_SIZE > 16
+  #  define VZEROUPPER vzeroupper
+ @@ -67,6 +75,13 @@
+  # define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
+  #endif
+  
+ +/* Avoid short distance rep movsb only with non-SSE vector.  */
+ +#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+ +# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+ +#else
+ +# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+ +#endif
+ +
+  #ifndef PREFETCH
+  # define PREFETCH(addr) prefetcht0 addr
+  #endif
+ @@ -143,11 +158,12 @@ L(last_2x_vec):
+       VMOVU   -VEC_SIZE(%rsi,%rdx), %VEC(1)
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ -     VZEROUPPER
+  #if !defined USE_MULTIARCH || !IS_IN (libc)
+  L(nop):
+ -#endif
+       ret
+ +#else
+ +     VZEROUPPER_RETURN
+ +#endif
+  #if defined USE_MULTIARCH && IS_IN (libc)
+  END (MEMMOVE_SYMBOL (__memmove, unaligned))
+  
+ @@ -240,11 +256,14 @@ L(last_2x_vec):
+       VMOVU   %VEC(0), (%rdi)
+       VMOVU   %VEC(1), -VEC_SIZE(%rdi,%rdx)
+  L(return):
+ -     VZEROUPPER
+ +#if VEC_SIZE > 16
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+ +#else
+       ret
+ +#endif
+  
+  L(movsb):
+ -     cmpq    __x86_shared_non_temporal_threshold(%rip), %rdx
+ +     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+       jae     L(more_8x_vec)
+       cmpq    %rsi, %rdi
+       jb      1f
+ @@ -257,7 +276,21 @@ L(movsb):
+  #  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+  # endif
+       jb      L(more_8x_vec_backward)
+ +# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ +     movq    %rdi, %rcx
+ +     subq    %rsi, %rcx
+ +     jmp     2f
+ +# endif
+  1:
+ +# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ +     movq    %rsi, %rcx
+ +     subq    %rdi, %rcx
+ +2:
+ +/* Avoid "rep movsb" if RCX, the distance between source and destination,
+ +   is N*4GB + [1..63] with N >= 0.  */
+ +     cmpl    $63, %ecx
+ +     jbe     L(more_2x_vec)  /* Avoid "rep movsb" if ECX <= 63.  */
+ +# endif
+       mov     %RDX_LP, %RCX_LP
+       rep movsb
+  L(nop):
+ @@ -291,21 +324,20 @@ L(less_vec):
+  #if VEC_SIZE > 32
+  L(between_32_63):
+       /* From 32 to 63.  No branch when size == 32.  */
+ -     vmovdqu (%rsi), %ymm0
+ -     vmovdqu -32(%rsi,%rdx), %ymm1
+ -     vmovdqu %ymm0, (%rdi)
+ -     vmovdqu %ymm1, -32(%rdi,%rdx)
+ -     VZEROUPPER
+ -     ret
+ +     VMOVU   (%rsi), %YMM0
+ +     VMOVU   -32(%rsi,%rdx), %YMM1
+ +     VMOVU   %YMM0, (%rdi)
+ +     VMOVU   %YMM1, -32(%rdi,%rdx)
+ +     VZEROUPPER_RETURN
+  #endif
+  #if VEC_SIZE > 16
+       /* From 16 to 31.  No branch when size == 16.  */
+  L(between_16_31):
+ -     vmovdqu (%rsi), %xmm0
+ -     vmovdqu -16(%rsi,%rdx), %xmm1
+ -     vmovdqu %xmm0, (%rdi)
+ -     vmovdqu %xmm1, -16(%rdi,%rdx)
+ -     ret
+ +     VMOVU   (%rsi), %XMM0
+ +     VMOVU   -16(%rsi,%rdx), %XMM1
+ +     VMOVU   %XMM0, (%rdi)
+ +     VMOVU   %XMM1, -16(%rdi,%rdx)
+ +     VZEROUPPER_RETURN
+  #endif
+  L(between_8_15):
+       /* From 8 to 15.  No branch when size == 8.  */
+ @@ -358,8 +390,7 @@ L(more_2x_vec):
+       VMOVU   %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+       VMOVU   %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+       VMOVU   %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  L(last_4x_vec):
+       /* Copy from 2 * VEC to 4 * VEC. */
+       VMOVU   (%rsi), %VEC(0)
+ @@ -370,8 +401,7 @@ L(last_4x_vec):
+       VMOVU   %VEC(1), VEC_SIZE(%rdi)
+       VMOVU   %VEC(2), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  L(more_8x_vec):
+       cmpq    %rsi, %rdi
+ @@ -402,7 +432,7 @@ L(more_8x_vec):
+       addq    %r8, %rdx
+  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+       /* Check non-temporal store threshold.  */
+ -     cmpq    __x86_shared_non_temporal_threshold(%rip), %rdx
+ +     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+       ja      L(large_forward)
+  #endif
+  L(loop_4x_vec_forward):
+ @@ -427,8 +457,7 @@ L(loop_4x_vec_forward):
+       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
+       /* Store the first VEC.  */
+       VMOVU   %VEC(4), (%r11)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  L(more_8x_vec_backward):
+       /* Load the first 4 * VEC and last VEC to support overlapping
+ @@ -454,7 +483,7 @@ L(more_8x_vec_backward):
+       subq    %r8, %rdx
+  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+       /* Check non-temporal store threshold.  */
+ -     cmpq    __x86_shared_non_temporal_threshold(%rip), %rdx
+ +     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+       ja      L(large_backward)
+  #endif
+  L(loop_4x_vec_backward):
+ @@ -479,8 +508,7 @@ L(loop_4x_vec_backward):
+       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
+       /* Store the last VEC.  */
+       VMOVU   %VEC(8), (%r11)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+  L(large_forward):
+ @@ -515,8 +543,7 @@ L(loop_large_forward):
+       VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
+       /* Store the first VEC.  */
+       VMOVU   %VEC(4), (%r11)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  L(large_backward):
+       /* Don't use non-temporal store if there is overlap between
+ @@ -550,8 +577,7 @@ L(loop_large_backward):
+       VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
+       /* Store the last VEC.  */
+       VMOVU   %VEC(8), (%r11)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  #endif
+  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+  
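Not part of the patch: the AVOID_SHORT_DISTANCE_REP_MOVSB block added to memmove-vec-unaligned-erms.S above skips "rep movsb" when the low 32 bits of the source/destination distance are at most 63 (the cmpl $63, %ecx). A C sketch of that check, with an illustrative helper name:

#include <stdbool.h>
#include <stdint.h>

/* Take the distance in whichever direction the copy runs, look only at its
   low 32 bits (hence "N*4GB + [1..63]" in the patch comment), and fall back
   to the vector loop for tiny distances.  */
bool
rep_movsb_ok (const void *dst, const void *src)
{
  uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;
  uint32_t dist32 = (uint32_t) (d >= s ? d - s : s - d);
  return dist32 > 63;
}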
+ diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..cea2d2a72d
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef MEMRCHR
+ +# define MEMRCHR __memrchr_avx2_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "memrchr-avx2.S"
+ diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+ index f5437b54de..c8d54c08d6 100644
+ --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+ +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+ @@ -20,14 +20,22 @@
+  
+  # include <sysdep.h>
+  
+ +# ifndef MEMRCHR
+ +#  define MEMRCHR    __memrchr_avx2
+ +# endif
+ +
+  # ifndef VZEROUPPER
+  #  define VZEROUPPER vzeroupper
+  # endif
+  
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+  # define VEC_SIZE 32
+  
+ -     .section .text.avx,"ax",@progbits
+ -ENTRY (__memrchr_avx2)
+ +     .section SECTION(.text),"ax",@progbits
+ +ENTRY (MEMRCHR)
+       /* Broadcast CHAR to YMM0.  */
+       vmovd   %esi, %xmm0
+       vpbroadcastb %xmm0, %ymm0
+ @@ -134,8 +142,8 @@ L(loop_4x_vec):
+       vpmovmskb %ymm1, %eax
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +L(return_vzeroupper):
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+  
+       .p2align 4
+  L(last_4x_vec_or_less):
+ @@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
+       addq    %rax, %rdx
+       jl      L(zero)
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_2x_vec):
+ @@ -191,31 +198,27 @@ L(last_2x_vec):
+       jl      L(zero)
+       addl    $(VEC_SIZE * 2), %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_x0):
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_x1):
+       bsrl    %eax, %eax
+       addl    $VEC_SIZE, %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_x2):
+       bsrl    %eax, %eax
+       addl    $(VEC_SIZE * 2), %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_x3):
+ @@ -232,8 +235,7 @@ L(last_vec_x1_check):
+       jl      L(zero)
+       addl    $VEC_SIZE, %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_x3_check):
+ @@ -243,12 +245,14 @@ L(last_vec_x3_check):
+       jl      L(zero)
+       addl    $(VEC_SIZE * 3), %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(zero):
+ -     VZEROUPPER
+ +     xorl    %eax, %eax
+ +     VZEROUPPER_RETURN
+ +
+ +     .p2align 4
+  L(null):
+       xorl    %eax, %eax
+       ret
+ @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
+  
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_or_less):
+ @@ -315,8 +318,7 @@ L(last_vec_or_less):
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       addq    %r8, %rax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(last_vec_2x_aligned):
+ @@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
+       bsrl    %eax, %eax
+       addq    %rdi, %rax
+       addq    %r8, %rax
+ -     VZEROUPPER
+ -     ret
+ -END (__memrchr_avx2)
+ +     VZEROUPPER_RETURN
+ +END (MEMRCHR)
+  #endif
+ diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
+ new file mode 100644
+ index 0000000000..16bf8e02b1
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
+ @@ -0,0 +1,337 @@
+ +/* memrchr optimized with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#if IS_IN (libc)
+ +
+ +# include <sysdep.h>
+ +
+ +# define VMOVA               vmovdqa64
+ +
+ +# define YMMMATCH    ymm16
+ +
+ +# define VEC_SIZE 32
+ +
+ +     .section .text.evex,"ax",@progbits
+ +ENTRY (__memrchr_evex)
+ +     /* Broadcast CHAR to YMMMATCH.  */
+ +     vpbroadcastb %esi, %YMMMATCH
+ +
+ +     sub     $VEC_SIZE, %RDX_LP
+ +     jbe     L(last_vec_or_less)
+ +
+ +     add     %RDX_LP, %RDI_LP
+ +
+ +     /* Check the last VEC_SIZE bytes.  */
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x0)
+ +
+ +     subq    $(VEC_SIZE * 4), %rdi
+ +     movl    %edi, %ecx
+ +     andl    $(VEC_SIZE - 1), %ecx
+ +     jz      L(aligned_more)
+ +
+ +     /* Align data for aligned loads in the loop.  */
+ +     addq    $VEC_SIZE, %rdi
+ +     addq    $VEC_SIZE, %rdx
+ +     andq    $-VEC_SIZE, %rdi
+ +     subq    %rcx, %rdx
+ +
+ +     .p2align 4
+ +L(aligned_more):
+ +     subq    $(VEC_SIZE * 4), %rdx
+ +     jbe     L(last_4x_vec_or_less)
+ +
+ +     /* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ +        since data is only aligned to VEC_SIZE.  */
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x3)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+ +     kmovd   %k2, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2)
+ +
+ +     vpcmpb  $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+ +     kmovd   %k3, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x1)
+ +
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k4
+ +     kmovd   %k4, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x0)
+ +
+ +     /* Align data to 4 * VEC_SIZE for loop with fewer branches.
+ +        There are some overlaps with above if data isn't aligned
+ +        to 4 * VEC_SIZE.  */
+ +     movl    %edi, %ecx
+ +     andl    $(VEC_SIZE * 4 - 1), %ecx
+ +     jz      L(loop_4x_vec)
+ +
+ +     addq    $(VEC_SIZE * 4), %rdi
+ +     addq    $(VEC_SIZE * 4), %rdx
+ +     andq    $-(VEC_SIZE * 4), %rdi
+ +     subq    %rcx, %rdx
+ +
+ +     .p2align 4
+ +L(loop_4x_vec):
+ +     /* Compare 4 * VEC at a time forward.  */
+ +     subq    $(VEC_SIZE * 4), %rdi
+ +     subq    $(VEC_SIZE * 4), %rdx
+ +     jbe     L(last_4x_vec_or_less)
+ +
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k1
+ +     vpcmpb  $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+ +     kord    %k1, %k2, %k5
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+ +
+ +     kord    %k3, %k4, %k6
+ +     kortestd %k5, %k6
+ +     jz      L(loop_4x_vec)
+ +
+ +     /* There is a match.  */
+ +     kmovd   %k4, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x3)
+ +
+ +     kmovd   %k3, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2)
+ +
+ +     kmovd   %k2, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x1)
+ +
+ +     kmovd   %k1, %eax
+ +     bsrl    %eax, %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_4x_vec_or_less):
+ +     addl    $(VEC_SIZE * 4), %edx
+ +     cmpl    $(VEC_SIZE * 2), %edx
+ +     jbe     L(last_2x_vec)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x3)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
+ +     kmovd   %k2, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x2)
+ +
+ +     vpcmpb  $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
+ +     kmovd   %k3, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x1_check)
+ +     cmpl    $(VEC_SIZE * 3), %edx
+ +     jbe     L(zero)
+ +
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k4
+ +     kmovd   %k4, %eax
+ +     testl   %eax, %eax
+ +     jz      L(zero)
+ +     bsrl    %eax, %eax
+ +     subq    $(VEC_SIZE * 4), %rdx
+ +     addq    %rax, %rdx
+ +     jl      L(zero)
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_2x_vec):
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x3_check)
+ +     cmpl    $VEC_SIZE, %edx
+ +     jbe     L(zero)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jz      L(zero)
+ +     bsrl    %eax, %eax
+ +     subq    $(VEC_SIZE * 2), %rdx
+ +     addq    %rax, %rdx
+ +     jl      L(zero)
+ +     addl    $(VEC_SIZE * 2), %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x0):
+ +     bsrl    %eax, %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x1):
+ +     bsrl    %eax, %eax
+ +     addl    $VEC_SIZE, %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x2):
+ +     bsrl    %eax, %eax
+ +     addl    $(VEC_SIZE * 2), %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x3):
+ +     bsrl    %eax, %eax
+ +     addl    $(VEC_SIZE * 3), %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x1_check):
+ +     bsrl    %eax, %eax
+ +     subq    $(VEC_SIZE * 3), %rdx
+ +     addq    %rax, %rdx
+ +     jl      L(zero)
+ +     addl    $VEC_SIZE, %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_x3_check):
+ +     bsrl    %eax, %eax
+ +     subq    $VEC_SIZE, %rdx
+ +     addq    %rax, %rdx
+ +     jl      L(zero)
+ +     addl    $(VEC_SIZE * 3), %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(zero):
+ +     xorl    %eax, %eax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_or_less_aligned):
+ +     movl    %edx, %ecx
+ +
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k1
+ +
+ +     movl    $1, %edx
+ +     /* Support rdx << 32.  */
+ +     salq    %cl, %rdx
+ +     subq    $1, %rdx
+ +
+ +     kmovd   %k1, %eax
+ +
+ +     /* Remove the trailing bytes.  */
+ +     andl    %edx, %eax
+ +     testl   %eax, %eax
+ +     jz      L(zero)
+ +
+ +     bsrl    %eax, %eax
+ +     addq    %rdi, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_or_less):
+ +     addl    $VEC_SIZE, %edx
+ +
+ +     /* Check for zero length.  */
+ +     testl   %edx, %edx
+ +     jz      L(zero)
+ +
+ +     movl    %edi, %ecx
+ +     andl    $(VEC_SIZE - 1), %ecx
+ +     jz      L(last_vec_or_less_aligned)
+ +
+ +     movl    %ecx, %esi
+ +     movl    %ecx, %r8d
+ +     addl    %edx, %esi
+ +     andq    $-VEC_SIZE, %rdi
+ +
+ +     subl    $VEC_SIZE, %esi
+ +     ja      L(last_vec_2x_aligned)
+ +
+ +     /* Check the last VEC.  */
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k1
+ +     kmovd   %k1, %eax
+ +
+ +     /* Remove the leading and trailing bytes.  */
+ +     sarl    %cl, %eax
+ +     movl    %edx, %ecx
+ +
+ +     movl    $1, %edx
+ +     sall    %cl, %edx
+ +     subl    $1, %edx
+ +
+ +     andl    %edx, %eax
+ +     testl   %eax, %eax
+ +     jz      L(zero)
+ +
+ +     bsrl    %eax, %eax
+ +     addq    %rdi, %rax
+ +     addq    %r8, %rax
+ +     ret
+ +
+ +     .p2align 4
+ +L(last_vec_2x_aligned):
+ +     movl    %esi, %ecx
+ +
+ +     /* Check the last VEC.  */
+ +     vpcmpb  $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+ +
+ +     movl    $1, %edx
+ +     sall    %cl, %edx
+ +     subl    $1, %edx
+ +
+ +     kmovd   %k1, %eax
+ +
+ +     /* Remove the trailing bytes.  */
+ +     andl    %edx, %eax
+ +
+ +     testl   %eax, %eax
+ +     jnz     L(last_vec_x1)
+ +
+ +     /* Check the second last VEC.  */
+ +     vpcmpb  $0, (%rdi), %YMMMATCH, %k1
+ +
+ +     movl    %r8d, %ecx
+ +
+ +     kmovd   %k1, %eax
+ +
+ +     /* Remove the leading bytes.  Must use unsigned right shift for
+ +        bsrl below.  */
+ +     shrl    %cl, %eax
+ +     testl   %eax, %eax
+ +     jz      L(zero)
+ +
+ +     bsrl    %eax, %eax
+ +     addq    %rdi, %rax
+ +     addq    %r8, %rax
+ +     ret
+ +END (__memrchr_evex)
+ +#endif
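A note on __memrchr_evex and the other EVEX variants introduced below: they keep all vector state in ymm16 and up (YMMMATCH is ymm16 here). Registers 16-31 are reachable only through the EVEX encoding, and because ymm0-ymm15 are never written there is no dirty upper state for vzeroupper to clean, so these functions can leave VZEROUPPER empty and finish with a plain ret, which also makes them safe to run inside RTM transactions (vzeroupper would abort the transaction). A minimal sketch of the pattern (illustrative only, register choices assumed, requires AVX512VL/BW):

	vpbroadcastb %esi, %ymm16	/* search byte in an EVEX-only register */
	vpcmpb	$0, (%rdi), %ymm16, %k1	/* compare into a mask register         */
	kmovd	%k1, %eax
	ret				/* no vzeroupper needed                 */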
+ diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+ new file mode 100644
+ index 0000000000..8ac3e479bb
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+ @@ -0,0 +1,10 @@
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +#define MEMSET_SYMBOL(p,s)   p##_avx2_##s##_rtm
+ +#define WMEMSET_SYMBOL(p,s)  p##_avx2_##s##_rtm
+ +
+ +#include "memset-avx2-unaligned-erms.S"
+ diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+ index 7ab3d89849..ae0860f36a 100644
+ --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+ +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+ @@ -14,9 +14,15 @@
+    movq r, %rax; \
+    vpbroadcastd %xmm0, %ymm0
+  
+ -# define SECTION(p)          p##.avx
+ -# define MEMSET_SYMBOL(p,s)  p##_avx2_##s
+ -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
+ +# ifndef SECTION
+ +#  define SECTION(p)         p##.avx
+ +# endif
+ +# ifndef MEMSET_SYMBOL
+ +#  define MEMSET_SYMBOL(p,s) p##_avx2_##s
+ +# endif
+ +# ifndef WMEMSET_SYMBOL
+ +#  define WMEMSET_SYMBOL(p,s)        p##_avx2_##s
+ +# endif
+  
+  # include "memset-vec-unaligned-erms.S"
+  #endif
+ diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+ index 0783979ca5..22e7b187c8 100644
+ --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+ +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+ @@ -1,22 +1,22 @@
+  #if IS_IN (libc)
+  # define VEC_SIZE    64
+ -# define VEC(i)              zmm##i
+ +# define XMM0                xmm16
+ +# define YMM0                ymm16
+ +# define VEC0                zmm16
+ +# define VEC(i)              VEC##i
+  # define VMOVU               vmovdqu64
+  # define VMOVA               vmovdqa64
+ +# define VZEROUPPER
+  
+  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ -  vmovd d, %xmm0; \
+    movq r, %rax; \
+ -  vpbroadcastb %xmm0, %xmm0; \
+ -  vpbroadcastq %xmm0, %zmm0
+ +  vpbroadcastb d, %VEC0
+  
+  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ -  vmovd d, %xmm0; \
+    movq r, %rax; \
+ -  vpbroadcastd %xmm0, %xmm0; \
+ -  vpbroadcastq %xmm0, %zmm0
+ +  vpbroadcastd d, %VEC0
+  
+ -# define SECTION(p)          p##.avx512
+ +# define SECTION(p)          p##.evex512
+  # define MEMSET_SYMBOL(p,s)  p##_avx512_##s
+  # define WMEMSET_SYMBOL(p,s) p##_avx512_##s
+  
+ diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+ new file mode 100644
+ index 0000000000..ae0a4d6e46
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+ @@ -0,0 +1,24 @@
+ +#if IS_IN (libc)
+ +# define VEC_SIZE    32
+ +# define XMM0                xmm16
+ +# define YMM0                ymm16
+ +# define VEC0                ymm16
+ +# define VEC(i)              VEC##i
+ +# define VMOVU               vmovdqu64
+ +# define VMOVA               vmovdqa64
+ +# define VZEROUPPER
+ +
+ +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ +  movq r, %rax; \
+ +  vpbroadcastb d, %VEC0
+ +
+ +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ +  movq r, %rax; \
+ +  vpbroadcastd d, %VEC0
+ +
+ +# define SECTION(p)          p##.evex
+ +# define MEMSET_SYMBOL(p,s)  p##_evex_##s
+ +# define WMEMSET_SYMBOL(p,s) p##_evex_##s
+ +
+ +# include "memset-vec-unaligned-erms.S"
+ +#endif
+ diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+ index af2299709c..16bed6ec11 100644
+ --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+ +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+ @@ -34,20 +34,25 @@
+  # define WMEMSET_CHK_SYMBOL(p,s)     WMEMSET_SYMBOL(p, s)
+  #endif
+  
+ +#ifndef XMM0
+ +# define XMM0                                xmm0
+ +#endif
+ +
+ +#ifndef YMM0
+ +# define YMM0                                ymm0
+ +#endif
+ +
+  #ifndef VZEROUPPER
+  # if VEC_SIZE > 16
+  #  define VZEROUPPER                 vzeroupper
+ +#  define VZEROUPPER_SHORT_RETURN    vzeroupper; ret
+  # else
+  #  define VZEROUPPER
+  # endif
+  #endif
+  
+  #ifndef VZEROUPPER_SHORT_RETURN
+ -# if VEC_SIZE > 16
+ -#  define VZEROUPPER_SHORT_RETURN    vzeroupper
+ -# else
+ -#  define VZEROUPPER_SHORT_RETURN    rep
+ -# endif
+ +# define VZEROUPPER_SHORT_RETURN     rep; ret
+  #endif
+  
+  #ifndef MOVQ
+ @@ -77,7 +82,7 @@
+  ENTRY (__bzero)
+       mov     %RDI_LP, %RAX_LP /* Set return value.  */
+       mov     %RSI_LP, %RDX_LP /* Set n.  */
+ -     pxor    %xmm0, %xmm0
+ +     pxor    %XMM0, %XMM0
+       jmp     L(entry_from_bzero)
+  END (__bzero)
+  weak_alias (__bzero, bzero)
+ @@ -119,8 +124,7 @@ L(entry_from_bzero):
+       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  #if defined USE_MULTIARCH && IS_IN (libc)
+  END (MEMSET_SYMBOL (__memset, unaligned))
+  
+ @@ -143,14 +147,12 @@ ENTRY (__memset_erms)
+  ENTRY (MEMSET_SYMBOL (__memset, erms))
+  # endif
+  L(stosb):
+ -     /* Issue vzeroupper before rep stosb.  */
+ -     VZEROUPPER
+       mov     %RDX_LP, %RCX_LP
+       movzbl  %sil, %eax
+       mov     %RDI_LP, %RDX_LP
+       rep stosb
+       mov     %RDX_LP, %RAX_LP
+ -     ret
+ +     VZEROUPPER_RETURN
+  # if VEC_SIZE == 16
+  END (__memset_erms)
+  # else
+ @@ -177,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  L(stosb_more_2x_vec):
+       cmpq    $REP_STOSB_THRESHOLD, %rdx
+ @@ -192,8 +193,11 @@ L(more_2x_vec):
+       VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
+       VMOVU   %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+  L(return):
+ -     VZEROUPPER
+ +#if VEC_SIZE > 16
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+ +#else
+       ret
+ +#endif
+  
+  L(loop_start):
+       leaq    (VEC_SIZE * 4)(%rdi), %rcx
+ @@ -219,7 +223,6 @@ L(loop):
+       cmpq    %rcx, %rdx
+       jne     L(loop)
+       VZEROUPPER_SHORT_RETURN
+ -     ret
+  L(less_vec):
+       /* Less than 1 VEC.  */
+  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ @@ -233,7 +236,7 @@ L(less_vec):
+       cmpb    $16, %dl
+       jae     L(between_16_31)
+  # endif
+ -     MOVQ    %xmm0, %rcx
+ +     MOVQ    %XMM0, %rcx
+       cmpb    $8, %dl
+       jae     L(between_8_15)
+       cmpb    $4, %dl
+ @@ -243,40 +246,34 @@ L(less_vec):
+       jb      1f
+       movb    %cl, (%rdi)
+  1:
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  # if VEC_SIZE > 32
+       /* From 32 to 63.  No branch when size == 32.  */
+  L(between_32_63):
+ -     vmovdqu %ymm0, -32(%rdi,%rdx)
+ -     vmovdqu %ymm0, (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VMOVU   %YMM0, -32(%rdi,%rdx)
+ +     VMOVU   %YMM0, (%rdi)
+ +     VZEROUPPER_RETURN
+  # endif
+  # if VEC_SIZE > 16
+       /* From 16 to 31.  No branch when size == 16.  */
+  L(between_16_31):
+ -     vmovdqu %xmm0, -16(%rdi,%rdx)
+ -     vmovdqu %xmm0, (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VMOVU   %XMM0, -16(%rdi,%rdx)
+ +     VMOVU   %XMM0, (%rdi)
+ +     VZEROUPPER_RETURN
+  # endif
+       /* From 8 to 15.  No branch when size == 8.  */
+  L(between_8_15):
+       movq    %rcx, -8(%rdi,%rdx)
+       movq    %rcx, (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  L(between_4_7):
+       /* From 4 to 7.  No branch when size == 4.  */
+       movl    %ecx, -4(%rdi,%rdx)
+       movl    %ecx, (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  L(between_2_3):
+       /* From 2 to 3.  No branch when size == 2.  */
+       movw    %cx, -2(%rdi,%rdx)
+       movw    %cx, (%rdi)
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  END (MEMSET_SYMBOL (__memset, unaligned_erms))
+ diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..acc5f6e2fb
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S
+ @@ -0,0 +1,4 @@
+ +#define MEMCHR __rawmemchr_avx2_rtm
+ +#define USE_AS_RAWMEMCHR 1
+ +
+ +#include "memchr-avx2-rtm.S"
+ diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+ new file mode 100644
+ index 0000000000..ec942b77ba
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+ @@ -0,0 +1,4 @@
+ +#define MEMCHR __rawmemchr_evex
+ +#define USE_AS_RAWMEMCHR 1
+ +
+ +#include "memchr-evex.S"
+ diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..2b9c07a59f
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+ @@ -0,0 +1,3 @@
+ +#define USE_AS_STPCPY
+ +#define STRCPY __stpcpy_avx2_rtm
+ +#include "strcpy-avx2-rtm.S"
+ diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+ new file mode 100644
+ index 0000000000..7c6f26cd98
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
+ @@ -0,0 +1,3 @@
+ +#define USE_AS_STPCPY
+ +#define STRCPY __stpcpy_evex
+ +#include "strcpy-evex.S"
+ diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..60a2ccfe53
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+ @@ -0,0 +1,4 @@
+ +#define USE_AS_STPCPY
+ +#define USE_AS_STRNCPY
+ +#define STRCPY __stpncpy_avx2_rtm
+ +#include "strcpy-avx2-rtm.S"
+ diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+ new file mode 100644
+ index 0000000000..1570014d1c
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
+ @@ -0,0 +1,4 @@
+ +#define USE_AS_STPCPY
+ +#define USE_AS_STRNCPY
+ +#define STRCPY __stpncpy_evex
+ +#include "strcpy-evex.S"
+ diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..637fb557c4
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef STRCAT
+ +# define STRCAT __strcat_avx2_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "strcat-avx2.S"
+ diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
+ index a4143bf8f5..1e6d4827ee 100644
+ --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+ +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
+ @@ -30,7 +30,11 @@
+  /* Number of bytes in a vector register */
+  # define VEC_SIZE    32
+  
+ -     .section .text.avx,"ax",@progbits
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+ +     .section SECTION(.text),"ax",@progbits
+  ENTRY (STRCAT)
+       mov     %rdi, %r9
+  # ifdef USE_AS_STRNCAT
+ diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
+ new file mode 100644
+ index 0000000000..97c3d85b6d
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strcat-evex.S
+ @@ -0,0 +1,283 @@
+ +/* strcat with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#if IS_IN (libc)
+ +
+ +# include <sysdep.h>
+ +
+ +# ifndef STRCAT
+ +#  define STRCAT  __strcat_evex
+ +# endif
+ +
+ +# define VMOVU               vmovdqu64
+ +# define VMOVA               vmovdqa64
+ +
+ +/* zero register */
+ +# define XMMZERO     xmm16
+ +# define YMMZERO     ymm16
+ +# define YMM0                ymm17
+ +# define YMM1                ymm18
+ +
+ +# define USE_AS_STRCAT
+ +
+ +/* Number of bytes in a vector register */
+ +# define VEC_SIZE    32
+ +
+ +     .section .text.evex,"ax",@progbits
+ +ENTRY (STRCAT)
+ +     mov     %rdi, %r9
+ +# ifdef USE_AS_STRNCAT
+ +     mov     %rdx, %r8
+ +# endif
+ +
+ +     xor     %eax, %eax
+ +     mov     %edi, %ecx
+ +     and     $((VEC_SIZE * 4) - 1), %ecx
+ +     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
+ +     cmp     $(VEC_SIZE * 3), %ecx
+ +     ja      L(fourth_vector_boundary)
+ +     vpcmpb  $0, (%rdi), %YMMZERO, %k0
+ +     kmovd   %k0, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_first_vector)
+ +     mov     %rdi, %rax
+ +     and     $-VEC_SIZE, %rax
+ +     jmp     L(align_vec_size_start)
+ +L(fourth_vector_boundary):
+ +     mov     %rdi, %rax
+ +     and     $-VEC_SIZE, %rax
+ +     vpcmpb  $0, (%rax), %YMMZERO, %k0
+ +     mov     $-1, %r10d
+ +     sub     %rax, %rcx
+ +     shl     %cl, %r10d
+ +     kmovd   %k0, %edx
+ +     and     %r10d, %edx
+ +     jnz     L(exit)
+ +
+ +L(align_vec_size_start):
+ +     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ +     kmovd   %k0, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_second_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_third_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ +     kmovd   %k2, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fourth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ +     kmovd   %k3, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fifth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ +     add     $(VEC_SIZE * 4), %rax
+ +     kmovd   %k4, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_second_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_third_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ +     kmovd   %k2, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fourth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ +     kmovd   %k3, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fifth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ +     kmovd   %k4, %edx
+ +     add     $(VEC_SIZE * 4), %rax
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_second_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_third_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ +     kmovd   %k2, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fourth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ +     kmovd   %k3, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fifth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ +     add     $(VEC_SIZE * 4), %rax
+ +     kmovd   %k4, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_second_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_third_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ +     kmovd   %k2, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fourth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ +     kmovd   %k3, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fifth_vector)
+ +
+ +     test    $((VEC_SIZE * 4) - 1), %rax
+ +     jz      L(align_four_vec_loop)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ +     add     $(VEC_SIZE * 5), %rax
+ +     kmovd   %k4, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit)
+ +
+ +     test    $((VEC_SIZE * 4) - 1), %rax
+ +     jz      L(align_four_vec_loop)
+ +
+ +     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ +     add     $VEC_SIZE, %rax
+ +     kmovd   %k0, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit)
+ +
+ +     test    $((VEC_SIZE * 4) - 1), %rax
+ +     jz      L(align_four_vec_loop)
+ +
+ +     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ +     add     $VEC_SIZE, %rax
+ +     kmovd   %k0, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit)
+ +
+ +     test    $((VEC_SIZE * 4) - 1), %rax
+ +     jz      L(align_four_vec_loop)
+ +
+ +     vpcmpb  $0, VEC_SIZE(%rax), %YMMZERO, %k1
+ +     add     $VEC_SIZE, %rax
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit)
+ +
+ +     add     $VEC_SIZE, %rax
+ +
+ +     .p2align 4
+ +L(align_four_vec_loop):
+ +     VMOVA   (%rax), %YMM0
+ +     VMOVA   (VEC_SIZE * 2)(%rax), %YMM1
+ +     vpminub VEC_SIZE(%rax), %YMM0, %YMM0
+ +     vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
+ +     vpminub %YMM0, %YMM1, %YMM0
+ +     /* If K0 != 0, there is a null byte.  */
+ +     vpcmpb  $0, %YMM0, %YMMZERO, %k0
+ +     add     $(VEC_SIZE * 4), %rax
+ +     ktestd  %k0, %k0
+ +     jz      L(align_four_vec_loop)
+ +
+ +     vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+ +     sub     $(VEC_SIZE * 5), %rax
+ +     kmovd   %k0, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_second_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ +     kmovd   %k1, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_third_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ +     kmovd   %k2, %edx
+ +     test    %edx, %edx
+ +     jnz     L(exit_null_on_fourth_vector)
+ +
+ +     vpcmpb  $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ +     kmovd   %k3, %edx
+ +     sub     %rdi, %rax
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     add     $(VEC_SIZE * 4), %rax
+ +     jmp     L(StartStrcpyPart)
+ +
+ +     .p2align 4
+ +L(exit):
+ +     sub     %rdi, %rax
+ +L(exit_null_on_first_vector):
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     jmp     L(StartStrcpyPart)
+ +
+ +     .p2align 4
+ +L(exit_null_on_second_vector):
+ +     sub     %rdi, %rax
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     add     $VEC_SIZE, %rax
+ +     jmp     L(StartStrcpyPart)
+ +
+ +     .p2align 4
+ +L(exit_null_on_third_vector):
+ +     sub     %rdi, %rax
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     add     $(VEC_SIZE * 2), %rax
+ +     jmp     L(StartStrcpyPart)
+ +
+ +     .p2align 4
+ +L(exit_null_on_fourth_vector):
+ +     sub     %rdi, %rax
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     add     $(VEC_SIZE * 3), %rax
+ +     jmp     L(StartStrcpyPart)
+ +
+ +     .p2align 4
+ +L(exit_null_on_fifth_vector):
+ +     sub     %rdi, %rax
+ +     bsf     %rdx, %rdx
+ +     add     %rdx, %rax
+ +     add     $(VEC_SIZE * 4), %rax
+ +
+ +     .p2align 4
+ +L(StartStrcpyPart):
+ +     lea     (%r9, %rax), %rdi
+ +     mov     %rsi, %rcx
+ +     mov     %r9, %rax      /* save result */
+ +
+ +# ifdef USE_AS_STRNCAT
+ +     test    %r8, %r8
+ +     jz      L(ExitZero)
+ +#  define USE_AS_STRNCPY
+ +# endif
+ +
+ +# include "strcpy-evex.S"
+ +#endif
+ diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..81f20d1d8e
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef STRCHR
+ +# define STRCHR __strchr_avx2_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "strchr-avx2.S"
+ diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+ index 39fc69da7b..0a5217514a 100644
+ --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+ +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+ @@ -38,9 +38,13 @@
+  #  define VZEROUPPER vzeroupper
+  # endif
+  
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+  # define VEC_SIZE 32
+  
+ -     .section .text.avx,"ax",@progbits
+ +     .section SECTION(.text),"ax",@progbits
+  ENTRY (STRCHR)
+       movl    %edi, %ecx
+       /* Broadcast CHAR to YMM0.  */
+ @@ -93,8 +97,8 @@ L(cros_page_boundary):
+       cmp     (%rax), %CHAR_REG
+       cmovne  %rdx, %rax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +L(return_vzeroupper):
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
+  
+       .p2align 4
+  L(aligned_more):
+ @@ -190,8 +194,7 @@ L(first_vec_x0):
+       cmp     (%rax), %CHAR_REG
+       cmovne  %rdx, %rax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-  #define internal_syscall1(v0_init, input, number, err, arg1)         \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a3 asm ("$7");                             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a3 asm ("$7");                         \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -159,15 +160,17 @@
+       .p2align 4
+  L(first_vec_x1):
+ @@ -205,8 +208,7 @@ L(first_vec_x1):
+       cmp     (%rax), %CHAR_REG
+       cmovne  %rdx, %rax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     long long int _arg2 = ARGIFY (arg2);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
- -     register long long __a3 asm ("$7");                             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a1 asm ("$5") = _arg2;                 \
- +     register long long int __a3 asm ("$7");                         \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -185,16 +188,19 @@
-  #define internal_syscall3(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3)                             \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     long long int _arg2 = ARGIFY (arg2);                            \
- +     long long int _arg3 = ARGIFY (arg3);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
- -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
- -     register long long __a3 asm ("$7");                             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a1 asm ("$5") = _arg2;                 \
- +     register long long int __a2 asm ("$6") = _arg3;                 \
- +     register long long int __a3 asm ("$7");                         \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -212,16 +218,20 @@
-  #define internal_syscall4(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4)                       \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     long long int _arg2 = ARGIFY (arg2);                            \
- +     long long int _arg3 = ARGIFY (arg3);                            \
- +     long long int _arg4 = ARGIFY (arg4);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
- -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
- -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a1 asm ("$5") = _arg2;                 \
- +     register long long int __a2 asm ("$6") = _arg3;                 \
- +     register long long int __a3 asm ("$7") = _arg4;                 \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -239,17 +249,22 @@
-  #define internal_syscall5(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5)                 \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     long long int _arg2 = ARGIFY (arg2);                            \
- +     long long int _arg3 = ARGIFY (arg3);                            \
- +     long long int _arg4 = ARGIFY (arg4);                            \
- +     long long int _arg5 = ARGIFY (arg5);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
- -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
- -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
- -     register long long __a4 asm ("$8") = ARGIFY (arg5);             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a1 asm ("$5") = _arg2;                 \
- +     register long long int __a2 asm ("$6") = _arg3;                 \
- +     register long long int __a3 asm ("$7") = _arg4;                 \
- +     register long long int __a4 asm ("$8") = _arg5;                 \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -267,18 +282,24 @@
-  #define internal_syscall6(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5, arg6)           \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long long __s0 asm ("$16") __attribute__ ((unused))    \
- +     long long int _arg1 = ARGIFY (arg1);                            \
- +     long long int _arg2 = ARGIFY (arg2);                            \
- +     long long int _arg3 = ARGIFY (arg3);                            \
- +     long long int _arg4 = ARGIFY (arg4);                            \
- +     long long int _arg5 = ARGIFY (arg5);                            \
- +     long long int _arg6 = ARGIFY (arg6);                            \
- +     register long long int __s0 asm ("$16") __attribute__ ((unused))\
-         = (number);                                                   \
- -     register long long __v0 asm ("$2");                             \
- -     register long long __a0 asm ("$4") = ARGIFY (arg1);             \
- -     register long long __a1 asm ("$5") = ARGIFY (arg2);             \
- -     register long long __a2 asm ("$6") = ARGIFY (arg3);             \
- -     register long long __a3 asm ("$7") = ARGIFY (arg4);             \
- -     register long long __a4 asm ("$8") = ARGIFY (arg5);             \
- -     register long long __a5 asm ("$9") = ARGIFY (arg6);             \
- +     register long long int __v0 asm ("$2");                         \
- +     register long long int __a0 asm ("$4") = _arg1;                 \
- +     register long long int __a1 asm ("$5") = _arg2;                 \
- +     register long long int __a2 asm ("$6") = _arg3;                 \
- +     register long long int __a3 asm ("$7") = _arg4;                 \
- +     register long long int __a4 asm ("$8") = _arg5;                 \
- +     register long long int __a5 asm ("$9") = _arg6;                 \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
- index 9d30291f84..3e1f1cc3c5 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
- +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/sysdep.h
- @@ -50,7 +50,7 @@
-  #undef INLINE_SYSCALL
-  #define INLINE_SYSCALL(name, nr, args...)                            \
-    ({ INTERNAL_SYSCALL_DECL (_sc_err);                                        \
- -     long result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);   \
- +     long int result_var = INTERNAL_SYSCALL (name, _sc_err, nr, args);       \
-       if ( INTERNAL_SYSCALL_ERROR_P (result_var, _sc_err) )           \
-         {                                                             \
-        __set_errno (INTERNAL_SYSCALL_ERRNO (result_var, _sc_err));    \
- @@ -59,10 +59,10 @@
-       result_var; })
+       .p2align 4
+  L(first_vec_x2):
+ @@ -220,8 +222,7 @@ L(first_vec_x2):
+       cmp     (%rax), %CHAR_REG
+       cmovne  %rdx, %rax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(4x_vec_end):
+ @@ -247,8 +248,7 @@ L(first_vec_x3):
+       cmp     (%rax), %CHAR_REG
+       cmovne  %rdx, %rax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  END (STRCHR)
+  #endif
+ diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+ new file mode 100644
+ index 0000000000..ddc86a7058
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+ @@ -0,0 +1,335 @@
+ +/* strchr/strchrnul optimized with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
+ +   This file is part of the GNU C Library.
+ +
+ +   The GNU C Library is free software; you can redistribute it and/or
+ +   modify it under the terms of the GNU Lesser General Public
+ +   License as published by the Free Software Foundation; either
+ +   version 2.1 of the License, or (at your option) any later version.
+ +
+ +   The GNU C Library is distributed in the hope that it will be useful,
+ +   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ +   Lesser General Public License for more details.
+ +
+ +   You should have received a copy of the GNU Lesser General Public
+ +   License along with the GNU C Library; if not, see
+ +   <https://www.gnu.org/licenses/>.  */
+ +
+ +#if IS_IN (libc)
+ +
+ +# include <sysdep.h>
+ +
+ +# ifndef STRCHR
+ +#  define STRCHR     __strchr_evex
+ +# endif
+ +
+ +# define VMOVU               vmovdqu64
+ +# define VMOVA               vmovdqa64
+ +
+ +# ifdef USE_AS_WCSCHR
+ +#  define VPBROADCAST        vpbroadcastd
+ +#  define VPCMP              vpcmpd
+ +#  define VPMINU     vpminud
+ +#  define CHAR_REG   esi
+ +#  define SHIFT_REG  r8d
+ +# else
+ +#  define VPBROADCAST        vpbroadcastb
+ +#  define VPCMP              vpcmpb
+ +#  define VPMINU     vpminub
+ +#  define CHAR_REG   sil
+ +#  define SHIFT_REG  ecx
+ +# endif
+ +
+ +# define XMMZERO     xmm16
+ +
+ +# define YMMZERO     ymm16
+ +# define YMM0                ymm17
+ +# define YMM1                ymm18
+ +# define YMM2                ymm19
+ +# define YMM3                ymm20
+ +# define YMM4                ymm21
+ +# define YMM5                ymm22
+ +# define YMM6                ymm23
+ +# define YMM7                ymm24
+ +# define YMM8                ymm25
+ +
+ +# define VEC_SIZE 32
+ +# define PAGE_SIZE 4096
+ +
+ +     .section .text.evex,"ax",@progbits
+ +ENTRY (STRCHR)
+ +     movl    %edi, %ecx
+ +# ifndef USE_AS_STRCHRNUL
+ +     xorl    %edx, %edx
+ +# endif
+ +
+ +     /* Broadcast CHAR to YMM0.      */
+ +     VPBROADCAST %esi, %YMM0
+ +
+ +     vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
+ +
+ +     /* Check if we cross page boundary with one vector load.  */
+ +     andl    $(PAGE_SIZE - 1), %ecx
+ +     cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
+ +     ja  L(cross_page_boundary)
+ +
+ +     /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ +        null bytes.  */
+ +     VMOVU   (%rdi), %YMM1
+ +
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     ktestd  %k0, %k0
+ +     jz      L(more_vecs)
+ +     kmovd   %k0, %eax
+ +     tzcntl  %eax, %eax
+ +     /* Found CHAR or the null byte.  */
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (%rdi, %rax, 4), %rax
+ +# else
+ +     addq    %rdi, %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(more_vecs):
+ +     /* Align data for aligned loads in the loop.  */
+ +     andq    $-VEC_SIZE, %rdi
+ +L(aligned_more):
+ +
+ +     /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ +        since data is only aligned to VEC_SIZE.      */
+ +     VMOVA   VEC_SIZE(%rdi), %YMM1
+ +     addq    $VEC_SIZE, %rdi
+ +
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x0)
+ +
+ +     VMOVA   VEC_SIZE(%rdi), %YMM1
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1)
+ +
+ +     VMOVA   (VEC_SIZE * 2)(%rdi), %YMM1
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x2)
+ +
+ +     VMOVA   (VEC_SIZE * 3)(%rdi), %YMM1
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     ktestd  %k0, %k0
+ +     jz      L(prep_loop_4x)
+ +
+ +     kmovd   %k0, %eax
+ +     tzcntl  %eax, %eax
+ +     /* Found CHAR or the null byte.  */
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+ +# else
+ +     leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x0):
+ +     tzcntl  %eax, %eax
+ +     /* Found CHAR or the null byte.  */
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (%rdi, %rax, 4), %rax
+ +# else
+ +     addq    %rdi, %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x1):
+ +     tzcntl  %eax, %eax
+ +     /* Found CHAR or the null byte.  */
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    VEC_SIZE(%rdi, %rax, 4), %rax
+ +# else
+ +     leaq    VEC_SIZE(%rdi, %rax), %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +     .p2align 4
+ +L(first_vec_x2):
+ +     tzcntl  %eax, %eax
+ +     /* Found CHAR or the null byte.  */
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+ +# else
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +L(prep_loop_4x):
+ +     /* Align data to 4 * VEC_SIZE.  */
+ +     andq    $-(VEC_SIZE * 4), %rdi
+ +
+ +     .p2align 4
+ +L(loop_4x_vec):
+ +     /* Compare 4 * VEC at a time forward.  */
+ +     VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
+ +     VMOVA   (VEC_SIZE * 5)(%rdi), %YMM2
+ +     VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
+ +     VMOVA   (VEC_SIZE * 7)(%rdi), %YMM4
+ +
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM5
+ +     vpxorq  %YMM2, %YMM0, %YMM6
+ +     vpxorq  %YMM3, %YMM0, %YMM7
+ +     vpxorq  %YMM4, %YMM0, %YMM8
+ +
+ +     VPMINU  %YMM5, %YMM1, %YMM5
+ +     VPMINU  %YMM6, %YMM2, %YMM6
+ +     VPMINU  %YMM7, %YMM3, %YMM7
+ +     VPMINU  %YMM8, %YMM4, %YMM8
+ +
+ +     VPMINU  %YMM5, %YMM6, %YMM1
+ +     VPMINU  %YMM7, %YMM8, %YMM2
+ +
+ +     VPMINU  %YMM1, %YMM2, %YMM1
+ +
+ +     /* Each bit in K0 represents a CHAR or a null byte.  */
+ +     VPCMP   $0, %YMMZERO, %YMM1, %k0
+ +
+ +     addq    $(VEC_SIZE * 4), %rdi
+ +
+ +     ktestd  %k0, %k0
+ +     jz      L(loop_4x_vec)
+ +
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM5, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x0)
+ +
+ +     /* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+ +     VPCMP   $0, %YMMZERO, %YMM6, %k1
+ +     kmovd   %k1, %eax
+ +     testl   %eax, %eax
+ +     jnz     L(first_vec_x1)
+ +
+ +     /* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+ +     VPCMP   $0, %YMMZERO, %YMM7, %k2
+ +     /* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+ +     VPCMP   $0, %YMMZERO, %YMM8, %k3
+ +
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Each bit in K2/K3 represents 4-byte element.  */
+ +     kshiftlw $8, %k3, %k1
+ +# else
+ +     kshiftlq $32, %k3, %k1
+ +# endif
+ +
+ +     /* Each bit in K1 represents a NULL or a mismatch.  */
+ +     korq    %k1, %k2, %k1
+ +     kmovq   %k1, %rax
+ +
+ +     tzcntq  %rax, %rax
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+ +# else
+ +     leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +     /* Cold case for crossing page with first load.  */
+ +     .p2align 4
+ +L(cross_page_boundary):
+ +     andq    $-VEC_SIZE, %rdi
+ +     andl    $(VEC_SIZE - 1), %ecx
+ +
+ +     VMOVA   (%rdi), %YMM1
+ +
+ +     /* Leaves only CHARS matching esi as 0.  */
+ +     vpxorq  %YMM1, %YMM0, %YMM2
+ +     VPMINU  %YMM2, %YMM1, %YMM2
+ +     /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ +     VPCMP   $0, %YMMZERO, %YMM2, %k0
+ +     kmovd   %k0, %eax
+ +     testl   %eax, %eax
+ +
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ +        bytes.  */
+ +     movl    %ecx, %SHIFT_REG
+ +     sarl    $2, %SHIFT_REG
+ +# endif
+ +
+ +     /* Remove the leading bits.      */
+ +     sarxl   %SHIFT_REG, %eax, %eax
+ +     testl   %eax, %eax
+ +
+ +     jz      L(aligned_more)
+ +     tzcntl  %eax, %eax
+ +     addq    %rcx, %rdi
+ +# ifdef USE_AS_WCSCHR
+ +     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ +     leaq    (%rdi, %rax, 4), %rax
+ +# else
+ +     addq    %rdi, %rax
+ +# endif
+ +# ifndef USE_AS_STRCHRNUL
+ +     cmp (%rax), %CHAR_REG
+ +     cmovne  %rdx, %rax
+ +# endif
+ +     ret
+ +
+ +END (STRCHR)
+ +# endif
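The loop just quoted relies on a standard trick for locating either the search character or the terminating null in a single pass: each vector is XORed with a vector of the search character, the unsigned minimum of the original and the XOR result is taken, and a lane is zero exactly when it held the search character or a null byte. A minimal scalar C sketch of the same idea, one byte per "lane" (illustrative only, not part of the patch):

/* Scalar restatement of the vpxorq + VPMINU + VPCMP-against-zero trick:
   (x ^ c) is 0 when x == c, and min(x, x ^ c) is 0 exactly when x == c
   or x == 0, so one compare against zero finds both a match and the NUL.  */
static const char *
find_char_or_nul (const char *s, unsigned char c)
{
  for (;; s++)
    {
      unsigned char x = (unsigned char) *s;
      unsigned char m = x ^ c;
      unsigned char lane = m < x ? m : x;   /* VPMINU, per lane */
      if (lane == 0)                        /* VPCMP $0 against zero */
        return s;
    }
}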
+ diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+ index f27980dd36..a04ac8eb1d 100644
+ --- a/sysdeps/x86_64/multiarch/strchr.c
+ +++ b/sysdeps/x86_64/multiarch/strchr.c
+ @@ -29,16 +29,28 @@
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+  
+  static inline void *
+  IFUNC_SELECTOR (void)
+  {
+    const struct cpu_features* cpu_features = __get_cpu_features ();
   
-  #undef INTERNAL_SYSCALL_DECL
- -#define INTERNAL_SYSCALL_DECL(err) long err __attribute__ ((unused))
- +#define INTERNAL_SYSCALL_DECL(err) long int err __attribute__ ((unused))
+ -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ -      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ +  if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ -    return OPTIMIZE (avx2);
+ +    {
+ +      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
+ +       && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
+ +       && CPU_FEATURES_CPU_P (cpu_features, BMI2))
+ +     return OPTIMIZE (evex);
+ +
+ +      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
+ +     return OPTIMIZE (avx2_rtm);
+ +
+ +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ +     return OPTIMIZE (avx2);
+ +    }
   
-  #undef INTERNAL_SYSCALL_ERROR_P
- -#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long) (err))
- +#define INTERNAL_SYSCALL_ERROR_P(val, err)   ((void) (val), (long int) (err))
+    if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
+      return OPTIMIZE (sse2_no_bsf);
+ diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..cdcf818b91
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S
+ @@ -0,0 +1,3 @@
+ +#define STRCHR __strchrnul_avx2_rtm
+ +#define USE_AS_STRCHRNUL 1
+ +#include "strchr-avx2-rtm.S"
+ diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+ new file mode 100644
+ index 0000000000..064fe7ca9e
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
+ @@ -0,0 +1,3 @@
+ +#define STRCHR __strchrnul_evex
+ +#define USE_AS_STRCHRNUL 1
+ +#include "strchr-evex.S"
+ diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+ new file mode 100644
+ index 0000000000..aecd30d97f
+ --- /dev/null
+ +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S
+ @@ -0,0 +1,12 @@
+ +#ifndef STRCMP
+ +# define STRCMP __strcmp_avx2_rtm
+ +#endif
+ +
+ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ +  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+ +
+ +#define VZEROUPPER_RETURN jmp         L(return_vzeroupper)
+ +
+ +#define SECTION(p) p##.avx.rtm
+ +
+ +#include "strcmp-avx2.S"
+ diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+ index 48d03a9f46..4d434fd14e 100644
+ --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+ +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+ @@ -55,6 +55,10 @@
+  #  define VZEROUPPER vzeroupper
+  # endif
   
-  #undef INTERNAL_SYSCALL_ERRNO
-  #define INTERNAL_SYSCALL_ERRNO(val, err)     ((void) (err), val)
- @@ -108,13 +108,13 @@
+ +# ifndef SECTION
+ +#  define SECTION(p) p##.avx
+ +# endif
+ +
+  /* Warning!
+             wcscmp/wcsncmp have to use SIGNED comparison for elements.
+             strcmp/strncmp have to use UNSIGNED comparison for elements.
+ @@ -75,7 +79,7 @@
+     the maximum offset is reached before a difference is found, zero is
+     returned.  */
+  
+ -     .section .text.avx,"ax",@progbits
+ +     .section SECTION(.text),"ax",@progbits
+  ENTRY (STRCMP)
+  # ifdef USE_AS_STRNCMP
+       /* Check for simple cases (0 or 1) in offset.  */
+ @@ -83,6 +87,16 @@ ENTRY (STRCMP)
+       je      L(char0)
+       jb      L(zero)
+  #  ifdef USE_AS_WCSCMP
+ +#  ifndef __ILP32__
+ +     movq    %rdx, %rcx
+ +     /* Check if the length could overflow when multiplied by
+ +        sizeof(wchar_t).  Checking the top 8 bits covers all potential
+ +        overflow cases as well as cases where it is impossible for the
+ +        length to bound a valid memory region.  In these cases just use
+ +        'wcscmp'.  */
+ +     shrq    $56, %rcx
+ +     jnz     OVERFLOW_STRCMP
+ +#  endif
+       /* Convert units: from wide to byte char.  */
+       shl     $2, %RDX_LP
+  #  endif
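The added #ifndef __ILP32__ block guards the wide-character count before it is converted to a byte count: if any of the top 8 bits of the 64-bit length are set, the shl $2 conversion could overflow or the length cannot describe a real memory region anyway, so the code jumps to the unbounded wcscmp instead. A short C restatement of the guard (illustrative only):

#include <stdint.h>

/* Mirror of the shrq $56 / jnz OVERFLOW_STRCMP check: counts with any of
   the top 8 bits set (n >= 2^56) either overflow when converted to a byte
   count (n * sizeof (wchar_t)) or cannot bound a valid region, so the
   code falls back to the unbounded comparison.  */
static int
needs_unbounded_fallback (uint64_t n_wchars)
{
  return (n_wchars >> 56) != 0;
}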
+ @@ -127,8 +141,8 @@ L(return):
+       movzbl  (%rsi, %rdx), %edx
+       subl    %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +L(return_vzeroupper):
+ +     ZERO_UPPER_VEC_REGISTERS_RETURN
   
-  #define internal_syscall0(v0_init, input, number, err, dummy...)     \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -131,14 +131,15 @@
+       .p2align 4
+  L(return_vec_size):
+ @@ -161,8 +175,7 @@ L(return_vec_size):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-  #define internal_syscall1(v0_init, input, number, err, arg1)         \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -155,15 +156,17 @@
+       .p2align 4
+  L(return_2_vec_size):
+ @@ -195,8 +208,7 @@ L(return_2_vec_size):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-  #define internal_syscall2(v0_init, input, number, err, arg1, arg2)   \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -181,16 +184,19 @@
-  #define internal_syscall3(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3)                             \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7");                                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7");                              \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -208,16 +214,20 @@
-  #define internal_syscall4(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4)                       \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     long int _arg4 = (long int) (arg4);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7") = (long) (arg4);                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7") = _arg4;                      \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -235,17 +245,22 @@
-  #define internal_syscall5(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5)                 \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     long int _arg4 = (long int) (arg4);                             \
- +     long int _arg5 = (long int) (arg5);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7") = (long) (arg4);                  \
- -     register long __a4 asm ("$8") = (long) (arg5);                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7") = _arg4;                      \
- +     register long int __a4 asm ("$8") = _arg5;                      \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
- @@ -263,18 +278,24 @@
-  #define internal_syscall6(v0_init, input, number, err,                       \
-                         arg1, arg2, arg3, arg4, arg5, arg6)           \
-  ({                                                                   \
- -     long _sys_result;                                               \
- +     long int _sys_result;                                           \
-                                                                       \
-       {                                                               \
- -     register long __s0 asm ("$16") __attribute__ ((unused))         \
- +     long int _arg1 = (long int) (arg1);                             \
- +     long int _arg2 = (long int) (arg2);                             \
- +     long int _arg3 = (long int) (arg3);                             \
- +     long int _arg4 = (long int) (arg4);                             \
- +     long int _arg5 = (long int) (arg5);                             \
- +     long int _arg6 = (long int) (arg6);                             \
- +     register long int __s0 asm ("$16") __attribute__ ((unused))     \
-         = (number);                                                   \
- -     register long __v0 asm ("$2");                                  \
- -     register long __a0 asm ("$4") = (long) (arg1);                  \
- -     register long __a1 asm ("$5") = (long) (arg2);                  \
- -     register long __a2 asm ("$6") = (long) (arg3);                  \
- -     register long __a3 asm ("$7") = (long) (arg4);                  \
- -     register long __a4 asm ("$8") = (long) (arg5);                  \
- -     register long __a5 asm ("$9") = (long) (arg6);                  \
- +     register long int __v0 asm ("$2");                              \
- +     register long int __a0 asm ("$4") = _arg1;                      \
- +     register long int __a1 asm ("$5") = _arg2;                      \
- +     register long int __a2 asm ("$6") = _arg3;                      \
- +     register long int __a3 asm ("$7") = _arg4;                      \
- +     register long int __a4 asm ("$8") = _arg5;                      \
- +     register long int __a5 asm ("$9") = _arg6;                      \
-       __asm__ volatile (                                              \
-       ".set\tnoreorder\n\t"                                           \
-       v0_init                                                         \
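Besides spelling long as long int, the reworked macros above evaluate every argument into a plain local (_arg1 ... _arg6) before the explicit register variables are declared. GCC only promises to keep local register variables live into the asm that consumes them, so evaluating a complex argument expression (one containing a function call, say) after those declarations could clobber the chosen registers; hoisting the evaluation sidesteps that. A stripped-down, single-argument sketch of the shape (MIPS register names, so it only compiles for a MIPS target; the syscall asm itself is elided):

/* Sketch of the reworked internal_syscall1 layout: the argument is
   evaluated first, then copied into the register variable that the
   (elided) asm statement would consume.  */
#define sketch_syscall1(number, arg1)                              \
  ({                                                               \
    long int _sys_result;                                          \
    long int _arg1 = (long int) (arg1);      /* evaluated early */ \
    register long int __v0 asm ("$2") = (number);                  \
    register long int __a0 asm ("$4") = _arg1;                     \
    /* asm volatile ("syscall" : "+r" (__v0) : "r" (__a0)); */     \
    _sys_result = __v0;                                            \
    _sys_result;                                                   \
  })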
- diff --git a/sysdeps/unix/sysv/linux/mips/mips64/syscall.S b/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
- index 26adf2cd04..a9baff3c17 100644
- --- a/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
- +++ b/sysdeps/unix/sysv/linux/mips/mips64/syscall.S
- @@ -20,7 +20,7 @@
-  #include <sys/asm.h>
+       .p2align 4
+  L(return_3_vec_size):
+ @@ -229,8 +241,7 @@ L(return_3_vec_size):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(next_3_vectors):
+ @@ -356,8 +367,7 @@ L(back_to_loop):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(test_vec):
+ @@ -400,8 +410,7 @@ L(test_vec):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(test_2_vec):
+ @@ -444,8 +453,7 @@ L(test_2_vec):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(test_3_vec):
+ @@ -486,8 +494,7 @@ L(test_3_vec):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-  /* Usage:
- -   long syscall (syscall_number, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
- +   long int syscall (syscall_number, arg1, arg2, arg3, arg4, arg5, arg6, arg7)
+       .p2align 4
+  L(loop_cross_page):
+ @@ -556,8 +563,7 @@ L(loop_cross_page):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
   
-     We need to do some arg shifting, syscall_number will be in v0.  */
+       .p2align 4
+  L(loop_cross_page_2_vec):
+ @@ -591,7 +597,14 @@ L(loop_cross_page_2_vec):
+       movl    $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
   
- diff --git a/sysdeps/unix/sysv/linux/mips/sysdep.h b/sysdeps/unix/sysv/linux/mips/sysdep.h
- index cdfc0b1b58..a4cf1540fe 100644
- --- a/sysdeps/unix/sysv/linux/mips/sysdep.h
- +++ b/sysdeps/unix/sysv/linux/mips/sysdep.h
- @@ -36,8 +36,8 @@
-     the INTERNAL_SYSCALL_{ERROR_P,ERRNO} macros work correctly.  */
-  #define INTERNAL_VSYSCALL_CALL(funcptr, err, nr, args...)            \
-    ({                                                                 \
- -    long _ret = funcptr (args);                                              \
- -    err = ((unsigned long) (_ret) >= (unsigned long) -4095L);                \
- +    long int _ret = funcptr (args);                                  \
- +    err = ((unsigned long int) (_ret) >= (unsigned long int) -4095L);        \
-      if (err)                                                         \
-        _ret = -_ret;                                                  \
-      _ret;                                                            \
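The comparison above follows the standard Linux convention for raw syscall (and vsyscall) return values: anything in [-4095, -1] is a negated errno, everything else is a successful result. A small C restatement of the decoding (illustrative only):

/* Decode a raw Linux syscall/vsyscall return value: values in
   [-4095, -1] encode -errno; everything else is the real result.  */
static long int
decode_sys_result (long int ret, int *is_error)
{
  *is_error = (unsigned long int) ret >= (unsigned long int) -4095L;
  return *is_error ? -ret : ret;
}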
- diff --git a/sysdeps/unix/sysv/linux/mips/unwind-arch.h b/sysdeps/unix/sysv/linux/mips/unwind-arch.h
+       testq   %rdi, %rdi
+ +# ifdef USE_AS_STRNCMP
+ +     /* At this point, if %rdi is 0, VEC_SIZE*4+%r10 bytes starting
+ +        from %rax have already been tested.  This label checks whether
+ +        the strncmp maximum offset has been reached.  */
+ +     je      L(string_nbyte_offset_check)
+ +# else
+       je      L(back_to_loop)
+ +# endif
+       tzcntq  %rdi, %rcx
+       addq    %r10, %rcx
+       /* Adjust for number of bytes skipped.  */
+ @@ -624,8 +637,15 @@ L(loop_cross_page_2_vec):
+       subl    %edx, %eax
+  #  endif
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+ +
+ +# ifdef USE_AS_STRNCMP
+ +L(string_nbyte_offset_check):
+ +     leaq    (VEC_SIZE * 4)(%r10), %r10
+ +     cmpq    %r10, %r11
+ +     jbe     L(zero)
+ +     jmp     L(back_to_loop)
+ +# endif
+  
+       .p2align 4
+  L(cross_page_loop):
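The new L(string_nbyte_offset_check) path matters only for strncmp: when a whole VEC_SIZE * 4 block compares equal, the code must check whether the caller-supplied maximum offset has been reached before re-entering the main loop, and return 0 (equal) if it has. A rough scalar restatement under that reading of %r10/%r11 (hypothetical names, illustrative only):

/* Rough restatement of the added bound check: 'checked' plays the role of
   %r10 (offset already compared) and 'limit' that of %r11 (strncmp bound).  */
enum { VEC_SIZE = 32 };

static int
limit_reached (unsigned long checked, unsigned long limit)
{
  checked += VEC_SIZE * 4;      /* leaq (VEC_SIZE * 4)(%r10), %r10 */
  return limit <= checked;      /* cmpq %r10, %r11; jbe L(zero)    */
}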
+ @@ -659,8 +679,7 @@ L(cross_page_loop):
+  # ifndef USE_AS_WCSCMP
+  L(different):
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+  # ifdef USE_AS_WCSCMP
+       .p2align 4
+ @@ -670,16 +689,14 @@ L(different):
+       setl    %al
+       negl    %eax
+       orl     $1, %eax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  # endif
+  
+  # ifdef USE_AS_STRNCMP
+       .p2align 4
+  L(zero):
+       xorl    %eax, %eax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       .p2align 4
+  L(char0):
+ @@ -693,8 +710,7 @@ L(char0):
+       movzbl  (%rdi), %eax
+       subl    %ecx, %eax
+  #  endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  # endif
+  
+       .p2align 4
+ @@ -719,8 +735,7 @@ L(last_vector):
+       movzbl  (%rsi, %rdx), %edx
+       subl    %edx, %eax
+  # endif
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  
+       /* Comparing on a page boundary region requires special treatment:
+          it must be done one vector at a time, starting with the wider
+ @@ -841,7 +856,6 @@ L(cross_page_4bytes):
+       testl   %eax, %eax
+       jne     L(cross_page_loop)
+       subl    %ecx, %eax
+ -     VZEROUPPER
+ -     ret
+ +     VZEROUPPER_RETURN
+  END (STRCMP)
+  #endif
+ diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
  new file mode 100644
- index 0000000000..a009899983
+ index 0000000000..459eeed09f
  --- /dev/null
- +++ b/sysdeps/unix/sysv/linux/mips/unwind-arch.h
- @@ -0,0 +1,67 @@
- +/* Return backtrace of current program state.  Arch-specific bits.
- +   Copyright (C) 2020 Free Software Foundation, Inc.
+ +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+ @@ -0,0 +1,1043 @@
+ +/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
+ +   Copyright (C) 2021 Free Software Foundation, Inc.
  +   This file is part of the GNU C Library.
  +
  +   The GNU C Library is free software; you can redistribute it and/or