GIT update of https://sourceware.org/git/glibc.git/release/2.35/master from glibc-2.35
Gbp-Pq: Name git-updates.diff
library will still be usable, but functionality may be lost--for
example, you can't build a shared libc with old binutils.
+'--with-default-link'
+ With '--with-default-link', the build system does not use a custom
+ linker script for linking shared objects. The default is
+ '--without-default-link', because the custom linker script is
+ needed for full RELRO protection.
+
'--with-nonshared-cflags=CFLAGS'
Use additional compiler flags CFLAGS to build the parts of the
library which are always statically linked into applications and
Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
using `glibc' in the "product" field.
+\f
+Version 2.35.1
+
+The following bugs are resolved with this release:
+
+ [12154] Do not fail DNS resolution for CNAMEs which are not host names
+ [25812] Libio vtable protection is sometimes only partially enforced
+ [28838] FAIL: elf/tst-p_align3
+ [28846] CMSG_NXTHDR may trigger -Wstrict-overflow warning
+ [28850] linux: __get_nprocs_sched reads uninitialized memory from the stack
+ [28853] libc: tst-spawn6 changes current foreground process group
+ (breaks test isolation)
+ [28857] libc: FAIL: elf/tst-audit24a
+ [28860] build: --enable-kernel=5.1.0 build fails because of missing
+ __convert_scm_timestamps
+ [28865] linux: _SC_NPROCESSORS_CONF and _SC_NPROCESSORS_ONLN are inaccurate
+ without /sys and /proc
+ [28868] dynamic-link: Dynamic loader DFS algorithm segfaults on
+ missing libraries
+ [28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm
+ variants when avoiding overflow
+ [28937] New DSO dependency sorter does not put new map first if in a cycle
+ [28953] nss: Protect against errno changes in function lookup
+ [29029] nptl: poll() spuriously returns EINTR during thread
+ cancellation and with cancellation disabled
+ [29062] elf: Fix memory leak in _dl_find_object_update
+ [29078] <dlfcn.h> functions unusable during early auditing
+ [29097] time: fchmodat does not handle 64 bit time_t for
+ AT_SYMLINK_NOFOLLOW
+ [29109] libc: posix_spawn() always returns 1 (EPERM) on clone()
+ failure
+ [29165] libc: [Regression] broken argv adjustment
+ [29187] dynamic-link: [regression] broken argv adjustment for nios2
+ [29203] libc: daemon is not y2038 aware
+ [29204] libc: getusershell is not 2038 aware
+ [29207] libc: posix_fallocate fallback implementation is not y2038 aware
+ [29208] libc: fpathconf(_PC_ASYNC_IO) is not y2038 aware
+ [29209] libc: isfdtype is not y2038 aware
+ [29210] network: ruserpass is not y2038 aware
+ [29211] libc: __open_catalog is not y2038 aware
+ [29213] libc: gconv_parseconfdir is not y2038 aware
+ [29214] nptl: pthread_setcanceltype fails to set type
+ [29225] network: Mistyped define statement in socket/sys/socket.h in
+ line 184
+ [29305] Conserve NSS buffer space during DNS packet parsing
+ [29415] nscd: Fix netlink cache invalidation if epoll is used
+ [29446] _dlopen now ignores dl_caller argument in static mode
+ [29490] alpha: New __brk_call implementation is broken
+ [29528] elf: Call __libc_early_init for reused namespaces
+ [29537] libc: [2.34 regression]: Alignment issue on m68k when using
+ [29583] Use 64-bit interfaces in gconv_parseconfdir
+
\f
Version 2.35
fortification balanced against additional runtime cost (checking non-constant
bounds).
+* The audit libraries will avoid an unnecessary slowdown if PLT tracking is
+  not required (that is, if they do not implement the la_pltenter or
+  la_pltexit callbacks).
+
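A minimal sketch of such an audit module (illustrative only; the la_*
interface and LAV_CURRENT come from <link.h>) defines only the mandatory
la_version callback and therefore omits la_pltenter and la_pltexit:

   #include <link.h>

   unsigned int
   la_version (unsigned int version)
   {
     /* Accept whatever auditing interface version the dynamic linker
        offers.  */
     return LAV_CURRENT;
   }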
Deprecated and removed features, and other changes affecting compatibility:
* On x86-64, the LD_PREFER_MAP_32BIT_EXEC environment variable support
[28837] libc: FAIL: socket/tst-socket-timestamp-compat
[28847] locale: Empty mon_decimal_point in LC_MONETARY results in non-
empty mon_decimal_point_wc
+ [29069] libc: fstatat64_time64_statx wrapper broken on MIPS N32 with
+ -D_FILE_OFFSET_BITS=64 and -D_TIME_BITS=64
\f
Version 2.34
execute programs that do not have any dynamic dependency (that is,
they are statically linked). This feature is Linux-specific.
-* The audit libraries will avoid unnecessary slowdown if it is not required
- PLT tracking (by not implementing the la_pltenter or la_pltexit callbacks).
-
Deprecated and removed features, and other changes affecting compatibility:
* The function pthread_mutex_consistent_np has been deprecated; programs
+ CMSG_ALIGN (sizeof (struct cmsghdr)))
#define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
+/* Given a length, return the additional padding necessary such that
+ len + __CMSG_PADDING(len) == CMSG_ALIGN (len). */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+ - ((len) & (sizeof (size_t) - 1))) \
+ & (sizeof (size_t) - 1))
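+/* For example, on a target where sizeof (size_t) == 8, CMSG_ALIGN (13)
+   is 16 and __CMSG_PADDING (13) is 3, so 13 + __CMSG_PADDING (13)
+   == CMSG_ALIGN (13); for an already aligned length,
+   __CMSG_PADDING (16) is 0.  */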
+
extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
struct cmsghdr *__cmsg) __THROW;
#ifdef __USE_EXTERN_INLINES
_EXTERN_INLINE struct cmsghdr *
__NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
{
+ /* We may safely assume that __cmsg lies between __mhdr->msg_control and
+ __mhdr->msg_controllen because the user is required to obtain the first
+ cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+ via CMSG_NXTHDR, setting lengths along the way. However, we don't yet
+ trust the value of __cmsg->cmsg_len and therefore do not use it in any
+ pointer arithmetic until we check its value. */
+
+ unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+ unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+ size_t __size_needed = sizeof (struct cmsghdr)
+ + __CMSG_PADDING (__cmsg->cmsg_len);
+
+ /* The current header is malformed, too small to be a full header. */
if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
- /* The kernel header does this so there may be a reason. */
return (struct cmsghdr *) 0;
+ /* There isn't enough space between __cmsg and the end of the buffer to
+ hold the current cmsg *and* the next one. */
+ if (((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+ < __size_needed)
+ || ((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+ - __size_needed)
+ < __cmsg->cmsg_len))
+
+ return (struct cmsghdr *) 0;
+
+ /* Now, we trust cmsg_len and can use it to find the next header. */
__cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
+ CMSG_ALIGN (__cmsg->cmsg_len));
- if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
- + __mhdr->msg_controllen)
- || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
- > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
- /* No more entries. */
- return (struct cmsghdr *) 0;
return __cmsg;
}
#endif /* Use `extern inline'. */
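
A minimal usage sketch of the interface these macros implement (the helper
name is made up for illustration; the msghdr is assumed to have been filled
in by a prior recvmsg call):

   #include <sys/socket.h>

   static void
   walk_cmsgs (struct msghdr *msg)
   {
     /* Obtain the first control-message header with CMSG_FIRSTHDR and
        advance with CMSG_NXTHDR; the loop ends when no further
        well-formed header fits inside msg_control.  */
     for (struct cmsghdr *cmsg = CMSG_FIRSTHDR (msg); cmsg != NULL;
          cmsg = CMSG_NXTHDR (msg, cmsg))
       {
         /* cmsg->cmsg_level and cmsg->cmsg_type identify the payload;
            CMSG_DATA (cmsg) points to it.  */
         (void) CMSG_DATA (cmsg);
       }
   }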
__nl_catd catalog)
{
int fd = -1;
- struct stat64 st;
+ struct __stat64_t64 st;
int swapping;
size_t cnt;
size_t max_offset;
return -1;
}
- if (__builtin_expect (__fstat64 (fd, &st), 0) < 0)
+ if (__glibc_unlikely (__fstat64_time64 (fd, &st) < 0))
goto close_unlock_return;
if (__builtin_expect (!S_ISREG (st.st_mode), 0)
docdir
oldincludedir
includedir
-runstatedir
localstatedir
sharedstatedir
sysconfdir
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
-runstatedir='${localstatedir}/run'
includedir='${prefix}/include'
oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;
- -runstatedir | --runstatedir | --runstatedi | --runstated \
- | --runstate | --runstat | --runsta | --runst | --runs \
- | --run | --ru | --r)
- ac_prev=runstatedir ;;
- -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
- | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
- | --run=* | --ru=* | --r=*)
- runstatedir=$ac_optarg ;;
-
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
- libdir localedir mandir runstatedir
+ libdir localedir mandir
do
eval ac_val=\$$ac_var
# Remove trailing slashes.
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
- --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include]
if test "${with_default_link+set}" = set; then :
withval=$with_default_link; use_default_link=$withval
else
- use_default_link=default
+ use_default_link=no
fi
$as_echo "$libc_cv_hashstyle" >&6; }
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5
-$as_echo_n "checking for sufficient default -shared layout... " >&6; }
-if ${libc_cv_use_default_link+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- libc_cv_use_default_link=no
- cat > conftest.s <<\EOF
- .section .note.a,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "foo"
- .section .note.b,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "bar"
-EOF
- if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5'
- { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
- test $ac_status = 0; }; } &&
- ac_try=`$READELF -S conftest.so | sed -n \
- '${x;p;}
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
- t a
- b
- : a
- H'`
- then
- libc_seen_a=no libc_seen_b=no
- set -- $ac_try
- while test $# -ge 2 -a "$1" = NOTE; do
- case "$2" in
- .note.a) libc_seen_a=yes ;;
- .note.b) libc_seen_b=yes ;;
- esac
- shift 2
- done
- case "$libc_seen_a$libc_seen_b" in
- yesyes)
- libc_cv_use_default_link=yes
- ;;
- *)
- echo >&5 "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
- ;;
- esac
- fi
- rm -f conftest*
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5
-$as_echo "$libc_cv_use_default_link" >&6; }
- use_default_link=$libc_cv_use_default_link
-fi
-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5
$as_echo_n "checking for GLOB_DAT reloc... " >&6; }
if ${libc_cv_has_glob_dat+:} false; then :
AS_HELP_STRING([--with-default-link],
[do not use explicit linker scripts]),
[use_default_link=$withval],
- [use_default_link=default])
+ [use_default_link=no])
dnl Additional build flags injection.
AC_ARG_WITH([nonshared-cflags],
rm -f conftest*])
AC_SUBST(libc_cv_hashstyle)
-# The linker's default -shared behavior is good enough if it
-# does these things that our custom linker scripts ensure that
-# all allocated NOTE sections come first.
-if test "$use_default_link" = default; then
- AC_CACHE_CHECK([for sufficient default -shared layout],
- libc_cv_use_default_link, [dnl
- libc_cv_use_default_link=no
- cat > conftest.s <<\EOF
- .section .note.a,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "foo"
- .section .note.b,"a",%note
- .balign 4
- .long 4,4,9
- .string "GNU"
- .string "bar"
-EOF
- if AC_TRY_COMMAND([dnl
- ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) &&
- ac_try=`$READELF -S conftest.so | sed -n \
- ['${x;p;}
- s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/
- t a
- b
- : a
- H']`
- then
- libc_seen_a=no libc_seen_b=no
- set -- $ac_try
- while test $# -ge 2 -a "$1" = NOTE; do
- case "$2" in
- .note.a) libc_seen_a=yes ;;
- .note.b) libc_seen_b=yes ;;
- esac
- shift 2
- done
- case "$libc_seen_a$libc_seen_b" in
- yesyes)
- libc_cv_use_default_link=yes
- ;;
- *)
- echo >&AS_MESSAGE_LOG_FD "\
-$libc_seen_a$libc_seen_b from:
-$ac_try"
- ;;
- esac
- fi
- rm -f conftest*])
- use_default_link=$libc_cv_use_default_link
-fi
-
AC_CACHE_CHECK(for GLOB_DAT reloc,
libc_cv_has_glob_dat, [dnl
cat > conftest.c <<EOF
}
}
- /* Initialize very early so that tunables can use it. */
- __libc_init_secure ();
-
__tunables_init (__environ);
ARCH_INIT_CPU_FEATURES ();
_dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign
and dl_tls_static_align. */
tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align);
- tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+ tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align);
+ if (tlsblock == NULL)
+ _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
#elif TLS_DTV_AT_TP
tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1);
- tlsblock = __sbrk (tcb_offset + memsz + max_align
- + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus));
+ tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align
+ + TLS_PRE_TCB_SIZE
+ + GLRO(dl_tls_static_surplus));
+ if (tlsblock == NULL)
+ _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
tlsblock += TLS_PRE_TCB_SIZE;
#else
/* In case a model with a different layout for the TCB and DTV
CHK_FAIL_END
#endif
+ /* Bug 29030 regression check.  */
+ cp = "HelloWorld";
+ if (mbsrtowcs (NULL, &cp, (size_t)-1, &s) != 10)
+ FAIL ();
+
cp = "A";
if (mbstowcs (wenough, cp, 10) != 1
|| wcscmp (wenough, L"A") != 0)
tststatic4-ENV = $(tststatic-ENV)
tststatic5-ENV = $(tststatic-ENV)
+tests-internal += \
+ tst-dlinfo-phdr \
+ # tests-internal
+
ifneq (,$(CXX))
modules-names += bug-atexit3-lib
else
__dladdr (const void *address, Dl_info *info)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dladdr (address, info);
#endif
return _dl_addr (address, info, NULL, NULL);
__dladdr1 (const void *address, Dl_info *info, void **extra, int flags)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags);
#endif
__dlclose (void *handle)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlclose (handle);
#endif
__dlerror (void)
{
# ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlerror ();
# endif
segment, or if the calling thread has not allocated a block for it. */
RTLD_DI_TLS_DATA = 10,
- RTLD_DI_MAX = 10
+ /* Treat ARG as const ElfW(Phdr) **, and store the address of the
+ program header array at that location. The dlinfo call returns
+ the number of program headers in the array. */
+ RTLD_DI_PHDR = 11,
+
+ RTLD_DI_MAX = 11
};
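
A minimal calling sketch for the new request (the handle is assumed to come
from dlopen; the function name is made up for illustration):

   #define _GNU_SOURCE 1
   #include <dlfcn.h>
   #include <link.h>
   #include <stdio.h>

   static void
   show_phdrs (void *handle)
   {
     const ElfW(Phdr) *phdr;
     int phnum = dlinfo (handle, RTLD_DI_PHDR, &phdr);
     if (phnum < 0)
       fprintf (stderr, "dlinfo: %s\n", dlerror ());
     else
       printf ("%d program headers at %p\n", phnum, (const void *) phdr);
   }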
void *handle;
int request;
void *arg;
+
+ /* This is the value that is returned from dlinfo if no error is
+ signaled. */
+ int result;
};
static void
{
case RTLD_DI_CONFIGADDR:
default:
+ args->result = -1;
_dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request"));
break;
*(void **) args->arg = data;
break;
}
+
+ case RTLD_DI_PHDR:
+ *(const ElfW(Phdr) **) args->arg = l->l_phdr;
+ args->result = l->l_phnum;
+ break;
}
}
dlinfo_implementation (void *handle, int request, void *arg)
{
struct dlinfo_args args = { handle, request, arg };
- return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0;
+ _dlerror_run (&dlinfo_doit, &args);
+ return args.result;
}
#ifdef SHARED
int
___dlinfo (void *handle, int request, void *arg)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg);
else
return dlinfo_implementation (handle, request, arg);
void *
___dlmopen (Lmid_t nsid, const char *file, int mode)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0));
else
return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0));
void *
___dlopen (const char *file, int mode)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
else
return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
void *
__dlopen (const char *file, int mode, void *dl_caller)
{
- return dlopen_implementation (file, mode, RETURN_ADDRESS (0));
+ return dlopen_implementation (file, mode, dl_caller);
}
void *
mode |= RTLD_LAZY;
args.mode = mode;
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0));
return _dlerror_run (dlopen_doit, &args) ? NULL : args.new;
void *
___dlsym (void *handle, const char *name)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0));
else
return dlsym_implementation (handle, name, RETURN_ADDRESS (0));
void *
___dlvsym (void *handle, const char *name, const char *version)
{
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version,
RETURN_ADDRESS (0));
else
--- /dev/null
+/* Test for dlinfo (RTLD_DI_PHDR).
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/auxv.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+/* Used to verify that the program header array appears as expected
+ among the dl_iterate_phdr callback invocations. */
+
+struct dlip_callback_args
+{
+ struct link_map *l; /* l->l_addr is used to find the object. */
+ const ElfW(Phdr) *phdr; /* Expected program header pointer. */
+ int phnum; /* Expected program header count. */
+ bool found; /* True if l->l_addr has been found. */
+};
+
+static int
+dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure)
+{
+ TEST_COMPARE (sizeof (*dlpi), size);
+ struct dlip_callback_args *args = closure;
+
+ if (dlpi->dlpi_addr == args->l->l_addr)
+ {
+ TEST_VERIFY (!args->found);
+ args->found = true;
+ TEST_VERIFY (args->phdr == dlpi->dlpi_phdr);
+ TEST_COMPARE (args->phnum, dlpi->dlpi_phnum);
+ }
+
+ return 0;
+}
+
+static int
+do_test (void)
+{
+ /* Avoid a copy relocation. */
+ struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug");
+ struct link_map *l = (struct link_map *) debug->r_map;
+ TEST_VERIFY_EXIT (l != NULL);
+
+ do
+ {
+ printf ("info: checking link map %p (%p) for \"%s\"\n",
+ l, l->l_phdr, l->l_name);
+
+ /* Cause dlerror () to return an error message. */
+ dlsym (RTLD_DEFAULT, "does-not-exist");
+
+ /* Use the extension that link maps are valid dlopen handles. */
+ const ElfW(Phdr) *phdr;
+ int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr);
+ TEST_VERIFY (phnum >= 0);
+ /* Verify that the error message has been cleared. */
+ TEST_COMPARE_STRING (dlerror (), NULL);
+
+ TEST_VERIFY (phdr == l->l_phdr);
+ TEST_COMPARE (phnum, l->l_phnum);
+
+ /* Check that we can find PT_DYNAMIC in the array. */
+ {
+ bool dynamic_found = false;
+ for (int i = 0; i < phnum; ++i)
+ if (phdr[i].p_type == PT_DYNAMIC)
+ {
+ dynamic_found = true;
+ TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr);
+ }
+ TEST_VERIFY (dynamic_found);
+ }
+
+ /* Check that dl_iterate_phdr finds the link map with the same
+ program headers. */
+ {
+ struct dlip_callback_args args =
+ {
+ .l = l,
+ .phdr = phdr,
+ .phnum = phnum,
+ .found = false,
+ };
+ TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0);
+ TEST_VERIFY (args.found);
+ }
+
+ if (l->l_prev == NULL)
+ {
+ /* This is the executable, so the information is also
+ available via getauxval. */
+ TEST_COMPARE_STRING (l->l_name, "");
+ TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR));
+ TEST_COMPARE (phnum, getauxval (AT_PHNUM));
+ }
+
+ l = l->l_next;
+ }
+ while (l != NULL);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
$(all-dl-routines) \
dl-addr \
dl-addr-obj \
+ dl-early_allocate \
dl-error \
dl-iteratephdr \
dl-libc \
# But they are absent from the shared libc, because that code is in ld.so.
elide-routines.os = \
$(all-dl-routines) \
+ dl-early_allocate \
dl-exception \
dl-origin \
dl-reloc-static-pie \
CFLAGS-rtld.c += -fno-tree-loop-distribute-patterns
endif
+ifeq (yes,$(have-loop-to-function))
+# Likewise, during static library startup, memset is not yet available.
+CFLAGS-dl-support.c = -fno-tree-loop-distribute-patterns
+endif
+
# Compile rtld itself without stack protection.
# Also compile all routines in the static library that are elided from
# the shared libc because they are in libc.a in the same way.
tst-linkall-static \
tst-single_threaded-pthread-static \
tst-single_threaded-static \
+ tst-tls-allocation-failure-static \
tst-tlsalign-extern-static \
tst-tlsalign-static \
# tests-static-normal
tst-align \
tst-align2 \
tst-align3 \
+ tst-audit-tlsdesc \
+ tst-audit-tlsdesc-dlopen \
tst-audit1 \
tst-audit2 \
tst-audit8 \
tst-audit24d \
tst-audit25a \
tst-audit25b \
+ tst-audit26 \
tst-auditmany \
tst-auxobj \
tst-auxobj-dlopen \
tst-dlmopen4 \
tst-dlmopen-dlerror \
tst-dlmopen-gethostbyname \
+ tst-dlmopen-twice \
tst-dlopenfail \
tst-dlopenfail-2 \
tst-dlopenrpath \
endif
endif
+tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out
+$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+ $(objpfx)ld.so
+ $(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \
+ --required=_rtld_global_ro \
+ > $@ 2>&1; $(evaluate-test)
+# The optional symbols are present in libc only if the architecture has
+# the GLIBC_2.0 symbol set in libc.
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \
+ $(common-objpfx)libc.so
+ $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \
+ --required=_IO_cookie_jumps \
+ --required=_IO_file_jumps \
+ --required=_IO_file_jumps_maybe_mmap \
+ --required=_IO_file_jumps_mmap \
+ --required=_IO_helper_jumps \
+ --required=_IO_mem_jumps \
+ --required=_IO_obstack_jumps \
+ --required=_IO_proc_jumps \
+ --required=_IO_str_chk_jumps \
+ --required=_IO_str_jumps \
+ --required=_IO_strn_jumps \
+ --required=_IO_wfile_jumps \
+ --required=_IO_wfile_jumps_maybe_mmap \
+ --required=_IO_wfile_jumps_mmap \
+ --required=_IO_wmem_jumps \
+ --required=_IO_wstr_jumps \
+ --required=_IO_wstrn_jumps \
+ --optional=_IO_old_cookie_jumps \
+ --optional=_IO_old_file_jumps \
+ --optional=_IO_old_proc_jumps \
+ > $@ 2>&1; $(evaluate-test)
+
ifeq ($(run-built-tests),yes)
tests-special += $(objpfx)tst-valgrind-smoke.out
endif
libmarkermod4-2 \
libmarkermod4-3 \
libmarkermod4-4 \
+ libmarkermod5-1 \
+ libmarkermod5-2 \
+ libmarkermod5-3 \
+ libmarkermod5-4 \
+ libmarkermod5-5 \
+ libtracemod1-1 \
+ libtracemod2-1 \
+ libtracemod3-1 \
+ libtracemod4-1 \
+ libtracemod5-1 \
ltglobmod1 \
ltglobmod2 \
neededobj1 \
tst-alignmod3 \
tst-array2dep \
tst-array5dep \
+ tst-audit-tlsdesc-mod1 \
+ tst-audit-tlsdesc-mod2 \
tst-audit11mod1 \
tst-audit11mod2 \
tst-audit12mod1 \
tst-auditmanymod7 \
tst-auditmanymod8 \
tst-auditmanymod9 \
+ tst-auditmod-tlsdesc \
tst-auditmod1 \
tst-auditmod9a \
tst-auditmod9b \
tst-auditmod24c \
tst-auditmod24d \
tst-auditmod25 \
+ tst-auditmod26 \
tst-auxvalmod \
tst-big-note-lib \
tst-deep1mod1 \
tst-dlmopen1mod \
tst-dlmopen-dlerror-mod \
tst-dlmopen-gethostbyname-mod \
+ tst-dlmopen-twice-mod1 \
+ tst-dlmopen-twice-mod2 \
tst-dlopenfaillinkmod \
tst-dlopenfailmod1 \
tst-dlopenfailmod2 \
$(objpfx)tst-gnu2-tls1: $(objpfx)tst-gnu2-tls1mod.so
tst-gnu2-tls1mod.so-no-z-defs = yes
CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=gnu2
+endif # $(have-mtls-dialect-gnu2)
-tests += tst-audit-tlsdesc tst-audit-tlsdesc-dlopen
-modules-names += tst-audit-tlsdesc-mod1 tst-audit-tlsdesc-mod2 tst-auditmod-tlsdesc
-$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
- $(objpfx)tst-audit-tlsdesc-mod2.so \
- $(shared-thread-library)
-CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2
-CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2
-$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library)
-$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \
- $(objpfx)tst-audit-tlsdesc-mod2.so
-$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so
-$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
-tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
-$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
-tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
-endif
ifeq (yes,$(have-protected-data))
modules-names += tst-protected1moda tst-protected1modb
tests += tst-protected1a tst-protected1b
# filtmod1.so, tst-big-note-lib.so, tst-ro-dynamic-mod.so have special
# rules.
modules-names-nobuild := filtmod1 tst-big-note-lib tst-ro-dynamic-mod \
- tst-audit24bmod1 tst-audit24bmod2.so
+ tst-audit24bmod1 tst-audit24bmod2
tests += $(tests-static)
$(objpfx)tst-initorder2-cmp.out \
$(objpfx)tst-unused-dep-cmp.out \
$(objpfx)tst-unused-dep.out \
+ $(objpfx)tst-trace1.out \
+ $(objpfx)tst-trace2.out \
+ $(objpfx)tst-trace3.out \
+ $(objpfx)tst-trace4.out \
+ $(objpfx)tst-trace5.out \
# tests-special
endif
tst-prelink-no-pie = yes
endif
+tests-special += $(objpfx)tst-glibcelf.out
+$(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \
+ $(..)/scripts/glibcextract.py
+ PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcelf.py \
+ --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \
+ < /dev/null > $@ 2>&1; $(evaluate-test)
+
+ifeq ($(run-built-tests),yes)
+tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out
+endif
+
# The test requires shared _and_ PIE because the executable
# unit test driver must be able to link with the shared object
# that is going to eventually go into an installed DSO.
$(LINK.o) -nostdlib -nostartfiles -shared -o $@.new \
$(LDFLAGS-rtld) -Wl,-z,defs $(z-now-$(bind-now)) \
$(filter-out $(map-file),$^) $(load-map-file) \
- -Wl,-soname=$(rtld-installed-name) \
- -Wl,-defsym=_begin=0
+ -Wl,-soname=$(rtld-installed-name)
$(call after-link,$@.new)
$(READELF) -s $@.new \
| $(AWK) '($$7 ~ /^UND(|EF)$$/ && $$1 != "0:" && $$4 != "REGISTER") { print; p=1 } END { exit p != 0 }'
$(objpfx)tst-audit24c: $(objpfx)tst-audit24amod1.so \
$(objpfx)tst-audit24amod2.so
tst-audit24c-ENV = LD_BIND_NOW=1 LD_AUDIT=$(objpfx)tst-auditmod24c.so
-LDFLAGS-tst-audit24b = -Wl,-z,lazy
+LDFLAGS-tst-audit24c = -Wl,-z,lazy
$(objpfx)tst-audit24d.out: $(objpfx)tst-auditmod24d.so
$(objpfx)tst-audit24d: $(objpfx)tst-audit24dmod1.so \
LDFLAGS-tst-audit25b = -Wl,-z,now
tst-audit25b-ARGS = -- $(host-test-program-cmd)
+$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so
+$(objpfx)tst-auditmod26.so: $(libsupport)
+tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so
+
# tst-sonamemove links against an older implementation of the library.
LDFLAGS-tst-sonamemove-linkmod1.so = \
-Wl,--version-script=tst-sonamemove-linkmod1.map \
LDFLAGS-libmarkermod2-1.so += -Wl,-soname,libmarkermod2.so
LDFLAGS-libmarkermod3-1.so += -Wl,-soname,libmarkermod3.so
LDFLAGS-libmarkermod4-1.so += -Wl,-soname,libmarkermod4.so
+LDFLAGS-libmarkermod5-1.so += -Wl,-soname,libmarkermod5.so
$(objpfx)libmarkermod%.os : markermodMARKER-VALUE.c
$(compile-command.c) \
-DMARKER=marker$(firstword $(subst -, ,$*)) \
cp $< $@
$(objpfx)libmarkermod4.so: $(objpfx)libmarkermod4-1.so
cp $< $@
+$(objpfx)libmarkermod5.so: $(objpfx)libmarkermod5-1.so
+ cp $< $@
# tst-glibc-hwcaps-prepend checks that --glibc-hwcaps-prepend is
# preferred over auto-detected subdirectories.
$(objpfx)tst-p_align3.out: tst-p_align3.sh $(objpfx)tst-p_align3
$(SHELL) $< $(common-objpfx) '$(test-program-prefix)'; \
$(evaluate-test)
+
+LDFLAGS-libtracemod1-1.so += -Wl,-soname,libtracemod1.so
+LDFLAGS-libtracemod2-1.so += -Wl,-soname,libtracemod2.so
+LDFLAGS-libtracemod3-1.so += -Wl,-soname,libtracemod3.so
+LDFLAGS-libtracemod4-1.so += -Wl,-soname,libtracemod4.so
+LDFLAGS-libtracemod5-1.so += -Wl,-soname,libtracemod5.so
+
+$(objpfx)libtracemod1-1.so: $(objpfx)libtracemod2-1.so \
+ $(objpfx)libtracemod3-1.so
+$(objpfx)libtracemod2-1.so: $(objpfx)libtracemod4-1.so \
+ $(objpfx)libtracemod5-1.so
+
+define libtracemod-x
+$(objpfx)libtracemod$(1)/libtracemod$(1).so: $(objpfx)libtracemod$(1)-1.so
+ $$(make-target-directory)
+ cp $$< $$@
+endef
+libtracemod-suffixes = 1 2 3 4 5
+$(foreach i,$(libtracemod-suffixes), $(eval $(call libtracemod-x,$(i))))
+
+define tst-trace-skeleton
+$(objpfx)tst-trace$(1).out: $(objpfx)libtracemod1/libtracemod1.so \
+ $(objpfx)libtracemod2/libtracemod2.so \
+ $(objpfx)libtracemod3/libtracemod3.so \
+ $(objpfx)libtracemod4/libtracemod4.so \
+ $(objpfx)libtracemod5/libtracemod5.so \
+ $(..)scripts/tst-ld-trace.py \
+ tst-trace$(1).exp
+ ${ $(PYTHON) $(..)scripts/tst-ld-trace.py \
+ "$(test-wrapper-env) $(elf-objpfx)$(rtld-installed-name) \
+ --library-path $(common-objpfx):$(strip $(2)) \
+ $(objpfx)libtracemod1/libtracemod1.so" tst-trace$(1).exp \
+ } > $$@; $$(evaluate-test)
+endef
+
+$(eval $(call tst-trace-skeleton,1,))
+$(eval $(call tst-trace-skeleton,2,\
+ $(objpfx)libtracemod2))
+$(eval $(call tst-trace-skeleton,3,\
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3))
+$(eval $(call tst-trace-skeleton,4,\
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4))
+$(eval $(call tst-trace-skeleton,5,\
+ $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5))
+
+$(objpfx)tst-tls-allocation-failure-static-patched: \
+ $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py
+ cp $< $@
+ $(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@
+
+$(objpfx)tst-tls-allocation-failure-static-patched.out: \
+ $(objpfx)tst-tls-allocation-failure-static-patched
+ $< > $@ 2>&1; echo "status: $$?" >> $@
+ grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \
+ && grep -q '^status: 127$$' $@; \
+ $(evaluate-test)
+
+$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \
+ $(objpfx)tst-audit-tlsdesc-mod2.so \
+ $(shared-thread-library)
+ifeq (yes,$(have-mtls-dialect-gnu2))
+# The test is valid for all TLS types, but we want to exercise GNU2
+# TLS if possible.
+CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2
+CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2
+endif
+$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library)
+$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \
+ $(objpfx)tst-audit-tlsdesc-mod2.so
+$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so
+$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
+tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
+tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+
+$(objpfx)tst-dlmopen-twice.out: \
+ $(objpfx)tst-dlmopen-twice-mod1.so \
+ $(objpfx)tst-dlmopen-twice-mod2.so
reloc_result->flags = flags;
}
- DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value);
+ if (flags & LA_SYMB_ALTVALUE)
+ DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value);
}
void
for (nlist = 0, runp = known; runp; runp = runp->next)
{
+ /* _dl_sort_maps ignores l_faked objects, so it is safe not to consider
+ them for nlist. */
if (__builtin_expect (trace_mode, 0) && runp->map->l_faked)
/* This can happen when we trace the loading. */
--map->l_searchlist.r_nlist;
--- /dev/null
+/* Early memory allocation for the dynamic loader. Generic version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <unistd.h>
+
+void *
+_dl_early_allocate (size_t size)
+{
+ void *result = __sbrk (size);
+ if (result == (void *) -1)
+ result = NULL;
+ return result;
+}
for (struct link_map *l = new_map; l != NULL; l = l->l_next)
/* Skip proxy maps and already-processed maps. */
count += l == l->l_real && !l->l_find_object_processed;
+ if (count == 0)
+ return true;
+
struct link_map **map_array = malloc (count * sizeof (*map_array));
if (map_array == NULL)
return false;
if (l == l->l_real && !l->l_find_object_processed)
map_array[i++] = l;
}
- if (count == 0)
- return true;
_dl_find_object_link_map_sort (map_array, count);
bool ok = _dl_find_object_update_1 (map_array, count);
/* Each hwcaps subdirectory has a GLIBC_HWCAPS_PREFIX string prefix
and a "/" suffix once stored in the result. */
hwcaps_counts.maximum_length += strlen (GLIBC_HWCAPS_PREFIX) + 1;
- size_t total = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1)
+ size_t hwcaps_sz = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1)
+ hwcaps_counts.total_length);
/* Count the number of bits set in the masked value. */
assert (m == cnt);
/* Determine the total size of all strings together. */
+ size_t total;
if (cnt == 1)
- total += temp[0].len + 1;
+ total = temp[0].len + 1;
else
{
- total += temp[0].len + temp[cnt - 1].len + 2;
+ total = temp[0].len + temp[cnt - 1].len + 2;
if (cnt > 2)
{
total <<= 1;
/* This is the overall result, including both glibc-hwcaps
subdirectories and the legacy hwcaps subdirectories using the
power set construction. */
+ total += hwcaps_sz;
struct r_strlenpair *overall_result
= malloc (*sz * sizeof (*result) + total);
if (overall_result == NULL)
args.caller_dlopen = RETURN_ADDRESS (0);
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode);
#endif
return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map;
args.name = name;
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name);
#endif
return (dlerror_run (do_dlsym, &args) ? NULL
__libc_dlvsym (void *map, const char *name, const char *version)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version);
#endif
__libc_dlclose (void *map)
{
#ifdef SHARED
- if (!rtld_active ())
+ if (GLRO (dl_dlfcn_hook) != NULL)
return GLRO (dl_dlfcn_hook)->libc_dlclose (map);
#endif
return dlerror_run (do_dlclose, map);
unallocated. Then jump into the normal segment-mapping loop to
handle the portion of the segment past the end of the file
mapping. */
+ if (__glibc_unlikely (loadcmds[nloadcmds - 1].mapstart <
+ c->mapend))
+ return N_("ELF load command address/offset not page-aligned");
if (__glibc_unlikely
(__mprotect ((caddr_t) (l->l_addr + c->mapend),
loadcmds[nloadcmds - 1].mapstart - c->mapend,
_dl_signal_error (EINVAL, file, NULL, N_("\
no more namespaces available for dlmopen()"));
}
- else if (nsid == GL(dl_nns))
- {
- __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock);
- ++GL(dl_nns);
- }
+
+ if (nsid == GL(dl_nns))
+ ++GL(dl_nns);
+
+ /* Initialize the new namespace. Most members are zero-initialized;
+ only the lock needs special treatment. */
+ memset (&GL(dl_ns)[nsid], 0, sizeof (GL(dl_ns)[nsid]));
+ __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock);
_dl_debug_update (nsid)->r_state = RT_CONSISTENT;
}
If FOR_FINI is true, this is called for finishing an object. */
static void
_dl_sort_maps_original (struct link_map **maps, unsigned int nmaps,
- unsigned int skip, bool for_fini)
+ bool force_first, bool for_fini)
{
/* Allows caller to do the common optimization of skipping the first map,
usually the main binary. */
- maps += skip;
- nmaps -= skip;
+ maps += force_first;
+ nmaps -= force_first;
/* A list of one element need not be sorted. */
if (nmaps <= 1)
dfs_traversal (struct link_map ***rpo, struct link_map *map,
bool *do_reldeps)
{
- if (map->l_visited)
+ /* _dl_map_object_deps ignores l_faked objects when calculating the
+ number of maps before calling _dl_sort_maps, so ignore them here as well. */
+ if (map->l_visited || map->l_faked)
return;
map->l_visited = 1;
static void
_dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
- unsigned int skip __attribute__ ((unused)), bool for_fini)
+ bool force_first, bool for_fini)
{
+ struct link_map *first_map = maps[0];
for (int i = nmaps - 1; i >= 0; i--)
maps[i]->l_visited = 0;
Adjusting the order so that maps[0] is last traversed naturally avoids
this problem.
- Further, the old "optimization" of skipping the main object at maps[0]
- from the call-site (i.e. _dl_sort_maps(maps+1,nmaps-1)) is in general
- no longer valid, since traversing along object dependency-links
- may "find" the main object even when it is not included in the initial
- order (e.g. a dlopen()'ed shared object can have circular dependencies
- linked back to itself). In such a case, traversing N-1 objects will
- create a N-object result, and raise problems.
-
To summarize, just passing in the full list, and iterating from back
to front makes things much more straightforward. */
}
memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+
+ /* Skipping the first object at maps[0] is not valid in general,
+ since traversing along object dependency-links may "find" that
+ first object even when it is not included in the initial order
+ (e.g., a dlopen'ed shared object can have circular dependencies
+ linked back to itself). In such a case, traversing N-1 objects
+ will create a N-object result, and raise problems. Instead,
+ force the object back into first place after sorting. This naive
+ approach may introduce further dependency ordering violations
+ compared to rotating the cycle until the first map is again in
+ the first position, but as there is a cycle, at least one
+ violation is already present. */
+ if (force_first && maps[0] != first_map)
+ {
+ int i;
+ for (i = 0; maps[i] != first_map; ++i)
+ ;
+ assert (i < nmaps);
+ memmove (&maps[1], maps, i * sizeof (maps[0]));
+ maps[0] = first_map;
+ }
}
void
void
_dl_sort_maps (struct link_map **maps, unsigned int nmaps,
- unsigned int skip, bool for_fini)
+ bool force_first, bool for_fini)
{
/* It can be tempting to use a static function pointer to store and call
the current selected sorting algorithm routine, but experimentation
input cases. A simple if-case with direct function calls appears to
be the fastest. */
if (__glibc_likely (GLRO(dl_dso_sort_algo) == dso_sort_algorithm_original))
- _dl_sort_maps_original (maps, nmaps, skip, for_fini);
+ _dl_sort_maps_original (maps, nmaps, force_first, for_fini);
else
- _dl_sort_maps_dfs (maps, nmaps, skip, for_fini);
+ _dl_sort_maps_dfs (maps, nmaps, force_first, for_fini);
}
#endif /* HAVE_TUNABLES. */
#include <dl-vdso-setup.h>
#include <dl-auxv.h>
#include <dl-find_object.h>
+#include <array_length.h>
extern char *__progname;
char **_dl_argv = &__progname; /* This is checked for some error messages. */
#ifdef HAVE_AUX_VECTOR
+#include <dl-parse_auxv.h>
+
int _dl_clktck;
void
_dl_aux_init (ElfW(auxv_t) *av)
{
- int seen = 0;
- uid_t uid = 0;
- gid_t gid = 0;
-
#ifdef NEED_DL_SYSINFO
/* NB: Avoid RELATIVE relocation in static PIE. */
GL(dl_sysinfo) = DL_SYSINFO_DEFAULT;
#endif
_dl_auxv = av;
- for (; av->a_type != AT_NULL; ++av)
- switch (av->a_type)
- {
- case AT_PAGESZ:
- if (av->a_un.a_val != 0)
- GLRO(dl_pagesize) = av->a_un.a_val;
- break;
- case AT_CLKTCK:
- GLRO(dl_clktck) = av->a_un.a_val;
- break;
- case AT_PHDR:
- GL(dl_phdr) = (const void *) av->a_un.a_val;
- break;
- case AT_PHNUM:
- GL(dl_phnum) = av->a_un.a_val;
- break;
- case AT_PLATFORM:
- GLRO(dl_platform) = (void *) av->a_un.a_val;
- break;
- case AT_HWCAP:
- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val;
- break;
- case AT_HWCAP2:
- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val;
- break;
- case AT_FPUCW:
- GLRO(dl_fpu_control) = av->a_un.a_val;
- break;
-#ifdef NEED_DL_SYSINFO
- case AT_SYSINFO:
- GL(dl_sysinfo) = av->a_un.a_val;
- break;
-#endif
-#ifdef NEED_DL_SYSINFO_DSO
- case AT_SYSINFO_EHDR:
- GL(dl_sysinfo_dso) = (void *) av->a_un.a_val;
- break;
-#endif
- case AT_UID:
- uid ^= av->a_un.a_val;
- seen |= 1;
- break;
- case AT_EUID:
- uid ^= av->a_un.a_val;
- seen |= 2;
- break;
- case AT_GID:
- gid ^= av->a_un.a_val;
- seen |= 4;
- break;
- case AT_EGID:
- gid ^= av->a_un.a_val;
- seen |= 8;
- break;
- case AT_SECURE:
- seen = -1;
- __libc_enable_secure = av->a_un.a_val;
- __libc_enable_secure_decided = 1;
- break;
- case AT_RANDOM:
- _dl_random = (void *) av->a_un.a_val;
- break;
- case AT_MINSIGSTKSZ:
- _dl_minsigstacksize = av->a_un.a_val;
- break;
- DL_PLATFORM_AUXV
- }
- if (seen == 0xf)
- {
- __libc_enable_secure = uid != 0 || gid != 0;
- __libc_enable_secure_decided = 1;
- }
+ dl_parse_auxv_t auxv_values;
+ /* Use an explicit initialization loop here because memset may not
+ be available yet. */
+ for (int i = 0; i < array_length (auxv_values); ++i)
+ auxv_values[i] = 0;
+ _dl_parse_auxv (av, auxv_values);
}
#endif
-/* Operating system support for run-time dynamic linker. Generic Unix version.
+/* Operating system support for run-time dynamic linker. Stub version.
Copyright (C) 1995-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-/* We conditionalize the whole of this file rather than simply eliding it
- from the static build, because other sysdeps/ versions of this file
- might define things needed by a static build. */
-
-#ifdef SHARED
-
-#include <assert.h>
-#include <elf.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <libintl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <ldsodefs.h>
-#include <_itoa.h>
-#include <fpu_control.h>
-
-#include <entry.h>
-#include <dl-machine.h>
-#include <dl-procinfo.h>
-#include <dl-osinfo.h>
-#include <libc-internal.h>
-#include <tls.h>
-
-#include <dl-tunables.h>
-#include <dl-auxv.h>
-#include <dl-hwcap-check.h>
-
-extern char **_environ attribute_hidden;
-extern char _end[] attribute_hidden;
-
-/* Protect SUID program against misuse of file descriptors. */
-extern void __libc_check_standard_fds (void);
-
-int __libc_enable_secure attribute_relro = 0;
-rtld_hidden_data_def (__libc_enable_secure)
-/* This variable contains the lowest stack address ever used. */
-void *__libc_stack_end attribute_relro = NULL;
-rtld_hidden_data_def(__libc_stack_end)
-void *_dl_random attribute_relro = NULL;
-
-#ifndef DL_FIND_ARG_COMPONENTS
-# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp) \
- do { \
- void **_tmp; \
- (argc) = *(long int *) cookie; \
- (argv) = (char **) ((long int *) cookie + 1); \
- (envp) = (argv) + (argc) + 1; \
- for (_tmp = (void **) (envp); *_tmp; ++_tmp) \
- continue; \
- (auxp) = (void *) ++_tmp; \
- } while (0)
-#endif
-
-#ifndef DL_STACK_END
-# define DL_STACK_END(cookie) ((void *) (cookie))
-#endif
-
-ElfW(Addr)
-_dl_sysdep_start (void **start_argptr,
- void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum,
- ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv))
-{
- const ElfW(Phdr) *phdr = NULL;
- ElfW(Word) phnum = 0;
- ElfW(Addr) user_entry;
- ElfW(auxv_t) *av;
-#ifdef HAVE_AUX_SECURE
-# define set_seen(tag) (tag) /* Evaluate for the side effects. */
-# define set_seen_secure() ((void) 0)
-#else
- uid_t uid = 0;
- gid_t gid = 0;
- unsigned int seen = 0;
-# define set_seen_secure() (seen = -1)
-# ifdef HAVE_AUX_XID
-# define set_seen(tag) (tag) /* Evaluate for the side effects. */
-# else
-# define M(type) (1 << (type))
-# define set_seen(tag) seen |= M ((tag)->a_type)
-# endif
-#endif
-#ifdef NEED_DL_SYSINFO
- uintptr_t new_sysinfo = 0;
-#endif
-
- __libc_stack_end = DL_STACK_END (start_argptr);
- DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ,
- GLRO(dl_auxv));
-
- user_entry = (ElfW(Addr)) ENTRY_POINT;
- GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */
-
- /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */
- _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ),
- "CONSTANT_MINSIGSTKSZ is constant");
- GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ;
-
- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++))
- switch (av->a_type)
- {
- case AT_PHDR:
- phdr = (void *) av->a_un.a_val;
- break;
- case AT_PHNUM:
- phnum = av->a_un.a_val;
- break;
- case AT_PAGESZ:
- GLRO(dl_pagesize) = av->a_un.a_val;
- break;
- case AT_ENTRY:
- user_entry = av->a_un.a_val;
- break;
-#ifndef HAVE_AUX_SECURE
- case AT_UID:
- case AT_EUID:
- uid ^= av->a_un.a_val;
- break;
- case AT_GID:
- case AT_EGID:
- gid ^= av->a_un.a_val;
- break;
-#endif
- case AT_SECURE:
-#ifndef HAVE_AUX_SECURE
- seen = -1;
-#endif
- __libc_enable_secure = av->a_un.a_val;
- break;
- case AT_PLATFORM:
- GLRO(dl_platform) = (void *) av->a_un.a_val;
- break;
- case AT_HWCAP:
- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val;
- break;
- case AT_HWCAP2:
- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val;
- break;
- case AT_CLKTCK:
- GLRO(dl_clktck) = av->a_un.a_val;
- break;
- case AT_FPUCW:
- GLRO(dl_fpu_control) = av->a_un.a_val;
- break;
-#ifdef NEED_DL_SYSINFO
- case AT_SYSINFO:
- new_sysinfo = av->a_un.a_val;
- break;
-#endif
-#ifdef NEED_DL_SYSINFO_DSO
- case AT_SYSINFO_EHDR:
- GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val;
- break;
-#endif
- case AT_RANDOM:
- _dl_random = (void *) av->a_un.a_val;
- break;
- case AT_MINSIGSTKSZ:
- GLRO(dl_minsigstacksize) = av->a_un.a_val;
- break;
- DL_PLATFORM_AUXV
- }
-
- dl_hwcap_check ();
-
-#ifndef HAVE_AUX_SECURE
- if (seen != -1)
- {
- /* Fill in the values we have not gotten from the kernel through the
- auxiliary vector. */
-# ifndef HAVE_AUX_XID
-# define SEE(UID, var, uid) \
- if ((seen & M (AT_##UID)) == 0) var ^= __get##uid ()
- SEE (UID, uid, uid);
- SEE (EUID, uid, euid);
- SEE (GID, gid, gid);
- SEE (EGID, gid, egid);
-# endif
-
- /* If one of the two pairs of IDs does not match this is a setuid
- or setgid run. */
- __libc_enable_secure = uid | gid;
- }
-#endif
-
-#ifndef HAVE_AUX_PAGESIZE
- if (GLRO(dl_pagesize) == 0)
- GLRO(dl_pagesize) = __getpagesize ();
-#endif
-
-#ifdef NEED_DL_SYSINFO
- if (new_sysinfo != 0)
- {
-# ifdef NEED_DL_SYSINFO_DSO
- /* Only set the sysinfo value if we also have the vsyscall DSO. */
- if (GLRO(dl_sysinfo_dso) != 0)
-# endif
- GLRO(dl_sysinfo) = new_sysinfo;
- }
-#endif
-
- __tunables_init (_environ);
-
- /* Initialize DSO sorting algorithm after tunables. */
- _dl_sort_maps_init ();
-
-#ifdef DL_SYSDEP_INIT
- DL_SYSDEP_INIT;
-#endif
-
-#ifdef DL_PLATFORM_INIT
- DL_PLATFORM_INIT;
-#endif
-
- /* Determine the length of the platform name. */
- if (GLRO(dl_platform) != NULL)
- GLRO(dl_platformlen) = strlen (GLRO(dl_platform));
-
- if (__sbrk (0) == _end)
- /* The dynamic linker was run as a program, and so the initial break
- starts just after our bss, at &_end. The malloc in dl-minimal.c
- will consume the rest of this page, so tell the kernel to move the
- break up that far. When the user program examines its break, it
- will see this new value and not clobber our data. */
- __sbrk (GLRO(dl_pagesize)
- - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1)));
-
- /* If this is a SUID program we make sure that FDs 0, 1, and 2 are
- allocated. If necessary we are doing it ourself. If it is not
- possible we stop the program. */
- if (__builtin_expect (__libc_enable_secure, 0))
- __libc_check_standard_fds ();
-
- (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv));
- return user_entry;
-}
-
-void
-_dl_sysdep_start_cleanup (void)
-{
-}
-
-void
-_dl_show_auxv (void)
-{
- char buf[64];
- ElfW(auxv_t) *av;
-
- /* Terminate string. */
- buf[63] = '\0';
-
- /* The following code assumes that the AT_* values are encoded
- starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values
- close by (otherwise the array will be too large). In case we have
- to support a platform where these requirements are not fulfilled
- some alternative implementation has to be used. */
- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av)
- {
- static const struct
- {
- const char label[22];
- enum { unknown = 0, dec, hex, str, ignore } form : 8;
- } auxvars[] =
- {
- [AT_EXECFD - 2] = { "EXECFD: ", dec },
- [AT_EXECFN - 2] = { "EXECFN: ", str },
- [AT_PHDR - 2] = { "PHDR: 0x", hex },
- [AT_PHENT - 2] = { "PHENT: ", dec },
- [AT_PHNUM - 2] = { "PHNUM: ", dec },
- [AT_PAGESZ - 2] = { "PAGESZ: ", dec },
- [AT_BASE - 2] = { "BASE: 0x", hex },
- [AT_FLAGS - 2] = { "FLAGS: 0x", hex },
- [AT_ENTRY - 2] = { "ENTRY: 0x", hex },
- [AT_NOTELF - 2] = { "NOTELF: ", hex },
- [AT_UID - 2] = { "UID: ", dec },
- [AT_EUID - 2] = { "EUID: ", dec },
- [AT_GID - 2] = { "GID: ", dec },
- [AT_EGID - 2] = { "EGID: ", dec },
- [AT_PLATFORM - 2] = { "PLATFORM: ", str },
- [AT_HWCAP - 2] = { "HWCAP: ", hex },
- [AT_CLKTCK - 2] = { "CLKTCK: ", dec },
- [AT_FPUCW - 2] = { "FPUCW: ", hex },
- [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex },
- [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex },
- [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex },
- [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore },
- [AT_SECURE - 2] = { "SECURE: ", dec },
- [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str },
- [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex },
- [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex },
- [AT_RANDOM - 2] = { "RANDOM: 0x", hex },
- [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex },
- [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec },
- [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec },
- [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex },
- [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec },
- [AT_L1D_CACHEGEOMETRY - 2] = { "L1D_CACHEGEOMETRY: 0x", hex },
- [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec },
- [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex },
- [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec },
- [AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex },
- };
- unsigned int idx = (unsigned int) (av->a_type - 2);
-
- if ((unsigned int) av->a_type < 2u
- || (idx < sizeof (auxvars) / sizeof (auxvars[0])
- && auxvars[idx].form == ignore))
- continue;
-
- assert (AT_NULL == 0);
- assert (AT_IGNORE == 1);
-
- /* Some entries are handled in a special way per platform. */
- if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0)
- continue;
-
- if (idx < sizeof (auxvars) / sizeof (auxvars[0])
- && auxvars[idx].form != unknown)
- {
- const char *val = (char *) av->a_un.a_val;
-
- if (__builtin_expect (auxvars[idx].form, dec) == dec)
- val = _itoa ((unsigned long int) av->a_un.a_val,
- buf + sizeof buf - 1, 10, 0);
- else if (__builtin_expect (auxvars[idx].form, hex) == hex)
- val = _itoa ((unsigned long int) av->a_un.a_val,
- buf + sizeof buf - 1, 16, 0);
-
- _dl_printf ("AT_%s%s\n", auxvars[idx].label, val);
-
- continue;
- }
-
- /* Unknown value: print a generic line. */
- char buf2[17];
- buf2[sizeof (buf2) - 1] = '\0';
- const char *val2 = _itoa ((unsigned long int) av->a_un.a_val,
- buf2 + sizeof buf2 - 1, 16, 0);
- const char *val = _itoa ((unsigned long int) av->a_type,
- buf + sizeof buf - 1, 16, 0);
- _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2);
- }
-}
-
-#endif
+#error dl-sysdep support missing.
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+
+# Test that even in the presence of dependency loops involving a dlopen'ed
+# object, that object is initialized last (and not unloaded prematurely).
+# Final destructor order is indeterminate due to the cycle.
+tst-bz28937: {+a;+b;-b;+c;%c};a->a1;a->a2;a2->a;b->b1;c->a1;c=>a1
+output(glibc.rtld.dynamic_sort=1): {+a[a2>a1>a>];+b[b1>b>];-b[<b<b1];+c[c>];%c(a1());}<a<a2<c<a1
+output(glibc.rtld.dynamic_sort=2): {+a[a2>a1>a>];+b[b1>b>];-b[<b<b1];+c[c>];%c(a1());}<a2<a<c<a1
#include <startup.h>
#include <libc-internal.h>
-/* If nonzero __libc_enable_secure is already set. */
-int __libc_enable_secure_decided;
/* Safest assumption, if somehow the initializer isn't run. */
int __libc_enable_secure = 1;
-
-void
-__libc_init_secure (void)
-{
- if (__libc_enable_secure_decided == 0)
- __libc_enable_secure = (startup_geteuid () != startup_getuid ()
- || startup_getegid () != startup_getgid ());
-}
--- /dev/null
+/* Empty */
--- /dev/null
+/* Empty */
--- /dev/null
+/* Empty */
--- /dev/null
+/* Empty */
--- /dev/null
+/* Empty */
struct dl_start_final_info *info);
#endif
-/* These defined magically in the linker script. */
-extern char _begin[] attribute_hidden;
+/* These are defined magically by the linker. */
+extern const ElfW(Ehdr) __ehdr_start attribute_hidden;
extern char _etext[] attribute_hidden;
extern char _end[] attribute_hidden;
#endif
_dl_setup_hash (&GL(dl_rtld_map));
GL(dl_rtld_map).l_real = &GL(dl_rtld_map);
- GL(dl_rtld_map).l_map_start = (ElfW(Addr)) _begin;
+ GL(dl_rtld_map).l_map_start = (ElfW(Addr)) &__ehdr_start;
GL(dl_rtld_map).l_map_end = (ElfW(Addr)) _end;
GL(dl_rtld_map).l_text_end = (ElfW(Addr)) _etext;
/* Copy the TLS related data if necessary. */
return has_interp;
}
+/* Adjusts the contents of the stack and related globals for the user
+   entry point. ld.so has already processed skip_args arguments and
+   bumped _dl_argv and _dl_argc accordingly; those arguments are removed
+   from argv here. */
+static void
+_dl_start_args_adjust (int skip_args)
+{
+ void **sp = (void **) (_dl_argv - skip_args - 1);
+ void **p = sp + skip_args;
+
+ if (skip_args == 0)
+ return;
+
+ /* Sanity check. */
+ intptr_t argc = (intptr_t) sp[0] - skip_args;
+ assert (argc == _dl_argc);
+
+ /* Adjust argc on stack. */
+ sp[0] = (void *) (intptr_t) _dl_argc;
+
+ /* Update globals in rtld. */
+ _dl_argv -= skip_args;
+ _environ -= skip_args;
+
+ /* Shuffle argv down. */
+ do
+ *++sp = *++p;
+ while (*p != NULL);
+
+ assert (_environ == (char **) (sp + 1));
+
+ /* Shuffle envp down. */
+ do
+ *++sp = *++p;
+ while (*p != NULL);
+
+#ifdef HAVE_AUX_VECTOR
+ void **auxv = (void **) GLRO(dl_auxv) - skip_args;
+ GLRO(dl_auxv) = (ElfW(auxv_t) *) auxv; /* Aliasing violation. */
+ assert (auxv == sp + 1);
+
+ /* Shuffle auxv down. */
+ ElfW(auxv_t) ax;
+ char *oldp = (char *) (p + 1);
+ char *newp = (char *) (sp + 1);
+ do
+ {
+ memcpy (&ax, oldp, sizeof (ax));
+ memcpy (newp, &ax, sizeof (ax));
+ oldp += sizeof (ax);
+ newp += sizeof (ax);
+ }
+ while (ax.a_type != AT_NULL);
+#endif
+}
+
static void
dl_main (const ElfW(Phdr) *phdr,
ElfW(Word) phnum,
rtld_is_main = true;
char *argv0 = NULL;
+ char **orig_argv = _dl_argv;
/* Note the place where the dynamic linker actually came from. */
GL(dl_rtld_map).l_name = rtld_progname;
GLRO(dl_lazy) = -1;
}
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
}
if (state.mode != rtld_mode_help)
state.mode = rtld_mode_verify;
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
}
else if (! strcmp (_dl_argv[1], "--inhibit-cache"))
{
GLRO(dl_inhibit_cache) = 1;
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
}
state.library_path = _dl_argv[2];
state.library_path_source = "--library-path";
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
{
GLRO(dl_inhibit_rpath) = _dl_argv[2];
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
{
audit_list_add_string (&state.audit_list, _dl_argv[2]);
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
else if (! strcmp (_dl_argv[1], "--preload") && _dl_argc > 2)
{
state.preloadarg = _dl_argv[2];
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
{
argv0 = _dl_argv[2];
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
&& _dl_argc > 2)
{
state.glibc_hwcaps_prepend = _dl_argv[2];
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
&& _dl_argc > 2)
{
state.glibc_hwcaps_mask = _dl_argv[2];
- _dl_skip_args += 2;
_dl_argc -= 2;
_dl_argv += 2;
}
{
state.mode = rtld_mode_list_tunables;
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
}
{
state.mode = rtld_mode_list_diagnostics;
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
}
_dl_usage (ld_so_name, NULL);
}
- ++_dl_skip_args;
--_dl_argc;
++_dl_argv;
/* Set the argv[0] string now that we've processed the executable. */
if (argv0 != NULL)
_dl_argv[0] = argv0;
+
+ /* Adjust arguments for the application entry point. */
+ _dl_start_args_adjust (_dl_argv - orig_argv);
}
else
{
segment that also includes the phdrs. If that's not available, we use
the old method that assumes the beginning of the file is part of the
lowest-addressed PT_LOAD segment. */
- extern const ElfW(Ehdr) __ehdr_start __attribute__ ((visibility ("hidden")));
/* Set up the program header information for the dynamic linker
itself. It is needed in the dl_iterate_phdr callbacks. */
--- /dev/null
+/* Check the usability of <dlfcn.h> functions in audit modules.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <gnu/lib-names.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+ /* Check that the audit module has been loaded. */
+ void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW);
+ TEST_VERIFY (handle
+ == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD));
+
+ return 0;
+}
+
+#include <support/test-driver.c>
return sym->st_value;
}
- abort ();
+ if (symname[0] != '\0')
+ abort ();
+ return sym->st_value;
}
}
}
- abort ();
+ if (symname[0] != '\0')
+ abort ();
+ return sym->st_value;
}
unsigned int *flags, const char *symname)
#endif
{
- if (*refcook != -1 && *defcook != -1)
+ if (*refcook != -1 && *defcook != -1 && symname[0] != '\0')
fprintf (stderr, "la_symbind: %s %u\n", symname,
*flags & (LA_SYMB_NOPLTENTER | LA_SYMB_NOPLTEXIT) ? 1 : 0);
return sym->st_value;
--- /dev/null
+/* Check the usability of <dlfcn.h> functions in audit modules. Audit module.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <first-versions.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+unsigned int
+la_version (unsigned int current)
+{
+ /* Exercise various <dlfcn.h> functions. */
+
+ /* Check dlopen, dlsym, dlclose. */
+ void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW);
+ void *ptr = xdlsym (handle, "sincos");
+ TEST_VERIFY (ptr != NULL);
+ ptr = dlsym (handle, "SINCOS");
+ TEST_VERIFY (ptr == NULL);
+ const char *message = dlerror ();
+ TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL);
+ ptr = dlsym (handle, "SINCOS");
+ TEST_VERIFY (ptr == NULL);
+ xdlclose (handle);
+ TEST_COMPARE_STRING (dlerror (), NULL);
+
+ handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD);
+
+ /* Check dlvsym. _exit is unlikely to gain another symbol
+ version. */
+ TEST_VERIFY (xdlsym (handle, "_exit")
+ == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING));
+
+ /* Check dlinfo. */
+ {
+ void *handle2 = NULL;
+ TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0);
+ TEST_VERIFY (handle2 == handle);
+ }
+
+ /* Check dladdr and dladdr1. */
+ Dl_info info = { };
+ TEST_VERIFY (dladdr (&_exit, &info) != 0);
+ if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. */
+ TEST_COMPARE_STRING (info.dli_sname, "_exit");
+ TEST_VERIFY (info.dli_saddr == &_exit);
+ TEST_VERIFY (strstr (info.dli_fname, LIBC_SO));
+ void *extra_info;
+ memset (&info, 0, sizeof (info));
+ TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+ TEST_VERIFY (extra_info == handle);
+
+ /* Verify that dlmopen creates a new namespace. */
+ void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW);
+ TEST_VERIFY (dlmopen_handle != handle);
+ memset (&info, 0, sizeof (info));
+ extra_info = NULL;
+ ptr = xdlsym (dlmopen_handle, "_exit");
+ TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0);
+ TEST_VERIFY (extra_info == dlmopen_handle);
+ xdlclose (dlmopen_handle);
+
+ /* Terminate the process with an error state. This does not happen
+ automatically because the audit module state is not shared with
+ the main program. */
+ if (support_record_failure_is_failed ())
+ {
+ fflush (stdout);
+ fflush (stderr);
+ _exit (1);
+ }
+
+ return LAV_CURRENT;
+}
+
+char *
+la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag)
+{
+ if (strcmp (name, "mapped to libc") == 0)
+ return (char *) LIBC_SO;
+ else
+ return (char *) name;
+}
--- /dev/null
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Module 1.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+
+static void __attribute__ ((constructor))
+init (void)
+{
+ puts ("info: tst-dlmopen-twice-mod1.so loaded");
+ fflush (stdout);
+}
+
+static void __attribute__ ((destructor))
+fini (void)
+{
+ puts ("info: tst-dlmopen-twice-mod1.so about to be unloaded");
+ fflush (stdout);
+}
+
+/* Large allocation. The second module does not have this, so it
+ should load libc at a different address. */
+char large_allocate[16 * 1024 * 1024];
--- /dev/null
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Module 2.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <ctype.h>
+#include <stdio.h>
+
+static void __attribute__ ((constructor))
+init (void)
+{
+ puts ("info: tst-dlmopen-twice-mod2.so loaded");
+ fflush (stdout);
+}
+
+static void __attribute__ ((destructor))
+fini (void)
+{
+ puts ("info: tst-dlmopen-twice-mod2.so about to be unloaded");
+ fflush (stdout);
+}
+
+int
+run_check (void)
+{
+ puts ("info: about to call isalpha");
+ fflush (stdout);
+
+ volatile char ch = 'a';
+ if (!isalpha (ch))
+ {
+ puts ("error: isalpha ('a') is not true");
+ fflush (stdout);
+ return 1;
+ }
+ return 0;
+}
--- /dev/null
+/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Main.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/xdlfcn.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+ void *handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod1.so", RTLD_NOW);
+ xdlclose (handle);
+ handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod2.so", RTLD_NOW);
+ int (*run_check) (void) = xdlsym (handle, "run_check");
+ TEST_COMPARE (run_check (), 0);
+ xdlclose (handle);
+ return 0;
+}
+
+#include <support/test-driver.c>
cp $B/elf/libmarkermod2-1.so $L/libmarkermod2.so
cp $B/elf/libmarkermod3-1.so $L/libmarkermod3.so
cp $B/elf/libmarkermod4-1.so $L/libmarkermod4.so
+cp $B/elf/libmarkermod5-1.so $L/libmarkermod5.so
mkdirp 0770 $L/glibc-hwcaps/power9
cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/power9/libmarkermod2.so
cp $B/elf/libmarkermod4-2.so $L/glibc-hwcaps/z13/libmarkermod4.so
cp $B/elf/libmarkermod4-3.so $L/glibc-hwcaps/z14/libmarkermod4.so
cp $B/elf/libmarkermod4-4.so $L/glibc-hwcaps/z15/libmarkermod4.so
+mkdirp 0770 $L/glibc-hwcaps/z16
+cp $B/elf/libmarkermod5-2.so $L/glibc-hwcaps/z13/libmarkermod5.so
+cp $B/elf/libmarkermod5-3.so $L/glibc-hwcaps/z14/libmarkermod5.so
+cp $B/elf/libmarkermod5-4.so $L/glibc-hwcaps/z15/libmarkermod5.so
+cp $B/elf/libmarkermod5-5.so $L/glibc-hwcaps/z16/libmarkermod5.so
mkdirp 0770 $L/glibc-hwcaps/x86-64-v2
cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/x86-64-v2/libmarkermod2.so
--- /dev/null
+#!/usr/bin/python3
+# Verify scripts/glibcelf.py contents against elf/elf.h.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+import argparse
+import enum
+import sys
+
+import glibcelf
+import glibcextract
+
+errors_encountered = 0
+
+def error(message):
+ global errors_encountered
+ sys.stdout.write('error: {}\n'.format(message))
+ errors_encountered += 1
+
+# The enum constants in glibcelf are expected to have exactly these
+# prefixes.
+expected_constant_prefixes = tuple(
+ 'ELFCLASS ELFDATA EM_ ET_ DT_ PF_ PT_ SHF_ SHN_ SHT_ STB_ STT_'.split())
+
+def find_constant_prefix(name):
+ """Returns a matching prefix from expected_constant_prefixes or None."""
+ for prefix in expected_constant_prefixes:
+ if name.startswith(prefix):
+ return prefix
+ return None
+
+def find_enum_types():
+ """A generator for OpenIntEnum and IntFlag classes in glibcelf."""
+ for obj in vars(glibcelf).values():
+ if isinstance(obj, type) and obj.__bases__[0] in (
+ glibcelf._OpenIntEnum, enum.Enum, enum.IntFlag):
+ yield obj
+
+def check_duplicates():
+ """Verifies that enum types do not have duplicate values.
+
+ Different types must have different member names, too.
+
+ """
+ global_seen = {}
+ for typ in find_enum_types():
+ seen = {}
+ last = None
+ for (name, e) in typ.__members__.items():
+ if e.value in seen:
+ error('{} has {}={} and {}={}'.format(
+ typ, seen[e.value], e.value, name, e.value))
+ last = e
+ else:
+ seen[e.value] = name
+ if last is not None and last.value > e.value:
+ error('{} has {}={} after {}={}'.format(
+ typ, name, e.value, last.name, last.value))
+ if name in global_seen:
+ error('{} used in {} and {}'.format(
+ name, global_seen[name], typ))
+ else:
+ global_seen[name] = typ
+
+def check_constant_prefixes():
+ """Check that the constant prefixes match expected_constant_prefixes."""
+ seen = set()
+ for typ in find_enum_types():
+ typ_prefix = None
+ for val in typ:
+ prefix = find_constant_prefix(val.name)
+ if prefix is None:
+ error('constant {!r} for {} has unknown prefix'.format(
+ val, typ))
+ break
+ elif typ_prefix is None:
+ typ_prefix = prefix
+ seen.add(typ_prefix)
+ elif prefix != typ_prefix:
+ error('prefix {!r} for constant {!r}, expected {!r}'.format(
+ prefix, val, typ_prefix))
+ if typ_prefix is None:
+ error('empty enum type {}'.format(typ))
+
+ for prefix in sorted(set(expected_constant_prefixes) - seen):
+ error('missing constant prefix {!r}'.format(prefix))
+ # Reverse difference is already covered inside the loop.
+
+def find_elf_h_constants(cc):
+ """Returns a dictionary of relevant constants from <elf.h>."""
+ return glibcextract.compute_macro_consts(
+ source_text='#include <elf.h>',
+ cc=cc,
+ macro_re='|'.join(
+ prefix + '.*' for prefix in expected_constant_prefixes))
+
+# The first part of the pair is a name of an <elf.h> constant that is
+# dropped from glibcelf. The second part is the constant as it is
+# used in <elf.h>.
+glibcelf_skipped_aliases = (
+ ('EM_ARC_A5', 'EM_ARC_COMPACT'),
+ ('PF_PARISC_SBP', 'PF_HP_SBP')
+)
+
+# Constants that provide little value and are not included in
+# glibcelf: *LO*/*HI* range constants, *NUM constants counting the
+# number of constants. Also includes the alias names from
+# glibcelf_skipped_aliases.
+glibcelf_skipped_constants = frozenset(
+ [e[0] for e in glibcelf_skipped_aliases]) | frozenset("""
+DT_AARCH64_NUM
+DT_ADDRNUM
+DT_ADDRRNGHI
+DT_ADDRRNGLO
+DT_ALPHA_NUM
+DT_ENCODING
+DT_EXTRANUM
+DT_HIOS
+DT_HIPROC
+DT_IA_64_NUM
+DT_LOOS
+DT_LOPROC
+DT_MIPS_NUM
+DT_NUM
+DT_PPC64_NUM
+DT_PPC_NUM
+DT_PROCNUM
+DT_SPARC_NUM
+DT_VALNUM
+DT_VALRNGHI
+DT_VALRNGLO
+DT_VERSIONTAGNUM
+ELFCLASSNUM
+ELFDATANUM
+ET_HIOS
+ET_HIPROC
+ET_LOOS
+ET_LOPROC
+ET_NUM
+PF_MASKOS
+PF_MASKPROC
+PT_HIOS
+PT_HIPROC
+PT_HISUNW
+PT_LOOS
+PT_LOPROC
+PT_LOSUNW
+SHF_MASKOS
+SHF_MASKPROC
+SHN_HIOS
+SHN_HIPROC
+SHN_HIRESERVE
+SHN_LOOS
+SHN_LOPROC
+SHN_LORESERVE
+SHT_HIOS
+SHT_HIPROC
+SHT_HISUNW
+SHT_HIUSER
+SHT_LOOS
+SHT_LOPROC
+SHT_LOSUNW
+SHT_LOUSER
+SHT_NUM
+STB_HIOS
+STB_HIPROC
+STB_LOOS
+STB_LOPROC
+STB_NUM
+STT_HIOS
+STT_HIPROC
+STT_LOOS
+STT_LOPROC
+STT_NUM
+""".strip().split())
+
+def check_constant_values(cc):
+ """Checks the values of <elf.h> constants against glibcelf."""
+
+ glibcelf_constants = {
+ e.name: e for typ in find_enum_types() for e in typ}
+ elf_h_constants = find_elf_h_constants(cc=cc)
+
+ missing_in_glibcelf = (set(elf_h_constants) - set(glibcelf_constants)
+ - glibcelf_skipped_constants)
+ for name in sorted(missing_in_glibcelf):
+ error('constant {} is missing from glibcelf'.format(name))
+
+ unexpected_in_glibcelf = \
+ set(glibcelf_constants) & glibcelf_skipped_constants
+ for name in sorted(unexpected_in_glibcelf):
+ error('constant {} is supposed to be filtered from glibcelf'.format(
+ name))
+
+ missing_in_elf_h = set(glibcelf_constants) - set(elf_h_constants)
+ for name in sorted(missing_in_elf_h):
+ error('constant {} is missing from <elf.h>'.format(name))
+
+ expected_in_elf_h = glibcelf_skipped_constants - set(elf_h_constants)
+ for name in expected_in_elf_h:
+ error('filtered constant {} is missing from <elf.h>'.format(name))
+
+ for alias_name, name_in_glibcelf in glibcelf_skipped_aliases:
+ if name_in_glibcelf not in glibcelf_constants:
+ error('alias value {} for {} not in glibcelf'.format(
+ name_in_glibcelf, alias_name))
+ elif (int(elf_h_constants[alias_name])
+ != glibcelf_constants[name_in_glibcelf].value):
+ error('<elf.h> has {}={}, glibcelf has {}={}'.format(
+ alias_name, elf_h_constants[alias_name],
+ name_in_glibcelf, glibcelf_constants[name_in_glibcelf]))
+
+ # Check for value mismatches:
+ for name in sorted(set(glibcelf_constants) & set(elf_h_constants)):
+ glibcelf_value = glibcelf_constants[name].value
+ elf_h_value = int(elf_h_constants[name])
+ # On 32-bit architectures <elf.h> has some constants that are
+ # parsed as signed, while they are unsigned in glibcelf. So
+ # far, this only affects some flag constants, so special-case
+ # them here.
+ if (glibcelf_value != elf_h_value
+ and not (isinstance(glibcelf_constants[name], enum.IntFlag)
+ and glibcelf_value == 1 << 31
+ and elf_h_value == -(1 << 31))):
+ error('{}: glibcelf has {!r}, <elf.h> has {!r}'.format(
+ name, glibcelf_value, elf_h_value))
+
+def main():
+ """The main entry point."""
+ parser = argparse.ArgumentParser(
+ description="Check glibcelf.py and elf.h against each other.")
+ parser.add_argument('--cc', metavar='CC',
+ help='C compiler (including options) to use')
+ args = parser.parse_args()
+
+ check_duplicates()
+ check_constant_prefixes()
+ check_constant_values(cc=args.cc)
+
+ if errors_encountered > 0:
+ print("note: errors encountered:", errors_encountered)
+ sys.exit(1)
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/usr/bin/python3
+# Verify that certain symbols are covered by RELRO.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Analyze a (shared) object to verify that certain symbols are
+present and covered by the PT_GNU_RELRO segment.
+
+"""
+
+import argparse
+import os.path
+import sys
+
+# Make available glibc Python modules.
+sys.path.append(os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts'))
+
+import glibcelf
+
+def find_relro(path: str, img: glibcelf.Image) -> (int, int):
+ """Discover the address range of the PT_GNU_RELRO segment."""
+ for phdr in img.phdrs():
+ if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO:
+ # The computation is not entirely accurate because
+ # _dl_protect_relro in elf/dl-reloc.c rounds both the
+ # start and the end downwards using the run-time page size.
+ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz
+ sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path))
+ sys.exit(1)
+
+def check_in_relro(kind, relro_begin, relro_end, name, start, size, error):
+ """Check if a section or symbol falls within the RELRO segment."""
+ end = start + size - 1
+ if not (relro_begin <= start < end < relro_end):
+ error(
+ '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format(
+ kind, name.decode('UTF-8'), size, start,
+ relro_begin, relro_end))
+
+def get_parser():
+ """Return an argument parser for this script."""
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('object', help='path to object file to check')
+ parser.add_argument('--required', metavar='NAME', default=(),
+ help='required symbol names', nargs='*')
+ parser.add_argument('--optional', metavar='NAME', default=(),
+ help='optional symbol names', nargs='*')
+ return parser
+
+def main(argv):
+ """The main entry point."""
+ parser = get_parser()
+ opts = parser.parse_args(argv)
+ img = glibcelf.Image.readfile(opts.object)
+
+ required_symbols = frozenset([sym.encode('UTF-8')
+ for sym in opts.required])
+ optional_symbols = frozenset([sym.encode('UTF-8')
+ for sym in opts.optional])
+ check_symbols = required_symbols | optional_symbols
+
+ # Tracks the symbols in check_symbols that have been found.
+ symbols_found = set()
+
+ # Discover the extent of the RELRO segment.
+ relro_begin, relro_end = find_relro(opts.object, img)
+ symbol_table_found = False
+
+ errors = False
+ def error(msg: str) -> None:
+ """Record an error condition and write a message to standard output."""
+ nonlocal errors
+ errors = True
+ sys.stdout.write('{}: error: {}\n'.format(opts.object, msg))
+
+ # Iterate over section headers to find the symbol table.
+ for shdr in img.shdrs():
+ if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB:
+ symbol_table_found = True
+ for sym in img.syms(shdr):
+ if sym.st_name in check_symbols:
+ symbols_found.add(sym.st_name)
+
+ # Validate symbol type, section, and size.
+ if sym.st_info.type != glibcelf.Stt.STT_OBJECT:
+ error('symbol {!r} has wrong type {}'.format(
+ sym.st_name.decode('UTF-8'), sym.st_info.type))
+ if sym.st_shndx in glibcelf.Shn:
+ error('symbol {!r} has reserved section {}'.format(
+ sym.st_name.decode('UTF-8'), sym.st_shndx))
+ continue
+ if sym.st_size == 0:
+ error('symbol {!r} has size zero'.format(
+ sym.st_name.decode('UTF-8')))
+ continue
+
+ check_in_relro('symbol', relro_begin, relro_end,
+ sym.st_name, sym.st_value, sym.st_size,
+ error)
+ continue # SHT_SYMTAB
+ if shdr.sh_name == b'.data.rel.ro' \
+ or shdr.sh_name.startswith(b'.data.rel.ro.'):
+ check_in_relro('section', relro_begin, relro_end,
+ shdr.sh_name, shdr.sh_addr, shdr.sh_size,
+ error)
+ continue
+
+ if required_symbols - symbols_found:
+ for sym in sorted(required_symbols - symbols_found):
+ error('symbol {!r} not found'.format(sym.decode('UTF-8')))
+
+ if errors:
+ sys.exit(1)
+
+ if not symbol_table_found:
+ sys.stdout.write(
+ '{}: warning: no symbol table found (stripped object)\n'.format(
+ opts.object))
+ sys.exit(77)
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
--- /dev/null
+/* Base for a test program with an impossibly large PT_TLS segment.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* The actual test binary is patched using scripts/tst-elf-edit.py
+ --maximize-tls-size, which introduces the expected test
+ allocation failure due to an excessive PT_TLS p_memsz value.
+
+ Patching the binary is required because on some 64-bit targets, TLS
+ relocations can only cover a 32-bit range, and glibc-internal TLS
+ variables such as errno end up outside that range. */
+
+int
+main (void)
+{
+ return 0;
+}
--- /dev/null
+ld 1
+libc 1
+libtracemod2.so 0
+libtracemod3.so 0
--- /dev/null
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 0
+libtracemod4.so 0
+libtracemod5.so 0
--- /dev/null
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 0
+libtracemod5.so 0
--- /dev/null
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 1
+libtracemod5.so 0
--- /dev/null
+ld 1
+libc 1
+libtracemod2.so 1
+libtracemod3.so 1
+libtracemod4.so 1
+libtracemod5.so 1
# define isspace(__c) __isspace_l ((__c), _nl_C_locobj_ptr)
# define asprintf __asprintf
# define opendir __opendir
-# define readdir __readdir
+# define readdir64 __readdir64
# define closedir __closedir
# define mempcpy __mempcpy
-# define lstat64 __lstat64
+# define struct_stat64 struct __stat64_t64
+# define lstat64 __lstat64_time64
# define feof_unlocked __feof_unlocked
+#else
+# define struct_stat64 struct stat64
#endif
/* Name of the file containing the module information in the directories
DIR *confdir = opendir (buf);
if (confdir != NULL)
{
- struct dirent *ent;
- while ((ent = readdir (confdir)) != NULL)
+ struct dirent64 *ent;
+ while ((ent = readdir64 (confdir)) != NULL)
{
if (ent->d_type != DT_REG && ent->d_type != DT_UNKNOWN)
continue;
&& strcmp (ent->d_name + len - strlen (suffix), suffix) == 0)
{
char *conf;
- struct stat64 st;
+ struct_stat64 st;
if (asprintf (&conf, "%s/%s", buf, ent->d_name) < 0)
continue;
int __ns_name_unpack (const unsigned char *, const unsigned char *,
const unsigned char *, unsigned char *, size_t) __THROW;
+/* Like ns_samename, but for uncompressed binary names. Returns true
+ if the two arguments compare equal as case-insensitive domain
+ names. */
+_Bool __ns_samebinaryname (const unsigned char *, const unsigned char *)
+ attribute_hidden;
+
#define ns_msg_getflag(handle, flag) \
(((handle)._flags & _ns_flagdata[flag].mask) >> _ns_flagdata[flag].shift)
extern __typeof (ns_samename) __libc_ns_samename;
libc_hidden_proto (__libc_ns_samename)
+/* Packet parser helper functions. */
+
+/* Verify that P points to an uncompressed domain name in wire format.
+ On success, return the length of the encoded name, including the
+ terminating null byte. On failure, return -1 and set errno. EOM
+ must point one past the last byte in the packet. */
+int __ns_name_length_uncompressed (const unsigned char *p,
+ const unsigned char *eom) attribute_hidden;
+
+/* Iterator over the resource records in a DNS packet. */
+struct ns_rr_cursor
+{
+ /* These members are not changed after initialization. */
+ const unsigned char *begin; /* First byte of packet. */
+ const unsigned char *end; /* One past the last byte of the packet. */
+ const unsigned char *first_rr; /* First resource record (or packet end). */
+
+ /* Advanced towards the end while reading the packet. */
+ const unsigned char *current;
+};
+
+/* Returns the RCODE field from the DNS header. */
+static inline int
+ns_rr_cursor_rcode (const struct ns_rr_cursor *c)
+{
+ return c->begin[3] & 0x0f; /* Lower 4 bits at offset 3. */
+}
+
+/* Returns the number of resource records in the answer section
+ according to the DNS header. */
+static inline int
+ns_rr_cursor_ancount (const struct ns_rr_cursor *c)
+{
+ return c->begin[6] * 256 + c->begin[7]; /* 16 bits at offset 6. */
+}
+
+/* Returns the number of resource records in the authority (name
+ server) section according to the DNS header. */
+static inline int
+ns_rr_cursor_nscount (const struct ns_rr_cursor *c)
+{
+ return c->begin[8] * 256 + c->begin[9]; /* 16 bits at offset 8. */
+}
+
+/* Returns the number of resource records in the additional data
+ section according to the DNS header. */
+static inline int
+ns_rr_cursor_adcount (const struct ns_rr_cursor *c)
+{
+ return c->begin[10] * 256 + c->begin[11]; /* 16 bits at offset 10. */
+}
+
+/* Returns a pointer to the uncompressed question name in wire
+ format. */
+static inline const unsigned char *
+ns_rr_cursor_qname (const struct ns_rr_cursor *c)
+{
+ return c->begin + 12; /* QNAME starts right after the header. */
+}
+
+/* Returns the question type of the first and only question. */
+static inline int
+ns_rr_cursor_qtype (const struct ns_rr_cursor *c)
+{
+ /* 16 bits 4 bytes back from the first RR header start. */
+ return c->first_rr[-4] * 256 + c->first_rr[-3];
+}
+
+/* Returns the class of the first and only question (usually C_IN). */
+static inline int
+ns_rr_cursor_qclass (const struct ns_rr_cursor *c)
+{
+ /* 16 bits 2 bytes back from the first RR header start. */
+ return c->first_rr[-2] * 256 + c->first_rr[-1];
+}
+
+/* Initializes *C to cover the packet [BUF, BUF+LEN). Returns false
+ if LEN is shorter than the DNS header, if the packet does not
+ contain a full (uncompressed) question, or if the question count is
+ not 1. */
+_Bool __ns_rr_cursor_init (struct ns_rr_cursor *c,
+ const unsigned char *buf, size_t len)
+ attribute_hidden;
+
+/* Like ns_rr, but the record owner name is not decoded into text format. */
+struct ns_rr_wire
+{
+ unsigned char rname[NS_MAXCDNAME]; /* Owner name of the record. */
+ uint16_t rtype; /* Resource record type (T_*). */
+ uint16_t rclass; /* Resource record class (C_*). */
+ uint32_t ttl; /* Time-to-live field. */
+ const unsigned char *rdata; /* Start of resource record data. */
+ uint16_t rdlength; /* Length of the data at rdata, in bytes. */
+};
+
+/* Attempts to parse the record at C into *RR. On success, returns
+ true; C is advanced past the record and RR->rdata points to the
+ record data. On failure, errno is set to EMSGSIZE and false is
+ returned. */
+_Bool __ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr)
+ attribute_hidden;
+
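Taken together, these declarations form a small forward-only DNS packet parser. The following sketch shows the intended call sequence; it is illustrative only (the functions are attribute_hidden, so it can only be built inside the glibc tree), and buf/len stand for a hypothetical response packet:

  struct ns_rr_cursor c;
  struct ns_rr_wire rr;

  if (!__ns_rr_cursor_init (&c, buf, len))
    return false;                       /* Header or question malformed.  */
  if (ns_rr_cursor_rcode (&c) != 0)
    return false;                       /* Server reported an error.  */

  /* Walk the answer section; the authority and additional sections
     follow in the same way.  */
  for (int i = 0; i < ns_rr_cursor_ancount (&c); ++i)
    {
      if (!__ns_rr_cursor_next (&c, &rr))
        return false;                   /* Truncated or malformed record.  */
      if (rr.rtype == ns_rr_cursor_qtype (&c)
          && __ns_samebinaryname (rr.rname, ns_rr_cursor_qname (&c)))
        {
          /* rr.rdata and rr.rdlength describe the record data.  */
        }
    }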
# endif /* !_ISOMAC */
#endif
--- /dev/null
+#include <libio/bits/stdio2-decl.h>
--- /dev/null
+#include <wcsmbs/bits/wchar2-decl.h>
#include <hp-timing.h>
-/* Initialize the `__libc_enable_secure' flag. */
-extern void __libc_init_secure (void);
-
/* Discover the tick frequency of the machine if something goes wrong,
we return 0, an impossible hertz. */
extern int __profile_frequency (void);
void (*parent_handler) (void);
void (*child_handler) (void);
void *dso_handle;
+ uint64_t id;
};
/* Function to call to unregister fork handlers. */
atfork_run_parent
};
-/* Run the atfork handlers and lock/unlock the internal lock depending
- of the WHO argument:
-
- - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of
- insertion and locks the internal lock.
- - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal
- lock.
- - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal
- lock.
-
- Perform locking only if DO_LOCKING. */
-extern void __run_fork_handlers (enum __run_fork_handler_type who,
- _Bool do_locking) attribute_hidden;
+/* Run the atfork prepare handlers in the reverse order of registration and
+ return the ID of the last registered handler. If DO_LOCKING is true, the
+ internal lock is held upon return. */
+extern uint64_t __run_prefork_handlers (_Bool do_locking) attribute_hidden;
+
+/* Given a handler type (parent or child), run all the atfork handlers in
+ the order of registration up to and including the handler with id equal
+ to LASTRUN. If DO_LOCKING is true, the internal lock is unlocked prior
+ to return. */
+extern void __run_postfork_handlers (enum __run_fork_handler_type who,
+ _Bool do_locking,
+ uint64_t lastrun) attribute_hidden;
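The comments above describe glibc's internal handler walk; the ID returned by __run_prefork_handlers lets __run_postfork_handlers skip handlers registered after the prepare phase of a fork already in progress. The ordering being preserved is the usual pthread_atfork contract, illustrated by this small standalone program (compile with -pthread; it uses only public interfaces, not the internal functions declared here):

#include <pthread.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

/* Prepare handlers run in reverse order of registration before fork;
   parent and child handlers run in registration order afterwards.  */
static void prepare (void) { puts ("prepare"); }
static void parent (void) { puts ("parent"); }
static void child (void) { puts ("child"); }

int
main (void)
{
  pthread_atfork (prepare, parent, child);
  pid_t pid = fork ();          /* Prints "prepare", then "parent"/"child".  */
  if (pid == 0)
    _exit (0);
  waitpid (pid, NULL, 0);
  return 0;
}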
/* C library side function to register new fork handlers. */
extern int __register_atfork (void (*__prepare) (void),
extern __typeof (__res_queriesmatch) __libc_res_queriesmatch;
libc_hidden_proto (__libc_res_queriesmatch)
+/* Variant of res_hnok which operates on binary (but uncompressed) names. */
+bool __res_binary_hnok (const unsigned char *dn) attribute_hidden;
+
# endif /* _RESOLV_H_ && !_ISOMAC */
#endif
and some functions contained in the C library ignore various
environment variables that normally affect them. */
extern int __libc_enable_secure attribute_relro;
-extern int __libc_enable_secure_decided;
rtld_hidden_proto (__libc_enable_secure)
char *hdir, *buf, *tmp;
char myname[1024], *mydomain;
int t, usedefault = 0;
- struct stat64 stb;
+ struct __stat64_t64 stb;
hdir = __libc_secure_getenv("HOME");
if (hdir == NULL) {
break;
case PASSWD:
if (strcmp(*aname, "anonymous") &&
- __fstat64(fileno(cfile), &stb) >= 0 &&
+ __fstat64_time64(fileno(cfile), &stb) >= 0 &&
(stb.st_mode & 077) != 0) {
warnx(_("Error: .netrc file is readable by others."));
warnx(_("Remove 'password' line or make file unreadable by others."));
tst-ftw-bz28126
tests-time64 := \
+ tst-fcntl-time64 \
+ tst-fts-time64 \
tst-futimens-time64 \
tst-futimes-time64\
- tst-fts-time64 \
+ tst-futimesat-time64 \
+ tst-lchmod-time64 \
tst-lutimes-time64 \
tst-stat-time64 \
- tst-futimesat-time64 \
tst-utime-time64 \
tst-utimensat-time64 \
tst-utimes-time64 \
- tst-fcntl-time64 \
# tests-time64
# Likewise for statx, but we do not need static linking here.
CFLAGS-test-stat.c += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
CFLAGS-test-lfs.c += -D_LARGEFILE64_SOURCE
+CFLAGS-tst-lchmod.c += -D_FILE_OFFSET_BITS=64
test-stat2-ARGS = Makefile . $(objpfx)test-stat2
--- /dev/null
+#define CHECK_TIME64
+#include "tst-lchmod.c"
return full_path;
}
+static void
+update_file_time_to_y2038 (const char *fname, int flags)
+{
+#ifdef CHECK_TIME64
+ /* Y2038 threshold plus 1 second. */
+ const struct timespec ts[] = { { 0x80000001LL, 0}, { 0x80000001LL } };
+ TEST_VERIFY_EXIT (utimensat (AT_FDCWD, fname, ts, flags) == 0);
+#endif
+}
+
static void
test_1 (bool do_relative_path, int (*chmod_func) (int fd, const char *, mode_t, int))
{
char *tempdir = support_create_temp_directory ("tst-lchmod-");
+#ifdef CHECK_TIME64
+ if (!support_path_support_time64 (tempdir))
+ {
+ puts ("info: test skipped, filesystem does not support 64 bit time_t");
+ return;
+ }
+#endif
char *path_dangling = xasprintf ("%s/dangling", tempdir);
char *path_file = xasprintf ("%s/file", tempdir);
xsymlink ("loop", path_loop);
xsymlink ("target-does-not-exist", path_dangling);
+ update_file_time_to_y2038 (path_file, 0);
+ update_file_time_to_y2038 (path_to_file, AT_SYMLINK_NOFOLLOW);
+
/* Check that the modes do not collide with what we will use in the
test. */
- struct stat64 st;
+ struct stat st;
xstat (path_file, &st);
TEST_VERIFY ((st.st_mode & 0777) != 1);
xlstat (path_to_file, &st);
TEST_VERIFY_EXIT (fd >= 0);
support_write_file_string (path, "abc");
+ /* This should help to prevent delayed allocation, which may result
+ in a spurious stx_blocks/st_blocks difference. */
+ fsync (fd);
+
bool check_ns = support_stat_nanoseconds (path);
if (!check_ns)
printf ("warning: timestamp with nanoseconds not supported\n");
include ../Makeconfig
headers := stdio.h \
- bits/stdio.h bits/stdio2.h bits/stdio-ldbl.h \
+ bits/stdio.h bits/stdio2.h bits/stdio2-decl.h bits/stdio-ldbl.h \
bits/types/FILE.h bits/types/__FILE.h bits/types/struct_FILE.h \
bits/types/__fpos_t.h bits/types/__fpos64_t.h \
bits/types/cookie_io_functions_t.h
--- /dev/null
+/* Checking macros for stdio functions. Declarations only.
+ Copyright (C) 2004-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _BITS_STDIO2_DEC_H
+#define _BITS_STDIO2_DEC_H 1
+
+#ifndef _STDIO_H
+# error "Never include <bits/stdio2-decl.h> directly; use <stdio.h> instead."
+#endif
+
+extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen,
+ const char *__restrict __format, ...) __THROW
+ __attr_access ((__write_only__, 1, 3));
+extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen,
+ const char *__restrict __format,
+ __gnuc_va_list __ap) __THROW
+ __attr_access ((__write_only__, 1, 3));
+
+#if defined __USE_ISOC99 || defined __USE_UNIX98
+
+extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag,
+ size_t __slen, const char *__restrict __format,
+ ...) __THROW
+ __attr_access ((__write_only__, 1, 2));
+extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag,
+ size_t __slen, const char *__restrict __format,
+ __gnuc_va_list __ap) __THROW
+ __attr_access ((__write_only__, 1, 2));
+
+#endif
+
+#if __USE_FORTIFY_LEVEL > 1
+
+extern int __fprintf_chk (FILE *__restrict __stream, int __flag,
+ const char *__restrict __format, ...);
+extern int __printf_chk (int __flag, const char *__restrict __format, ...);
+extern int __vfprintf_chk (FILE *__restrict __stream, int __flag,
+ const char *__restrict __format, __gnuc_va_list __ap);
+extern int __vprintf_chk (int __flag, const char *__restrict __format,
+ __gnuc_va_list __ap);
+
+# ifdef __USE_XOPEN2K8
+extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt,
+ ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+extern int __vdprintf_chk (int __fd, int __flag,
+ const char *__restrict __fmt, __gnuc_va_list __arg)
+ __attribute__ ((__format__ (__printf__, 3, 0)));
+# endif
+
+# ifdef __USE_GNU
+
+extern int __asprintf_chk (char **__restrict __ptr, int __flag,
+ const char *__restrict __fmt, ...)
+ __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur;
+extern int __vasprintf_chk (char **__restrict __ptr, int __flag,
+ const char *__restrict __fmt, __gnuc_va_list __arg)
+ __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur;
+extern int __obstack_printf_chk (struct obstack *__restrict __obstack,
+ int __flag, const char *__restrict __format,
+ ...)
+ __THROW __attribute__ ((__format__ (__printf__, 3, 4)));
+extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack,
+ int __flag,
+ const char *__restrict __format,
+ __gnuc_va_list __args)
+ __THROW __attribute__ ((__format__ (__printf__, 3, 0)));
+
+# endif
+#endif
+
+#if __GLIBC_USE (DEPRECATED_GETS)
+extern char *__gets_chk (char *__str, size_t) __wur;
+#endif
+
+extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n,
+ FILE *__restrict __stream)
+ __wur __attr_access ((__write_only__, 1, 3));
+
+extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen,
+ size_t __size, size_t __n,
+ FILE *__restrict __stream) __wur;
+
+#ifdef __USE_GNU
+extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size,
+ int __n, FILE *__restrict __stream)
+ __wur __attr_access ((__write_only__, 1, 3));
+#endif
+
+#ifdef __USE_MISC
+# undef fread_unlocked
+extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen,
+ size_t __size, size_t __n,
+ FILE *__restrict __stream) __wur;
+#endif
+
+#endif /* bits/stdio2-decl.h. */
# error "Never include <bits/stdio2.h> directly; use <stdio.h> instead."
#endif
-extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen,
- const char *__restrict __format, ...) __THROW
- __attr_access ((__write_only__, 1, 3));
-extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen,
- const char *__restrict __format,
- __gnuc_va_list __ap) __THROW
- __attr_access ((__write_only__, 1, 3));
-
#ifdef __va_arg_pack
__fortify_function int
__NTH (sprintf (char *__restrict __s, const char *__restrict __fmt, ...))
}
#if defined __USE_ISOC99 || defined __USE_UNIX98
-
-extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag,
- size_t __slen, const char *__restrict __format,
- ...) __THROW
- __attr_access ((__write_only__, 1, 2));
-extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag,
- size_t __slen, const char *__restrict __format,
- __gnuc_va_list __ap) __THROW;
-
# ifdef __va_arg_pack
__fortify_function int
__NTH (snprintf (char *__restrict __s, size_t __n,
#endif
#if __USE_FORTIFY_LEVEL > 1
-
-extern int __fprintf_chk (FILE *__restrict __stream, int __flag,
- const char *__restrict __format, ...);
-extern int __printf_chk (int __flag, const char *__restrict __format, ...);
-extern int __vfprintf_chk (FILE *__restrict __stream, int __flag,
- const char *__restrict __format, __gnuc_va_list __ap);
-extern int __vprintf_chk (int __flag, const char *__restrict __format,
- __gnuc_va_list __ap);
-
# ifdef __va_arg_pack
__fortify_function int
fprintf (FILE *__restrict __stream, const char *__restrict __fmt, ...)
}
# ifdef __USE_XOPEN2K8
-extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt,
- ...) __attribute__ ((__format__ (__printf__, 3, 4)));
-extern int __vdprintf_chk (int __fd, int __flag,
- const char *__restrict __fmt, __gnuc_va_list __arg)
- __attribute__ ((__format__ (__printf__, 3, 0)));
-
# ifdef __va_arg_pack
__fortify_function int
dprintf (int __fd, const char *__restrict __fmt, ...)
# endif
# ifdef __USE_GNU
-
-extern int __asprintf_chk (char **__restrict __ptr, int __flag,
- const char *__restrict __fmt, ...)
- __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur;
-extern int __vasprintf_chk (char **__restrict __ptr, int __flag,
- const char *__restrict __fmt, __gnuc_va_list __arg)
- __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur;
-extern int __obstack_printf_chk (struct obstack *__restrict __obstack,
- int __flag, const char *__restrict __format,
- ...)
- __THROW __attribute__ ((__format__ (__printf__, 3, 4)));
-extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack,
- int __flag,
- const char *__restrict __format,
- __gnuc_va_list __args)
- __THROW __attribute__ ((__format__ (__printf__, 3, 0)));
-
# ifdef __va_arg_pack
__fortify_function int
__NTH (asprintf (char **__restrict __ptr, const char *__restrict __fmt, ...))
#endif
#if __GLIBC_USE (DEPRECATED_GETS)
-extern char *__gets_chk (char *__str, size_t) __wur;
extern char *__REDIRECT (__gets_warn, (char *__str), gets)
__wur __warnattr ("please use fgets or getline instead, gets can't "
"specify buffer size");
}
#endif
-extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n,
- FILE *__restrict __stream)
- __wur __attr_access ((__write_only__, 1, 3));
extern char *__REDIRECT (__fgets_alias,
(char *__restrict __s, int __n,
FILE *__restrict __stream), fgets)
return __fgets_chk (__s, sz, __n, __stream);
}
-extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen,
- size_t __size, size_t __n,
- FILE *__restrict __stream) __wur;
extern size_t __REDIRECT (__fread_alias,
(void *__restrict __ptr, size_t __size,
size_t __n, FILE *__restrict __stream),
}
#ifdef __USE_GNU
-extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size,
- int __n, FILE *__restrict __stream)
- __wur __attr_access ((__write_only__, 1, 3));
extern char *__REDIRECT (__fgets_unlocked_alias,
(char *__restrict __s, int __n,
FILE *__restrict __stream), fgets_unlocked)
#ifdef __USE_MISC
# undef fread_unlocked
-extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen,
- size_t __size, size_t __n,
- FILE *__restrict __stream) __wur;
extern size_t __REDIRECT (__fread_unlocked_alias,
(void *__restrict __ptr, size_t __size,
size_t __n, FILE *__restrict __stream),
extern int __uflow (FILE *);
extern int __overflow (FILE *, int);
+#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Declare all functions from bits/stdio2-decl.h first. */
+# include <bits/stdio2-decl.h>
+#endif
+
+/* The following headers provide asm redirections. These redirections must
+ appear before the first usage of these functions, e.g. in bits/stdio.h. */
+#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
+# include <bits/stdio-ldbl.h>
+#endif
+
/* If we are compiling with optimizing read this file. It contains
several optimizing inline functions and macros. */
#ifdef __USE_EXTERN_INLINES
# include <bits/stdio.h>
#endif
#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Now include the function definitions and redirects too. */
# include <bits/stdio2.h>
#endif
-#include <bits/floatn.h>
-#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
-# include <bits/stdio-ldbl.h>
-#endif
-
__END_DECLS
#endif /* <stdio.h> included. */
}
}
+ /* Generally speaking there are 3 standards that define the default,
+ warning, and error behaviour of LC_MONETARY. They are ISO/IEC TR 30112,
+ ISO/IEC 9899:2018 (ISO C17), and POSIX.1-2017. Within 30112 we have the
+ definition of a standard i18n FDCC-set, which for LC_MONETARY has the
+ following default values:
+ int_curr_symbol ""
+ currency_symbol ""
+ mon_decimal_point "<U002C>" i.e. ","
+ mon_thousands_sep ""
+ mon_grouping "\177" i.e. CHAR_MAX
+ positive_sign ""
+ negative_sign "<U002E>" i.e. "."
+ int_frac_digits -1
+ frac_digits -1
+ p_cs_precedes -1
+ p_sep_by_space -1
+ n_cs_precedes -1
+ n_sep_by_space -1
+ p_sign_posn -1
+ n_sign_posn -1
+ Under 30112 a keyword that is not provided implies an empty string ""
+ for string values or a -1 for integer values, and indicates the value
+ is unspecified with no default implied. No errors are considered.
+ The exception is mon_grouping which is a string with a terminating
+ CHAR_MAX.
+ For POSIX Issue 7 we have:
+ https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html
+ and again values not provided default to "" or -1, and indicate the value
+ is not available to the locale. The exception is mon_grouping which is
+ a string with a terminating CHAR_MAX. For the POSIX locale the values of
+ LC_MONETARY should be:
+ int_curr_symbol ""
+ currency_symbol ""
+ mon_decimal_point ""
+ mon_thousands_sep ""
+ mon_grouping "\177" i.e. CHAR_MAX
+ positive_sign ""
+ negative_sign ""
+ int_frac_digits -1
+ frac_digits -1
+ p_cs_precedes -1
+ p_sep_by_space -1
+ n_cs_precedes -1
+ n_sep_by_space -1
+ p_sign_posn -1
+ n_sign_posn -1
+ int_p_cs_precedes -1
+ int_p_sep_by_space -1
+ int_n_cs_precedes -1
+ int_n_sep_by_space -1
+ int_p_sign_posn -1
+ int_n_sign_posn -1
+ Like 30112, POSIX considers it no error if keywords are missing;
+ only if the category as a whole is missing does referencing the
+ category result in unspecified behaviour.
+ For ISO C17 there is no default value provided, but the localeconv
+ specification in 7.11.2.1 admits that members of char * type may point
+ to "" to indicate a value is not available or is of length zero.
+ The exception is decimal_point (not mon_decimal_point) which must be a
+ defined non-empty string. The values of char, which are generally
+ mapped to integer values in 30112 and POSIX, must be non-negative
+ numbers that map to CHAR_MAX when a value is not available in the
+ locale.
+ In ISO C17 for the "C" locale all values are empty strings "", or
+ CHAR_MAX, with the exception of decimal_point which is "." (defined
+ in LC_NUMERIC). ISO C17 makes no exception for mon_grouping like
+ 30112 and POSIX, but a value of "" is functionally equivalent to
+ "\177" since neither defines a grouping (though the latter terminates
+ the grouping).
+
+ Lastly, we must consider the legacy C/POSIX locale that is implemented
+ as a builtin in glibc and whether a default value mapping to the
+ C/POSIX locale may benefit the user from a compatibility perspective.
+
+ Thus given 30112, POSIX, ISO C, and the builtin C/POSIX locale we
+ need to pick appropriate defaults below. */
+
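As a concrete reference point for the ISO C defaults discussed above, a small standalone program (illustrative only, not part of this file) shows what the "C" locale reports through localeconv:

#include <limits.h>
#include <locale.h>
#include <stdio.h>

int
main (void)
{
  setlocale (LC_ALL, "C");
  struct lconv *lc = localeconv ();
  /* In the "C" locale the monetary strings are empty and the char
     members are CHAR_MAX ("not available").  */
  printf ("currency_symbol=\"%s\"\n", lc->currency_symbol);
  printf ("mon_decimal_point=\"%s\"\n", lc->mon_decimal_point);
  printf ("mon_grouping empty: %d\n", lc->mon_grouping[0] == '\0');
  printf ("frac_digits=%d (CHAR_MAX=%d)\n", lc->frac_digits, CHAR_MAX);
  return 0;
}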
+ /* The members of LC_MONETARY are handled in the order of their definition
+ in locale/categories.def. Please keep them in that order. */
+
+ /* The purpose of TEST_ELEM is to define a default value for a field
+ in the category if that field was not defined. If the
+ category was present but we didn't see a definition for the field then
+ we also issue a warning, otherwise the only warning you get is the one
+ earlier when a default category is created (completely missing category).
+ This missing field warning is glibc-specific since no standard requires
+ this warning, but we consider it valuable to print a warning for all
+ missing fields in the category. */
#define TEST_ELEM(cat, initval) \
if (monetary->cat == NULL) \
{ \
if (! nothing) \
- record_error (0, 0, _("%s: field `%s' not defined"), \
- "LC_MONETARY", #cat); \
+ record_warning (_("%s: field `%s' not defined"), \
+ "LC_MONETARY", #cat); \
monetary->cat = initval; \
}
+ /* Keyword: int_curr_symbol. */
TEST_ELEM (int_curr_symbol, "");
- TEST_ELEM (currency_symbol, "");
- TEST_ELEM (mon_thousands_sep, "");
- TEST_ELEM (positive_sign, "");
- TEST_ELEM (negative_sign, "");
-
/* The international currency symbol must come from ISO 4217. */
if (monetary->int_curr_symbol != NULL)
{
}
}
- /* The decimal point must not be empty. This is not said explicitly
- in POSIX but ANSI C (ISO/IEC 9899) says in 4.4.2.1 it has to be
- != "". */
+ /* Keyword: currency_symbol */
+ TEST_ELEM (currency_symbol, "");
+
+ /* Keyword: mon_decimal_point */
+ /* ISO C17 7.11.2.1.3 explicitly allows mon_decimal_point to be the
+ empty string e.g. "". This indicates the value is not available in the
+ current locale or is of zero length. However, if the value was never
+ defined then we issue a warning and use a glibc-specific default. ISO
+ 30112 in the i18n FDCC-Set uses <U002C> ",", and POSIX Issue 7 in the
+ POSIX locale uses "". It is specific to glibc that the default is <U002E>
+ "."; we retain this existing behaviour for backwards compatibility. */
if (monetary->mon_decimal_point == NULL)
{
if (! nothing)
- record_error (0, 0, _("%s: field `%s' not defined"),
- "LC_MONETARY", "mon_decimal_point");
+ record_warning (_("%s: field `%s' not defined, using defaults"),
+ "LC_MONETARY", "mon_decimal_point");
monetary->mon_decimal_point = ".";
monetary->mon_decimal_point_wc = L'.';
}
- else if (monetary->mon_decimal_point[0] == '\0' && ! be_quiet && ! nothing)
+
+ /* Keyword: mon_thousands_sep */
+ if (monetary->mon_thousands_sep == NULL)
{
- record_error (0, 0, _("\
-%s: value for field `%s' must not be an empty string"),
- "LC_MONETARY", "mon_decimal_point");
+ if (! nothing)
+ record_warning (_("%s: field `%s' not defined, using defaults"),
+ "LC_MONETARY", "mon_thousands_sep");
+ monetary->mon_thousands_sep = "";
+ monetary->mon_thousands_sep_wc = L'\0';
}
+ /* Keyword: mon_grouping */
if (monetary->mon_grouping_len == 0)
{
if (! nothing)
- record_error (0, 0, _("%s: field `%s' not defined"),
- "LC_MONETARY", "mon_grouping");
-
+ record_warning (_("%s: field `%s' not defined"),
+ "LC_MONETARY", "mon_grouping");
+ /* Missing entries are given 1 element in their bytearray with
+ a value of CHAR_MAX which indicates that "No further grouping
+ is to be performed" (functionally equivalent to ISO C's "C"
+ locale default of ""). */
monetary->mon_grouping = (char *) "\177";
monetary->mon_grouping_len = 1;
}
+ /* Keyword: positive_sign */
+ TEST_ELEM (positive_sign, "");
+
+ /* Keyword: negative_sign */
+ TEST_ELEM (negative_sign, "");
+
#undef TEST_ELEM
#define TEST_ELEM(cat, min, max, initval) \
if (monetary->cat == -2) \
{ \
if (! nothing) \
- record_error (0, 0, _("%s: field `%s' not defined"), \
- "LC_MONETARY", #cat); \
+ record_warning (_("%s: field `%s' not defined"), \
+ "LC_MONETARY", #cat); \
monetary->cat = initval; \
} \
else if ((monetary->cat < min || monetary->cat > max) \
TEST_ELEM (p_sign_posn, -1, 4, -1);
TEST_ELEM (n_sign_posn, -1, 4, -1);
- /* The non-POSIX.2 extensions are optional. */
- if (monetary->duo_int_curr_symbol == NULL)
- monetary->duo_int_curr_symbol = monetary->int_curr_symbol;
- if (monetary->duo_currency_symbol == NULL)
- monetary->duo_currency_symbol = monetary->currency_symbol;
-
- if (monetary->duo_int_frac_digits == -2)
- monetary->duo_int_frac_digits = monetary->int_frac_digits;
- if (monetary->duo_frac_digits == -2)
- monetary->duo_frac_digits = monetary->frac_digits;
+ /* Keyword: crncystr */
+ monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol)
+ + 2);
+ monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+';
+ strcpy (&monetary->crncystr[1], monetary->currency_symbol);
#undef TEST_ELEM
#define TEST_ELEM(cat, alt, min, max) \
TEST_ELEM (int_p_sign_posn, p_sign_posn, -1, 4);
TEST_ELEM (int_n_sign_posn, n_sign_posn, -1, 4);
+ /* The non-POSIX.2 extensions are optional. */
+ if (monetary->duo_int_curr_symbol == NULL)
+ monetary->duo_int_curr_symbol = monetary->int_curr_symbol;
+ if (monetary->duo_currency_symbol == NULL)
+ monetary->duo_currency_symbol = monetary->currency_symbol;
+
+ if (monetary->duo_int_frac_digits == -2)
+ monetary->duo_int_frac_digits = monetary->int_frac_digits;
+ if (monetary->duo_frac_digits == -2)
+ monetary->duo_frac_digits = monetary->frac_digits;
+
TEST_ELEM (duo_p_cs_precedes, p_cs_precedes, -1, 1);
TEST_ELEM (duo_p_sep_by_space, p_sep_by_space, -1, 2);
TEST_ELEM (duo_n_cs_precedes, n_cs_precedes, -1, 1);
if (monetary->duo_valid_to == 0)
monetary->duo_valid_to = 99991231;
+ /* Keyword: conversion_rate */
if (monetary->conversion_rate[0] == 0)
{
monetary->conversion_rate[0] = 1;
monetary->conversion_rate[1] = 1;
}
- /* Create the crncystr entry. */
- monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol)
- + 2);
- monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+';
- strcpy (&monetary->crncystr[1], monetary->currency_symbol);
+ /* A value for monetary-decimal-point-wc was set when
+ monetary_decimal_point was set, likewise for monetary-thousands-sep-wc. */
}
{
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
- if (d_type == DT_UNKNOWN)
+ if (d_type == DT_UNKNOWN || d_type == DT_LNK)
{
strcpy (stpcpy (stpcpy (fullname, fname), "/"),
d->d_name);
endef
$(INSTALL-SUPPORTED-LOCALE-ARCHIVE): install-locales-dir
- @flags="-c"; \
+ @flags=""; \
$(build-one-locale)
$(INSTALL-SUPPORTED-LOCALE-FILES): install-locales-dir
- @flags="-c --no-archive --no-hard-links"; \
+ @flags="--no-archive --no-hard-links"; \
$(build-one-locale)
tst-setlocale-ENV = LC_ALL=ja_JP.EUC-JP
echo "Generating locale $locale.$charmap: this might take a while..."
-# Run quietly and force output.
-flags="--quiet -c"
+# Do not force output with '-c'; all locales should compile without
+# warnings or errors. There is likewise no need to run quietly with
+# '--quiet', since all locales should compile without additional
+# diagnostics. If any messages are printed, we want to see them and
+# fix the associated error or warning. During development it may be
+# beneficial to put '--quiet -c' here to allow you to work on
+# in-progress locales.
+flags=""
# For SJIS the charmap is SHIFT_JIS. We just want the locale to have
# a slightly nicer name instead of using "*.SHIFT_SJIS", but that
# define __assert_fail(assertion, file, line, function) \
__malloc_assert(assertion, file, line, function)
-extern const char *__progname;
-
-static void
+_Noreturn static void
__malloc_assert (const char *assertion, const char *file, unsigned int line,
const char *function)
{
- (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n",
- __progname, __progname[0] ? ": " : "",
- file, line,
- function ? function : "", function ? ": " : "",
- assertion);
- fflush (stderr);
- abort ();
+ __libc_message (do_abort, "\
+Fatal glibc error: malloc assertion failure in %s: %s\n",
+ function, assertion);
+ __builtin_unreachable ();
}
#endif
#endif
(void)__chdir("/");
if (!noclose) {
- struct stat64 st;
+ struct __stat64_t64 st;
if ((fd = __open_nocancel(_PATH_DEVNULL, O_RDWR, 0)) != -1
- && (__builtin_expect (__fstat64 (fd, &st), 0)
- == 0)) {
+ && __glibc_likely (__fstat64_time64 (fd, &st) == 0)) {
if (__builtin_expect (S_ISCHR (st.st_mode), 1) != 0
#if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
&& (st.st_rdev
{
char **sp, *cp;
FILE *fp;
- struct stat64 statb;
+ struct __stat64_t64 statb;
size_t flen;
free(shells);
strings = NULL;
if ((fp = fopen(_PATH_SHELLS, "rce")) == NULL)
goto init_okshells_noclose;
- if (__fstat64(fileno(fp), &statb) == -1) {
+ if (__fstat64_time64(fileno(fp), &statb) == -1) {
init_okshells:
(void)fclose(fp);
init_okshells_noclose:
|| (__builtin_constant_p (__l) && (__l) > 0))
/* Length is known to be safe at compile time if the __L * __S <= __OBJSZ
- condition can be folded to a constant and if it is true. The -1 check is
- redundant because since it implies that __glibc_safe_len_cond is true. */
+ condition can be folded to a constant and if it is true, or the object size is unknown (-1).  */
#define __glibc_safe_or_unknown_len(__l, __s, __osz) \
- (__glibc_unsigned_or_positive (__l) \
- && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
- __s, __osz)) \
- && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz))
+ ((__builtin_constant_p (__osz) && (__osz) == (__SIZE_TYPE__) -1) \
+ || (__glibc_unsigned_or_positive (__l) \
+ && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \
+ (__s), (__osz))) \
+ && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz))))
/* Conversely, we know at compile time that the length is unsafe if the
__L * __S <= __OBJSZ condition can be folded to a constant and if it is
/* Cancellation handling is back to the default. */
result->cancelhandling = 0;
- result->cancelstate = PTHREAD_CANCEL_ENABLE;
- result->canceltype = PTHREAD_CANCEL_DEFERRED;
result->cleanup = NULL;
result->setup_failed = 0;
__pthread_enable_asynccancel (void)
{
struct pthread *self = THREAD_SELF;
+ int oldval = atomic_load_relaxed (&self->cancelhandling);
- int oldval = THREAD_GETMEM (self, canceltype);
- THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_ASYNCHRONOUS);
+ while (1)
+ {
+ int newval = oldval | CANCELTYPE_BITMASK;
- int ch = THREAD_GETMEM (self, cancelhandling);
+ if (newval == oldval)
+ break;
- if (self->cancelstate == PTHREAD_CANCEL_ENABLE
- && (ch & CANCELED_BITMASK)
- && !(ch & EXITING_BITMASK)
- && !(ch & TERMINATED_BITMASK))
- {
- THREAD_SETMEM (self, result, PTHREAD_CANCELED);
- __do_cancel ();
+ if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &oldval, newval))
+ {
+ if (cancel_enabled_and_canceled_and_async (newval))
+ {
+ self->result = PTHREAD_CANCELED;
+ __do_cancel ();
+ }
+
+ break;
+ }
}
return oldval;
{
/* If asynchronous cancellation was enabled before we do not have
anything to do. */
- if (oldtype == PTHREAD_CANCEL_ASYNCHRONOUS)
+ if (oldtype & CANCELTYPE_BITMASK)
return;
struct pthread *self = THREAD_SELF;
- self->canceltype = PTHREAD_CANCEL_DEFERRED;
+ int newval;
+ int oldval = atomic_load_relaxed (&self->cancelhandling);
+ do
+ {
+ newval = oldval & ~CANCELTYPE_BITMASK;
+ }
+ while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &oldval, newval));
+
+ /* We cannot return while we are being canceled.  Upon return the
+ thread might be doing things which would have to be undone.  The
+ following loop should run until the cancellation signal is
+ delivered.  */
+ while (__glibc_unlikely ((newval & (CANCELING_BITMASK | CANCELED_BITMASK))
+ == CANCELING_BITMASK))
+ {
+ futex_wait_simple ((unsigned int *) &self->cancelhandling, newval,
+ FUTEX_PRIVATE);
+ newval = atomic_load_relaxed (&self->cancelhandling);
+ }
}
libc_hidden_def (__pthread_disable_asynccancel)
ibuf->priv.data.prev = THREAD_GETMEM (self, cleanup_jmp_buf);
ibuf->priv.data.cleanup = THREAD_GETMEM (self, cleanup);
- /* Disable asynchronous cancellation for now. */
- ibuf->priv.data.canceltype = THREAD_GETMEM (self, canceltype);
- THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED);
+ int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+ if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK))
+ {
+ int newval;
+ do
+ {
+ newval = cancelhandling & ~CANCELTYPE_BITMASK;
+ }
+ while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &cancelhandling,
+ newval));
+ }
+
+ ibuf->priv.data.canceltype = (cancelhandling & CANCELTYPE_BITMASK
+ ? PTHREAD_CANCEL_ASYNCHRONOUS
+ : PTHREAD_CANCEL_DEFERRED);
/* Store the new cleanup handler info. */
THREAD_SETMEM (self, cleanup_jmp_buf, (struct pthread_unwind_buf *) buf);
THREAD_SETMEM (self, cleanup_jmp_buf, ibuf->priv.data.prev);
- THREAD_SETMEM (self, canceltype, ibuf->priv.data.canceltype);
- if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
- __pthread_testcancel ();
+ if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_DEFERRED)
+ return;
+
+ int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+ if ((cancelhandling & CANCELTYPE_BITMASK) == 0)
+ {
+ int newval;
+ do
+ {
+ newval = cancelhandling | CANCELTYPE_BITMASK;
+ }
+ while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &cancelhandling, newval));
+
+ if (cancel_enabled_and_canceled (cancelhandling))
+ {
+ self->result = PTHREAD_CANCELED;
+ __do_cancel ();
+ }
+ }
}
versioned_symbol (libc, ___pthread_unregister_cancel_restore,
__pthread_unregister_cancel_restore, GLIBC_2_34);
/* Flags determining processing of cancellation. */
int cancelhandling;
+ /* Bit set if cancellation is disabled. */
+#define CANCELSTATE_BIT 0
+#define CANCELSTATE_BITMASK (1 << CANCELSTATE_BIT)
+ /* Bit set if asynchronous cancellation mode is selected. */
+#define CANCELTYPE_BIT 1
+#define CANCELTYPE_BITMASK (1 << CANCELTYPE_BIT)
+ /* Bit set if canceling has been initiated. */
+#define CANCELING_BIT 2
+#define CANCELING_BITMASK (1 << CANCELING_BIT)
/* Bit set if canceled. */
#define CANCELED_BIT 3
-#define CANCELED_BITMASK (0x01 << CANCELED_BIT)
+#define CANCELED_BITMASK (1 << CANCELED_BIT)
/* Bit set if thread is exiting. */
#define EXITING_BIT 4
-#define EXITING_BITMASK (0x01 << EXITING_BIT)
+#define EXITING_BITMASK (1 << EXITING_BIT)
/* Bit set if thread terminated and TCB is freed. */
#define TERMINATED_BIT 5
-#define TERMINATED_BITMASK (0x01 << TERMINATED_BIT)
+#define TERMINATED_BITMASK (1 << TERMINATED_BIT)
/* Bit set if thread is supposed to change XID. */
#define SETXID_BIT 6
-#define SETXID_BITMASK (0x01 << SETXID_BIT)
+#define SETXID_BITMASK (1 << SETXID_BIT)
/* Flags. Including those copied from the thread attribute. */
int flags;
/* Indicates whether is a C11 thread created by thrd_creat. */
bool c11;
- /* Thread cancel state (PTHREAD_CANCEL_ENABLE or
- PTHREAD_CANCEL_DISABLE). */
- unsigned char cancelstate;
-
- /* Thread cancel type (PTHREAD_CANCEL_DEFERRED or
- PTHREAD_CANCEL_ASYNCHRONOUS). */
- unsigned char canceltype;
-
/* Used in __pthread_kill_internal to detected a thread that has
exited or is about to exit. exit_lock must only be acquired
after blocking signals. */
(sizeof (struct pthread) - offsetof (struct pthread, end_padding))
} __attribute ((aligned (TCB_ALIGNMENT)));
+static inline bool
+cancel_enabled_and_canceled (int value)
+{
+ return (value & (CANCELSTATE_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
+ | TERMINATED_BITMASK))
+ == CANCELED_BITMASK;
+}
+
+static inline bool
+cancel_enabled_and_canceled_and_async (int value)
+{
+ return ((value) & (CANCELSTATE_BITMASK | CANCELTYPE_BITMASK | CANCELED_BITMASK
+ | EXITING_BITMASK | TERMINATED_BITMASK))
+ == (CANCELTYPE_BITMASK | CANCELED_BITMASK);
+}
+
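Editorial aside, not part of the patch: the bit layout introduced above can be exercised in isolation.  The following standalone sketch mirrors the mask values and one of the predicates from this hunk and shows how a flag word is interpreted; all names are local to the example.

#include <stdbool.h>
#include <stdio.h>

/* Mask values mirroring the struct pthread definitions above.  */
#define CANCELSTATE_BITMASK (1 << 0)  /* Cancellation disabled.  */
#define CANCELTYPE_BITMASK  (1 << 1)  /* Asynchronous cancellation selected.  */
#define CANCELING_BITMASK   (1 << 2)  /* Cancellation initiated.  */
#define CANCELED_BITMASK    (1 << 3)  /* Thread canceled.  */
#define EXITING_BITMASK     (1 << 4)  /* Thread exiting.  */
#define TERMINATED_BITMASK  (1 << 5)  /* TCB freed.  */

/* Same shape as the predicate above: canceled, not disabled, and not
   already exiting or terminated.  */
static bool
cancel_enabled_and_canceled (int value)
{
  return (value & (CANCELSTATE_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
                   | TERMINATED_BITMASK)) == CANCELED_BITMASK;
}

int
main (void)
{
  /* Canceled with cancellation enabled: prints 1.  */
  printf ("%d\n", cancel_enabled_and_canceled (CANCELED_BITMASK));
  /* Canceled but cancellation disabled: prints 0.  */
  printf ("%d\n", cancel_enabled_and_canceled (CANCELED_BITMASK
                                               | CANCELSTATE_BITMASK));
  return 0;
}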
/* This yields the pointer that TLS support code calls the thread pointer. */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
buffer->__prev = THREAD_GETMEM (self, cleanup);
+ int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+
/* Disable asynchronous cancellation for now. */
- buffer->__canceltype = THREAD_GETMEM (self, canceltype);
- THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED);
+ if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK))
+ {
+ int newval;
+ do
+ {
+ newval = cancelhandling & ~CANCELTYPE_BITMASK;
+ }
+ while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &cancelhandling,
+ newval));
+ }
+
+ buffer->__canceltype = (cancelhandling & CANCELTYPE_BITMASK
+ ? PTHREAD_CANCEL_ASYNCHRONOUS
+ : PTHREAD_CANCEL_DEFERRED);
THREAD_SETMEM (self, cleanup, buffer);
}
THREAD_SETMEM (self, cleanup, buffer->__prev);
- THREAD_SETMEM (self, canceltype, buffer->__canceltype);
- if (buffer->__canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
- __pthread_testcancel ();
+ int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+ if (buffer->__canceltype != PTHREAD_CANCEL_DEFERRED
+ && (cancelhandling & CANCELTYPE_BITMASK) == 0)
+ {
+ int newval;
+ do
+ {
+ newval = cancelhandling | CANCELTYPE_BITMASK;
+ }
+ while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &cancelhandling, newval));
+
+ if (cancel_enabled_and_canceled (cancelhandling))
+ {
+ self->result = PTHREAD_CANCELED;
+ __do_cancel ();
+ }
+ }
}
libc_hidden_def (__libc_cleanup_pop_restore)
struct pthread *self = THREAD_SELF;
- int ch = atomic_load_relaxed (&self->cancelhandling);
- /* Cancelation not enabled, not cancelled, or already exitting. */
- if (self->cancelstate == PTHREAD_CANCEL_DISABLE
- || (ch & CANCELED_BITMASK) == 0
- || (ch & EXITING_BITMASK) != 0)
- return;
-
- /* Set the return value. */
- THREAD_SETMEM (self, result, PTHREAD_CANCELED);
- /* Make sure asynchronous cancellation is still enabled. */
- if (self->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
- __do_cancel ();
+ int oldval = atomic_load_relaxed (&self->cancelhandling);
+ while (1)
+ {
+ /* We are canceled now.  When canceled by another thread this flag
+ is already set, but if the signal is sent directly (internally or
+ from another process) it has to be done here.  */
+ int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
+
+ if (oldval == newval || (oldval & EXITING_BITMASK) != 0)
+ /* Already canceled or exiting. */
+ break;
+
+ if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &oldval, newval))
+ {
+ self->result = PTHREAD_CANCELED;
+
+ /* Make sure asynchronous cancellation is still enabled. */
+ if ((oldval & CANCELTYPE_BITMASK) != 0)
+ /* Run the registered destructors and terminate the thread. */
+ __do_cancel ();
+ }
+ }
}
int
}
#endif
- int oldch = atomic_fetch_or_acquire (&pd->cancelhandling, CANCELED_BITMASK);
- if ((oldch & CANCELED_BITMASK) != 0)
- return 0;
-
- if (pd == THREAD_SELF)
+ /* Some syscalls are never restarted after being interrupted by a signal
+ handler, regardless of the use of SA_RESTART (they always fail with
+ EINTR).  So pthread_cancel cannot send SIGCANCEL unless the cancellation
+ is enabled and set as asynchronous (in that case the cancellation will
+ be acted upon by the cancellation handler instead of the syscall
+ wrapper).  Otherwise the target thread is marked as 'cancelling'
+ (CANCELING_BITMASK) by atomically updating 'cancelhandling', and the
+ cancellation will be acted upon at the next cancellation entry point
+ in the target thread.
+
+ This also requires atomically checking whether cancellation is enabled
+ and asynchronous, so both the cancellation state and type are tracked
+ in 'cancelhandling'.  */
+
+ int result = 0;
+ int oldval = atomic_load_relaxed (&pd->cancelhandling);
+ int newval;
+ do
{
- /* A single-threaded process should be able to kill itself, since there
- is nothing in the POSIX specification that says that it cannot. So
- we set multiple_threads to true so that cancellation points get
- executed. */
- THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
+ again:
+ newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
+ if (oldval == newval)
+ break;
+
+ /* If the cancellation is handled asynchronously just send a
+ signal. We avoid this if possible since it's more
+ expensive. */
+ if (cancel_enabled_and_canceled_and_async (newval))
+ {
+ /* Mark the cancellation as "in progress". */
+ int newval2 = oldval | CANCELING_BITMASK;
+ if (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling,
+ &oldval, newval2))
+ goto again;
+
+ if (pd == THREAD_SELF)
+ /* This is not merely an optimization: An application may
+ call pthread_cancel (pthread_self ()) without calling
+ pthread_create, so the signal handler may not have been
+ set up for a self-cancel. */
+ {
+ pd->result = PTHREAD_CANCELED;
+ if ((newval & CANCELTYPE_BITMASK) != 0)
+ __do_cancel ();
+ }
+ else
+ /* The cancellation handler will take care of marking the
+ thread as canceled. */
+ result = __pthread_kill_internal (th, SIGCANCEL);
+
+ break;
+ }
+
+ /* A single-threaded process should be able to kill itself, since
+ there is nothing in the POSIX specification that says that it
+ cannot. So we set multiple_threads to true so that cancellation
+ points get executed. */
+ THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
__libc_multiple_threads = 1;
#endif
-
- THREAD_SETMEM (pd, result, PTHREAD_CANCELED);
- if (pd->cancelstate == PTHREAD_CANCEL_ENABLE
- && pd->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS)
- __do_cancel ();
- return 0;
}
+ while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
+ newval));
- return __pthread_kill_internal (th, SIGCANCEL);
+ return result;
}
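Editorial aside, not part of the patch: the hunks above repeatedly use the same lock-free update pattern (load the flag word, compute the new value, retry a weak compare-and-exchange until it takes).  A minimal standalone sketch of that pattern using C11 atomics, with mask values copied from this patch, looks like this; it illustrates the idiom only and does not use glibc's internal atomic wrappers.

#include <stdatomic.h>
#include <stdio.h>

#define CANCELING_BITMASK (1 << 2)
#define CANCELED_BITMASK  (1 << 3)

/* Set the CANCELING and CANCELED bits atomically and return the
   previous flag word, in the spirit of the loop in __pthread_cancel.  */
static int
mark_canceled (_Atomic int *cancelhandling)
{
  int oldval = atomic_load_explicit (cancelhandling, memory_order_relaxed);
  while (1)
    {
      int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK;
      if (newval == oldval)
        break;                  /* Nothing to do; bits already set.  */
      if (atomic_compare_exchange_weak_explicit (cancelhandling, &oldval,
                                                 newval,
                                                 memory_order_acquire,
                                                 memory_order_relaxed))
        break;                  /* On failure, oldval was reloaded; retry.  */
    }
  return oldval;
}

int
main (void)
{
  _Atomic int flags = 0;
  int old = mark_canceled (&flags);
  printf ("old=%d new=%d\n", old,
          atomic_load_explicit (&flags, memory_order_relaxed));
  return 0;
}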
versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34);
if ((pd == self
|| (self->joinid == pd
&& (pd->cancelhandling
- & (CANCELED_BITMASK | EXITING_BITMASK
+ & (CANCELING_BITMASK | CANCELED_BITMASK | EXITING_BITMASK
| TERMINATED_BITMASK)) == 0))
- && !(self->cancelstate == PTHREAD_CANCEL_ENABLE
- && (pd->cancelhandling & (CANCELED_BITMASK | EXITING_BITMASK
- | TERMINATED_BITMASK))
- == CANCELED_BITMASK))
+ && !cancel_enabled_and_canceled (self->cancelhandling))
/* This is a deadlock situation. The threads are waiting for each
other to finish. Note that this is a "may" error. To be 100%
sure we catch this error we would have to lock the data
self = THREAD_SELF;
- if (oldstate != NULL)
- *oldstate = self->cancelstate;
- self->cancelstate = state;
+ int oldval = atomic_load_relaxed (&self->cancelhandling);
+ while (1)
+ {
+ int newval = (state == PTHREAD_CANCEL_DISABLE
+ ? oldval | CANCELSTATE_BITMASK
+ : oldval & ~CANCELSTATE_BITMASK);
+
+ if (oldstate != NULL)
+ *oldstate = ((oldval & CANCELSTATE_BITMASK)
+ ? PTHREAD_CANCEL_DISABLE : PTHREAD_CANCEL_ENABLE);
+
+ if (oldval == newval)
+ break;
+
+ if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &oldval, newval))
+ {
+ if (cancel_enabled_and_canceled_and_async (newval))
+ __do_cancel ();
+
+ break;
+ }
+ }
return 0;
}
volatile struct pthread *self = THREAD_SELF;
- if (oldtype != NULL)
- *oldtype = self->canceltype;
- self->canceltype = type;
- if (type == PTHREAD_CANCEL_ASYNCHRONOUS)
- __pthread_testcancel ();
+ int oldval = atomic_load_relaxed (&self->cancelhandling);
+ while (1)
+ {
+ int newval = (type == PTHREAD_CANCEL_ASYNCHRONOUS
+ ? oldval | CANCELTYPE_BITMASK
+ : oldval & ~CANCELTYPE_BITMASK);
+
+ if (oldtype != NULL)
+ *oldtype = ((oldval & CANCELTYPE_BITMASK)
+ ? PTHREAD_CANCEL_ASYNCHRONOUS : PTHREAD_CANCEL_DEFERRED);
+
+ if (oldval == newval)
+ break;
+
+ if (atomic_compare_exchange_weak_acquire (&self->cancelhandling,
+ &oldval, newval))
+ {
+ if (cancel_enabled_and_canceled_and_async (newval))
+ {
+ THREAD_SETMEM (self, result, PTHREAD_CANCELED);
+ __do_cancel ();
+ }
+
+ break;
+ }
+ }
return 0;
}
___pthread_testcancel (void)
{
struct pthread *self = THREAD_SELF;
- int cancelhandling = THREAD_GETMEM (self, cancelhandling);
- if (self->cancelstate == PTHREAD_CANCEL_ENABLE
- && (cancelhandling & CANCELED_BITMASK)
- && !(cancelhandling & EXITING_BITMASK)
- && !(cancelhandling & TERMINATED_BITMASK))
+ int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+ if (cancel_enabled_and_canceled (cancelhandling))
{
- THREAD_SETMEM (self, result, PTHREAD_CANCELED);
+ self->result = PTHREAD_CANCELED;
__do_cancel ();
}
}
#include <jmpbuf-unwind.h>
#include <shlib-compat.h>
-#ifdef _STACK_GROWS_DOWN
+#if _STACK_GROWS_DOWN
# define FRAME_LEFT(frame, other, adj) \
((uintptr_t) frame - adj >= (uintptr_t) other - adj)
#elif _STACK_GROWS_UP
sizeof (buf))) != -1)
;
- __bump_nl_timestamp ();
+ dbs[hstdb].head->extra_data[NSCD_HST_IDX_CONF_TIMESTAMP]
+ = __bump_nl_timestamp ();
}
# endif
else
tst-nss-test1 \
tst-nss-test2 \
tst-nss-test4 \
- tst-nss-test5
+ tst-nss-test5 \
+ tst-nss-test_errno
xtests = bug-erange
tests-container = \
ifeq ($(build-static-nss),yes)
tests-static += tst-nss-static
endif
-extra-test-objs += nss_test1.os nss_test2.os
+extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os
include ../Rules
libof-nss_test1 = extramodules
libof-nss_test2 = extramodules
+libof-nss_test_errno = extramodules
$(objpfx)/libnss_test1.so: $(objpfx)nss_test1.os $(link-libc-deps)
$(build-module)
$(objpfx)/libnss_test2.so: $(objpfx)nss_test2.os $(link-libc-deps)
$(build-module)
+$(objpfx)/libnss_test_errno.so: $(objpfx)nss_test_errno.os $(link-libc-deps)
+ $(build-module)
$(objpfx)nss_test2.os : nss_test1.c
-ifdef libnss_test1.so-version
-$(objpfx)/libnss_test1.so$(libnss_test1.so-version): $(objpfx)/libnss_test1.so
+# Use the nss_files suffix for these objects as well.
+$(objpfx)/libnss_test1.so$(libnss_files.so-version): $(objpfx)/libnss_test1.so
$(make-link)
-endif
-ifdef libnss_test2.so-version
-$(objpfx)/libnss_test2.so$(libnss_test2.so-version): $(objpfx)/libnss_test2.so
+$(objpfx)/libnss_test2.so$(libnss_files.so-version): $(objpfx)/libnss_test2.so
+ $(make-link)
+$(objpfx)/libnss_test_errno.so$(libnss_files.so-version): \
+ $(objpfx)/libnss_test_errno.so
$(make-link)
-endif
$(patsubst %,$(objpfx)%.out,$(tests) $(tests-container)) : \
- $(objpfx)/libnss_test1.so$(libnss_test1.so-version) \
- $(objpfx)/libnss_test2.so$(libnss_test2.so-version)
+ $(objpfx)/libnss_test1.so$(libnss_files.so-version) \
+ $(objpfx)/libnss_test2.so$(libnss_files.so-version) \
+ $(objpfx)/libnss_test_errno.so$(libnss_files.so-version)
ifeq (yes,$(have-thread-library))
$(objpfx)tst-cancel-getpwuid_r: $(shared-thread-library)
LDFLAGS-tst-nss-test3 = -Wl,--disable-new-dtags
LDFLAGS-tst-nss-test4 = -Wl,--disable-new-dtags
LDFLAGS-tst-nss-test5 = -Wl,--disable-new-dtags
+LDFLAGS-tst-nss-test_errno = -Wl,--disable-new-dtags
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include <assert.h>
#include "nsswitch.h"
/*******************************************************************\
*ni = DATABASE_NAME_SYMBOL;
+ /* We want to know about it if we've somehow got a NULL action list;
+ in the past, we had bad state if seccomp interfered with setup. */
+ assert(*ni != NULL);
+
return __nss_lookup (ni, fct_name, fct2_name, fctp);
}
libc_hidden_def (DB_LOOKUP_FCT)
return true;
}
- /* Before we reload, verify that "/" hasn't changed. We assume that
- errors here are very unlikely, but the chance that we're entering
- a container is also very unlikely, so we err on the side of both
- very unlikely things not happening at the same time. */
- if (__stat64_time64 ("/", &str) != 0
- || (local->root_ino != 0
- && (str.st_ino != local->root_ino
- || str.st_dev != local->root_dev)))
+ int stat_rv = __stat64_time64 ("/", &str);
+
+ if (local->data.services[database_index] != NULL)
{
- /* Change detected; disable reloading and return current state. */
- atomic_store_release (&local->data.reload_disabled, 1);
- *result = local->data.services[database_index];
- __libc_lock_unlock (local->lock);
- return true;
+ /* Before we reload, verify that "/" hasn't changed. We assume that
+ errors here are very unlikely, but the chance that we're entering
+ a container is also very unlikely, so we err on the side of both
+ very unlikely things not happening at the same time. */
+ if (stat_rv != 0
+ || (local->root_ino != 0
+ && (str.st_ino != local->root_ino
+ || str.st_dev != local->root_dev)))
+ {
+ /* Change detected; disable reloading and return current state. */
+ atomic_store_release (&local->data.reload_disabled, 1);
+ *result = local->data.services[database_index];
+ __libc_lock_unlock (local->lock);
+ return true;
+ }
+ }
+ if (stat_rv == 0)
+ {
+ local->root_ino = str.st_ino;
+ local->root_dev = str.st_dev;
}
- local->root_ino = str.st_ino;
- local->root_dev = str.st_dev;
+
__libc_lock_unlock (local->lock);
/* Avoid overwriting the global configuration until we have loaded
void *
__nss_module_get_function (struct nss_module *module, const char *name)
{
+ /* A successful dlopen might clobber errno. */
+ int saved_errno = errno;
+
if (!__nss_module_load (module))
- return NULL;
+ {
+ /* Reporting module load failure is currently inaccurate. See
+ bug 22041. Not changing errno is the conservative choice. */
+ __set_errno (saved_errno);
+ return NULL;
+ }
+
+ __set_errno (saved_errno);
function_name *name_entry = bsearch (name, nss_function_name_array,
array_length (nss_function_name_array),
--- /dev/null
+/* NSS service provider with errno clobber.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <nss.h>
+#include <stdlib.h>
+
+/* Catch misnamed functions.  */
+#pragma GCC diagnostic error "-Wmissing-prototypes"
+NSS_DECLARE_MODULE_FUNCTIONS (test_errno)
+
+static void __attribute__ ((constructor))
+init (void)
+{
+ /* An arbitrary error code which is otherwise not used. */
+ errno = -1009;
+}
+
+/* Lookup functions for pwd follow that do not return any data. */
+
+/* Catch misnamed function definitions. */
+
+enum nss_status
+_nss_test_errno_setpwent (int stayopen)
+{
+ setenv ("_nss_test_errno_setpwent", "yes", 1);
+ return NSS_STATUS_SUCCESS;
+}
+
+enum nss_status
+_nss_test_errno_getpwent_r (struct passwd *result,
+ char *buffer, size_t size, int *errnop)
+{
+ setenv ("_nss_test_errno_getpwent_r", "yes", 1);
+ return NSS_STATUS_NOTFOUND;
+}
+
+enum nss_status
+_nss_test_errno_endpwent (void)
+{
+ setenv ("_nss_test_errno_endpwent", "yes", 1);
+ return NSS_STATUS_SUCCESS;
+}
--- /dev/null
+/* getpwent failure when dlopen clobbers errno (bug 28953).
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <nss.h>
+#include <support/check.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <pwd.h>
+#include <string.h>
+
+static int
+do_test (void)
+{
+ __nss_configure_lookup ("passwd", "files test_errno");
+
+ errno = 0;
+ setpwent ();
+ TEST_COMPARE (errno, 0);
+
+ bool root_seen = false;
+ while (true)
+ {
+ errno = 0;
+ struct passwd *e = getpwent ();
+ if (e == NULL)
+ break;
+ if (strcmp (e->pw_name, "root") == 0)
+ root_seen = true;
+ }
+
+ TEST_COMPARE (errno, 0);
+ TEST_VERIFY (root_seen);
+
+ errno = 0;
+ endpwent ();
+ TEST_COMPARE (errno, 0);
+
+ TEST_COMPARE_STRING (getenv ("_nss_test_errno_setpwent"), "yes");
+ TEST_COMPARE_STRING (getenv ("_nss_test_errno_getpwent_r"), "yes");
+ TEST_COMPARE_STRING (getenv ("_nss_test_errno_endpwent"), "yes");
+
+ return 0;
+}
+
+#include <support/test-driver.c>
best effort to make it async-signal-safe at least for the single-thread
case.  */
bool multiple_threads = __libc_single_threaded == 0;
+ uint64_t lastrun;
- __run_fork_handlers (atfork_run_prepare, multiple_threads);
+ lastrun = __run_prefork_handlers (multiple_threads);
struct nss_database_data nss_database_data;
reclaim_stacks ();
/* Run the handlers registered for the child. */
- __run_fork_handlers (atfork_run_child, multiple_threads);
+ __run_postfork_handlers (atfork_run_child, multiple_threads, lastrun);
}
else
{
}
/* Run the handlers registered for the parent. */
- __run_fork_handlers (atfork_run_parent, multiple_threads);
+ __run_postfork_handlers (atfork_run_parent, multiple_threads, lastrun);
if (pid < 0)
__set_errno (save_errno);
optimizes away the pattern == NULL test below. */
# define _GL_ARG_NONNULL(params)
-# include <config.h>
+# include <libc-config.h>
#endif
#include <glob.h>
#include <errno.h>
+#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdbool.h>
# define sysconf(id) __sysconf (id)
# define closedir(dir) __closedir (dir)
# define opendir(name) __opendir (name)
+# undef dirfd
+# define dirfd(str) __dirfd (str)
# define readdir(str) __readdir64 (str)
# define getpwnam_r(name, bufp, buf, len, res) \
__getpwnam_r (name, bufp, buf, len, res)
# ifndef GLOB_LSTAT
# define GLOB_LSTAT gl_lstat
# endif
-# ifndef GLOB_STAT64
-# define GLOB_STAT64 __stat64
-# endif
-# ifndef GLOB_LSTAT64
-# define GLOB_LSTAT64 __lstat64
+# ifndef GLOB_FSTATAT64
+# define GLOB_FSTATAT64 __fstatat64
# endif
# include <shlib-compat.h>
#else /* !_LIBC */
# define struct_stat struct stat
# define struct_stat64 struct stat
# define GLOB_LSTAT gl_lstat
-# define GLOB_STAT64 stat
-# define GLOB_LSTAT64 lstat
+# define GLOB_FSTATAT64 fstatat
#endif /* _LIBC */
#include <fnmatch.h>
} ust;
return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
? pglob->GLOB_LSTAT (fullname, &ust.st)
- : GLOB_LSTAT64 (fullname, &ust.st64));
+ : GLOB_FSTATAT64 (AT_FDCWD, fullname, &ust.st64,
+ AT_SYMLINK_NOFOLLOW));
}
/* Set *R = A + B. Return true if the answer is mathematically
struct_stat64 st64;
return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC)
? pglob->gl_stat (filename, &st) == 0 && S_ISDIR (st.st_mode)
- : GLOB_STAT64 (filename, &st64) == 0 && S_ISDIR (st64.st_mode));
+ : (GLOB_FSTATAT64 (AT_FDCWD, filename, &st64, 0) == 0
+ && S_ISDIR (st64.st_mode)));
}
/* Find the end of the sub-pattern in a brace expression. */
else
{
#ifndef WINDOWS32
+ /* Recognize ~user as a shorthand for the specified user's home
+ directory. */
char *end_name = strchr (dirname, '/');
char *user_name;
int malloc_user_name = 0;
}
scratch_buffer_free (&pwtmpbuf);
}
-#endif /* !WINDOWS32 */
+#else /* WINDOWS32 */
+ /* On native Windows, access to a user's home directory
+ (via GetUserProfileDirectory) or to a user's environment
+ variables (via ExpandEnvironmentStringsForUser) requires
+ the credentials of the user. Therefore we cannot support
+ the ~user syntax on this platform.
+ Handling ~user specially (and treat it like plain ~) if
+ user is getenv ("USERNAME") would not be a good idea,
+ since it would make people think that ~user is supported
+ in general. */
+ if (flags & GLOB_TILDE_CHECK)
+ {
+ retval = GLOB_NOMATCH;
+ goto out;
+ }
+#endif /* WINDOWS32 */
}
}
{
size_t dirlen = strlen (directory);
void *stream = NULL;
+ struct scratch_buffer s;
+ scratch_buffer_init (&s);
# define GLOBNAMES_MEMBERS(nnames) \
struct globnames *next; size_t count; char *name[nnames];
struct globnames { GLOBNAMES_MEMBERS (FLEXIBLE_ARRAY_MEMBER) };
}
else
{
+ int dfd = dirfd (stream);
int fnm_flags = ((!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0)
| ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0));
flags |= GLOB_MAGCHAR;
if (flags & GLOB_ONLYDIR)
switch (readdir_result_type (d))
{
- case DT_DIR: case DT_LNK: case DT_UNKNOWN: break;
default: continue;
+ case DT_DIR: break;
+ case DT_LNK: case DT_UNKNOWN:
+ /* The filesystem was too lazy to give us a hint,
+ so we have to do it the hard way. */
+ if (__glibc_unlikely (dfd < 0 || flags & GLOB_ALTDIRFUNC))
+ {
+ size_t namelen = strlen (d.name);
+ size_t need = dirlen + 1 + namelen + 1;
+ if (s.length < need
+ && !scratch_buffer_set_array_size (&s, need, 1))
+ goto memory_error;
+ char *p = mempcpy (s.data, directory, dirlen);
+ *p = '/';
+ p += p[-1] != '/';
+ memcpy (p, d.name, namelen + 1);
+ if (! is_dir (s.data, flags, pglob))
+ continue;
+ }
+ else
+ {
+ struct_stat64 st64;
+ if (! (GLOB_FSTATAT64 (dfd, d.name, &st64, 0) == 0
+ && S_ISDIR (st64.st_mode)))
+ continue;
+ }
}
if (fnmatch (pattern, d.name, fnm_flags) == 0)
__set_errno (save);
}
+ scratch_buffer_free (&s);
return result;
}
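Editorial aside, not part of the patch: the DT_UNKNOWN/DT_LNK fallback above amounts to asking the kernel via fstatat, relative to the already-open directory, whether the entry is a directory.  A standalone sketch of that technique (the directory path in argv[1] is an assumption of the example):

#include <dirent.h>
#include <stdio.h>
#include <sys/stat.h>

int
main (int argc, char **argv)
{
  if (argc != 2)
    return 1;
  DIR *dir = opendir (argv[1]);
  if (dir == NULL)
    {
      perror ("opendir");
      return 1;
    }
  struct dirent *d;
  while ((d = readdir (dir)) != NULL)
    {
      int is_dir;
      if (d->d_type != DT_UNKNOWN && d->d_type != DT_LNK)
        /* The filesystem gave us a usable hint.  */
        is_dir = d->d_type == DT_DIR;
      else
        {
          /* No hint: stat the entry relative to the directory fd,
             following symbolic links, as glob does for GLOB_ONLYDIR.  */
          struct stat st;
          is_dir = fstatat (dirfd (dir), d->d_name, &st, 0) == 0
                   && S_ISDIR (st.st_mode);
        }
      printf ("%s%s\n", d->d_name, is_dir ? "/" : "");
    }
  closedir (dir);
  return 0;
}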
#include <libc-lock.h>
#include <stdbool.h>
#include <register-atfork.h>
+#include <intprops.h>
+#include <stdio.h>
#define DYNARRAY_ELEMENT struct fork_handler
#define DYNARRAY_STRUCT fork_handler_list
#include <malloc/dynarray-skeleton.c>
static struct fork_handler_list fork_handlers;
-static bool fork_handler_init = false;
+static uint64_t fork_handler_counter;
static int atfork_lock = LLL_LOCK_INITIALIZER;
{
lll_lock (atfork_lock, LLL_PRIVATE);
- if (!fork_handler_init)
- {
- fork_handler_list_init (&fork_handlers);
- fork_handler_init = true;
- }
+ if (fork_handler_counter == 0)
+ fork_handler_list_init (&fork_handlers);
struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers);
if (newp != NULL)
newp->parent_handler = parent;
newp->child_handler = child;
newp->dso_handle = dso_handle;
+
+ /* IDs assigned to handlers start at 1 and increment with handler
+ registration.  Un-registering a handler discards the corresponding
+ ID. It is not reused in future registrations. */
+ if (INT_ADD_OVERFLOW (fork_handler_counter, 1))
+ __libc_fatal ("fork handler counter overflow");
+ newp->id = ++fork_handler_counter;
}
/* Release the lock. */
lll_unlock (atfork_lock, LLL_PRIVATE);
}
-void
-__run_fork_handlers (enum __run_fork_handler_type who, _Bool do_locking)
+uint64_t
+__run_prefork_handlers (_Bool do_locking)
{
- struct fork_handler *runp;
+ uint64_t lastrun;
- if (who == atfork_run_prepare)
+ if (do_locking)
+ lll_lock (atfork_lock, LLL_PRIVATE);
+
+ /* We run prepare handlers from last to first. After fork, only
+ handlers up to the last handler found here (pre-fork) will be run.
+ Handlers registered during __run_prefork_handlers or
+ __run_postfork_handlers will be positioned after this last handler, and
+ since their prepare handlers won't be run now, their parent/child
+ handlers should also be ignored. */
+ lastrun = fork_handler_counter;
+
+ size_t sl = fork_handler_list_size (&fork_handlers);
+ for (size_t i = sl; i > 0;)
{
- if (do_locking)
- lll_lock (atfork_lock, LLL_PRIVATE);
- size_t sl = fork_handler_list_size (&fork_handlers);
- for (size_t i = sl; i > 0; i--)
- {
- runp = fork_handler_list_at (&fork_handlers, i - 1);
- if (runp->prepare_handler != NULL)
- runp->prepare_handler ();
- }
+ struct fork_handler *runp
+ = fork_handler_list_at (&fork_handlers, i - 1);
+
+ uint64_t id = runp->id;
+
+ if (runp->prepare_handler != NULL)
+ {
+ if (do_locking)
+ lll_unlock (atfork_lock, LLL_PRIVATE);
+
+ runp->prepare_handler ();
+
+ if (do_locking)
+ lll_lock (atfork_lock, LLL_PRIVATE);
+ }
+
+ /* We unlocked, ran the handler, and locked again. In the
+ meanwhile, one or more deregistrations could have occurred leading
+ to the current (just run) handler being moved up the list or even
+ removed from the list itself.  Since handler IDs are guaranteed
+ to be in increasing order, the next handler has to have: */
+
+ /* A. An earlier position than the current one has. */
+ i--;
+
+ /* B. A lower ID than the current one does. The code below skips
+ any newly added handlers with higher IDs. */
+ while (i > 0
+ && fork_handler_list_at (&fork_handlers, i - 1)->id >= id)
+ i--;
}
- else
+
+ return lastrun;
+}
+
+void
+__run_postfork_handlers (enum __run_fork_handler_type who, _Bool do_locking,
+ uint64_t lastrun)
+{
+ size_t sl = fork_handler_list_size (&fork_handlers);
+ for (size_t i = 0; i < sl;)
{
- size_t sl = fork_handler_list_size (&fork_handlers);
- for (size_t i = 0; i < sl; i++)
- {
- runp = fork_handler_list_at (&fork_handlers, i);
- if (who == atfork_run_child && runp->child_handler)
- runp->child_handler ();
- else if (who == atfork_run_parent && runp->parent_handler)
- runp->parent_handler ();
- }
+ struct fork_handler *runp = fork_handler_list_at (&fork_handlers, i);
+ uint64_t id = runp->id;
+
+ /* Prepare handlers were not run for handlers with ID > LASTRUN.
+ Thus, parent/child handlers will also not be run. */
+ if (id > lastrun)
+ break;
+
if (do_locking)
- lll_unlock (atfork_lock, LLL_PRIVATE);
+ lll_unlock (atfork_lock, LLL_PRIVATE);
+
+ if (who == atfork_run_child && runp->child_handler)
+ runp->child_handler ();
+ else if (who == atfork_run_parent && runp->parent_handler)
+ runp->parent_handler ();
+
+ if (do_locking)
+ lll_lock (atfork_lock, LLL_PRIVATE);
+
+ /* We unlocked, ran the handler, and locked again. In the meanwhile,
+ one or more [de]registrations could have occurred. Due to this,
+ the list size must be updated. */
+ sl = fork_handler_list_size (&fork_handlers);
+
+ /* The just-run handler could also have moved up the list. */
+
+ if (sl > i && fork_handler_list_at (&fork_handlers, i)->id == id)
+ /* The position of the recently run handler hasn't changed. The
+ next handler to be run is an easy increment away. */
+ i++;
+ else
+ {
+ /* The next handler to be run is the first handler in the list
+ to have an ID higher than the current one. */
+ for (i = 0; i < sl; i++)
+ {
+ if (fork_handler_list_at (&fork_handlers, i)->id > id)
+ break;
+ }
+ }
}
+
+ if (do_locking)
+ lll_unlock (atfork_lock, LLL_PRIVATE);
}
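Editorial aside, not part of the patch: the resumption logic above depends only on handler IDs being strictly increasing in list order, so after re-acquiring the lock the walker can locate where to continue by comparing IDs.  A simplified standalone model of that search (a plain array stands in for the dynarray):

#include <stdint.h>
#include <stdio.h>

/* Simplified model: an array of handler IDs stands in for the dynarray
   of fork handlers; IDs are strictly increasing along the array.  */
static size_t
next_position (const uint64_t *ids, size_t size, size_t i, uint64_t just_ran)
{
  if (i < size && ids[i] == just_ran)
    /* The handler we just ran still sits at the same position, so the
       next handler is simply the following slot.  */
    return i + 1;
  /* Otherwise scan for the first handler with a higher ID; this skips
     handlers registered (with larger IDs) while the lock was dropped.  */
  for (i = 0; i < size; i++)
    if (ids[i] > just_ran)
      break;
  return i;
}

int
main (void)
{
  /* We just ran the handler with ID 2 at index 1; the handler with
     ID 3 was deregistered while handler 2's callback ran unlocked.  */
  const uint64_t ids[] = { 1, 2, 4, 5 };
  printf ("%zu\n", next_position (ids, 4, 1, 2));  /* 2: the slot holding ID 4.  */
  return 0;
}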
#include <support/check.h>
#include <support/xunistd.h>
#include <sys/wait.h>
+#include <sys/ioctl.h>
#include <stdlib.h>
+#include <termios.h>
+
+#ifndef PATH_MAX
+# define PATH_MAX 1024
+#endif
+static char ptmxpath[PATH_MAX];
static int
handle_restart (const char *argv1, const char *argv2)
}
static int
-do_test (int argc, char *argv[])
+run_test (int argc, char *argv[])
{
/* We must have either:
- four parameters left if called initially:
+ --setgrpr optional
*/
- if (restart)
- return handle_restart (argv[1], argv[2]);
-
- int tcfd = open64 (_PATH_TTY, O_RDONLY, 0600);
- if (tcfd == -1)
- {
- if (errno == ENXIO)
- FAIL_UNSUPPORTED ("terminal not available, skipping test");
- FAIL_EXIT1 ("open64 (\"%s\", 0x%x, 0600): %m", _PATH_TTY, O_RDONLY);
- }
+ int tcfd = xopen (ptmxpath, O_RDONLY, 0600);
/* Check setting the controlling terminal without changing the group. */
{
return 0;
}
+static int
+do_test (int argc, char *argv[])
+{
+ if (restart)
+ return handle_restart (argv[1], argv[2]);
+
+ pid_t pid = xfork ();
+ if (pid == 0)
+ {
+ /* Create a pseudo-terminal to avoid interfering with the one used by
+ the test itself, create a new session (so there is no controlling
+ terminal), and set the pseudo-terminal as the controlling one.  */
+ int ptmx = posix_openpt (0);
+ if (ptmx == -1)
+ {
+ if (errno == ENXIO)
+ FAIL_UNSUPPORTED ("terminal not available, skipping test");
+ FAIL_EXIT1 ("posix_openpt (0): %m");
+ }
+ TEST_VERIFY_EXIT (grantpt (ptmx) == 0);
+ TEST_VERIFY_EXIT (unlockpt (ptmx) == 0);
+
+ TEST_VERIFY_EXIT (setsid () != -1);
+ TEST_VERIFY_EXIT (ioctl (ptmx, TIOCSCTTY, NULL) == 0);
+ while (dup2 (ptmx, STDIN_FILENO) == -1 && errno == EBUSY)
+ ;
+ while (dup2 (ptmx, STDOUT_FILENO) == -1 && errno == EBUSY)
+ ;
+ while (dup2 (ptmx, STDERR_FILENO) == -1 && errno == EBUSY)
+ ;
+ TEST_VERIFY_EXIT (ptsname_r (ptmx, ptmxpath, sizeof ptmxpath) == 0);
+ xclose (ptmx);
+
+ run_test (argc, argv);
+ _exit (0);
+ }
+ int status;
+ xwaitpid (pid, &status, 0);
+ TEST_VERIFY (WIFEXITED (status));
+ exit (0);
+}
+
#define TEST_FUNCTION_ARGV do_test
#include <support/test-driver.c>
inet_pton \
ns_makecanon \
ns_name_compress \
+ ns_name_length_uncompressed \
ns_name_ntop \
ns_name_pack \
ns_name_pton \
ns_name_skip \
ns_name_uncompress \
ns_name_unpack \
+ ns_rr_cursor_init \
+ ns_rr_cursor_next \
+ ns_samebinaryname \
ns_samename \
nsap_addr \
nss_dns_functions \
extra-libs := libresolv libnss_dns
ifeq ($(have-thread-library),yes)
routines += gai_sigqueue
-endif
-
-ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes)
-# Empty compatibility library for old binaries.
-extra-libs += libanl
tests += \
tst-bug18665 \
tst-ns_name_pton \
tst-res_hconf_reorder \
tst-res_hnok \
+ tst-resolv-aliases \
tst-resolv-basic \
tst-resolv-binary \
+ tst-resolv-byaddr \
tst-resolv-edns \
+ tst-resolv-invalid-cname \
tst-resolv-network \
tst-resolv-nondecimal \
tst-resolv-res_init-multi \
tests-internal += tst-resolv-txnid-collision
tests-static += tst-resolv-txnid-collision
+# Likewise for __ns_samebinaryname.
+tests-internal += tst-ns_samebinaryname
+tests-static += tst-ns_samebinaryname
+
+# Likewise for __ns_name_length_uncompressed.
+tests-internal += tst-ns_name_length_uncompressed
+tests-static += tst-ns_name_length_uncompressed
+
+# Likewise for struct ns_rr_cursor and its functions.
+tests-internal += tst-ns_rr_cursor
+tests-static += tst-ns_rr_cursor
+
# These tests need libdl.
ifeq (yes,$(build-shared))
tests += \
# This test has dropped packet tests and runs for a long time.
xtests += tst-resolv-rotate
-endif
+endif # $(have-thread-library)
+
extra-libs-others = $(extra-libs)
libresolv-routines := \
base64 \
resolv-deprecated \
# libresolv-routines
+ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes)
+# Empty compatibility library for old binaries.
+extra-libs += libanl
+libanl-routines += libanl-compat
+libanl-shared-only-routines += libanl-compat
+endif
+
$(libanl-routines-var) += \
gai_cancel \
gai_error \
getaddrinfo_a \
# $(libanl-routines-var)
-libanl-routines += libanl-compat
-libanl-shared-only-routines += libanl-compat
-
# Pretend that libanl.so is a linker script, so that the symbolic link
# is not installed.
install-lib-ldscripts = libanl.so
$(objpfx)tst-resolv-ai_idn-latin1.out: $(gen-locales)
$(objpfx)tst-resolv-ai_idn-nolibidn2.out: \
$(gen-locales) $(objpfx)tst-no-libidn2.so
+$(objpfx)tst-resolv-aliases: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-basic: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-binary: $(objpfx)libresolv.so $(shared-thread-library)
+$(objpfx)tst-resolv-byaddr: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-edns: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-network: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-res_init: $(objpfx)libresolv.so
$(shared-thread-library)
$(objpfx)tst-resolv-res_init-thread: $(objpfx)libresolv.so \
$(shared-thread-library)
+$(objpfx)tst-resolv-invalid-cname: $(objpfx)libresolv.so \
+ $(shared-thread-library)
$(objpfx)tst-resolv-nondecimal: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-qtypes: $(objpfx)libresolv.so $(shared-thread-library)
$(objpfx)tst-resolv-rotate: $(objpfx)libresolv.so $(shared-thread-library)
res_hconf.c and res_hconf.h were contributed by David Mosberger, and
do not come from BIND.
-
-The files gethnamaddr.c, mapv4v6addr.h and mapv4v6hostent.h are
-leftovers from BIND 4.9.7.
+++ /dev/null
-/*
- * ++Copyright++ 1985, 1988, 1993
- * -
- * Copyright (c) 1985, 1988, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * -
- * Portions Copyright (c) 1993 by Digital Equipment Corporation.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies, and that
- * the name of Digital Equipment Corporation not be used in advertising or
- * publicity pertaining to distribution of the document or software without
- * specific, written prior permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL
- * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL DIGITAL EQUIPMENT
- * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
- * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
- * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- * -
- * --Copyright--
- */
-
-#include <string.h>
-#include <arpa/nameser.h>
-
-static void
-map_v4v6_address (const char *src, char *dst)
-{
- u_char *p = (u_char *) dst;
- int i;
-
- /* Move the IPv4 part to the right position. */
- memcpy (dst + 12, src, INADDRSZ);
-
- /* Mark this ipv6 addr as a mapped ipv4. */
- for (i = 0; i < 10; i++)
- *p++ = 0x00;
- *p++ = 0xff;
- *p = 0xff;
-}
+++ /dev/null
-/*
- * ++Copyright++ 1985, 1988, 1993
- * -
- * Copyright (c) 1985, 1988, 1993
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * -
- * Portions Copyright (c) 1993 by Digital Equipment Corporation.
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies, and that
- * the name of Digital Equipment Corporation not be used in advertising or
- * publicity pertaining to distribution of the document or software without
- * specific, written prior permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL
- * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL DIGITAL EQUIPMENT
- * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
- * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
- * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- * SOFTWARE.
- * -
- * --Copyright--
- */
-
-#include <arpa/nameser.h>
-#include <sys/socket.h>
-
-typedef union {
- int32_t al;
- char ac;
-} align;
-
-static int
-map_v4v6_hostent (struct hostent *hp, char **bpp, int *lenp)
-{
- char **ap;
-
- if (hp->h_addrtype != AF_INET || hp->h_length != INADDRSZ)
- return 0;
- hp->h_addrtype = AF_INET6;
- hp->h_length = IN6ADDRSZ;
- for (ap = hp->h_addr_list; *ap; ap++)
- {
- int i = sizeof (align) - ((u_long) *bpp % sizeof (align));
-
- if (*lenp < (i + IN6ADDRSZ))
- /* Out of memory. */
- return 1;
- *bpp += i;
- *lenp -= i;
- map_v4v6_address (*ap, *bpp);
- *ap = *bpp;
- *bpp += IN6ADDRSZ;
- *lenp -= IN6ADDRSZ;
- }
- return 0;
-}
--- /dev/null
+/* Skip over an uncompressed name in wire format.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+
+int
+__ns_name_length_uncompressed (const unsigned char *p,
+ const unsigned char *eom)
+{
+ const unsigned char *start = p;
+
+ while (true)
+ {
+ if (p == eom)
+ {
+ /* Truncated packet: no room for label length. */
+ __set_errno (EMSGSIZE);
+ return -1;
+ }
+
+ unsigned char b = *p;
+ ++p;
+ if (b == 0)
+ {
+ /* Root label. */
+ size_t length = p - start;
+ if (length > NS_MAXCDNAME)
+ {
+ /* Domain name too long. */
+ __set_errno (EMSGSIZE);
+ return -1;
+ }
+ return length;
+ }
+
+ if (b <= 63)
+ {
+ /* Regular label. */
+ if (b <= eom - p)
+ p += b;
+ else
+ {
+ /* Truncated packet: label incomplete. */
+ __set_errno (EMSGSIZE);
+ return -1;
+ }
+ }
+ else
+ {
+ /* Compression reference or corrupted label length. */
+ __set_errno (EMSGSIZE);
+ return -1;
+ }
+ }
+}
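Editorial aside, not part of the patch: the function above walks the uncompressed wire encoding of a domain name, i.e. length-prefixed labels terminated by a zero byte, and rejects compression pointers (label lengths above 63).  A simplified standalone rendition, omitting the NS_MAXCDNAME cap, for experimenting with the format:

#include <stdio.h>
#include <stddef.h>

static int
name_length_uncompressed (const unsigned char *p, const unsigned char *eom)
{
  const unsigned char *start = p;
  while (p < eom)
    {
      unsigned char b = *p++;
      if (b == 0)
        return (int) (p - start);   /* Root label reached.  */
      if (b > 63 || b > eom - p)
        return -1;                  /* Compression pointer or truncation.  */
      p += b;                       /* Skip over the label bytes.  */
    }
  return -1;                        /* Ran off the end without a root label.  */
}

int
main (void)
{
  /* "www.example.com" in wire format; the string literal supplies the
     trailing zero byte for the root label.  */
  static const unsigned char name[] = "\3www\7example\3com";
  printf ("%d\n", name_length_uncompressed (name, name + sizeof name));
  /* Prints 17: 4 + 8 + 4 label bytes plus the root label.  */
  return 0;
}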
--- /dev/null
+/* Initialize a simple DNS packet parser.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+bool
+__ns_rr_cursor_init (struct ns_rr_cursor *c,
+ const unsigned char *buf, size_t len)
+{
+ c->begin = buf;
+ c->end = buf + len;
+
+ /* Check for header size and 16-bit question count value (it must be 1). */
+ if (len < 12 || buf[4] != 0 || buf[5] != 1)
+ {
+ __set_errno (EMSGSIZE);
+ c->current = c->end;
+ return false;
+ }
+ c->current = buf + 12;
+
+ int consumed = __ns_name_length_uncompressed (c->current, c->end);
+ if (consumed < 0)
+ {
+ __set_errno (EMSGSIZE);
+ c->current = c->end;
+ c->first_rr = NULL;
+ return false;
+ }
+ c->current += consumed;
+
+ /* Ensure there is room for question type and class. */
+ if (c->end - c->current < 4)
+ {
+ __set_errno (EMSGSIZE);
+ c->current = c->end;
+ c->first_rr = NULL;
+ return false;
+ }
+ c->current += 4;
+ c->first_rr = c->current;
+
+ return true;
+}
--- /dev/null
+/* Simple DNS record parser without textual name decoding.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+bool
+__ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr)
+{
+ rr->rdata = NULL;
+
+ /* Extract the record owner name. */
+ int consumed = __ns_name_unpack (c->begin, c->end, c->current,
+ rr->rname, sizeof (rr->rname));
+ if (consumed < 0)
+ {
+ memset (rr, 0, sizeof (*rr));
+ __set_errno (EMSGSIZE);
+ return false;
+ }
+ c->current += consumed;
+
+ /* Extract the metadata. */
+ struct
+ {
+ uint16_t rtype;
+ uint16_t rclass;
+ uint32_t ttl;
+ uint16_t rdlength;
+ } __attribute__ ((packed)) metadata;
+ _Static_assert (sizeof (metadata) == 10, "sizeof metadata");
+ if (c->end - c->current < sizeof (metadata))
+ {
+ memset (rr, 0, sizeof (*rr));
+ __set_errno (EMSGSIZE);
+ return false;
+ }
+ memcpy (&metadata, c->current, sizeof (metadata));
+ c->current += sizeof (metadata);
+ /* Endianness conversion.  */
+ rr->rtype = ntohs (metadata.rtype);
+ rr->rclass = ntohs (metadata.rclass);
+ rr->ttl = ntohl (metadata.ttl);
+ rr->rdlength = ntohs (metadata.rdlength);
+
+ /* Extract record data. */
+ if (c->end - c->current < rr->rdlength)
+ {
+ memset (rr, 0, sizeof (*rr));
+ __set_errno (EMSGSIZE);
+ return false;
+ }
+ rr->rdata = c->current;
+ c->current += rr->rdlength;
+
+ return true;
+}
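Editorial aside, not part of the patch: after the owner name, a resource record carries a fixed 10-byte header (TYPE, CLASS, TTL, RDLENGTH) in network byte order, which the cursor above copies out with memcpy and byte-swaps.  A standalone decoder of sample bytes describing an A record:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  static const unsigned char wire[10] =
    { 0x00, 0x01,                 /* TYPE = A */
      0x00, 0x01,                 /* CLASS = IN */
      0x00, 0x00, 0x01, 0x2c,     /* TTL = 300 */
      0x00, 0x04 };               /* RDLENGTH = 4 */
  struct
  {
    uint16_t rtype;
    uint16_t rclass;
    uint32_t ttl;
    uint16_t rdlength;
  } __attribute__ ((packed)) metadata;
  memcpy (&metadata, wire, sizeof metadata);
  printf ("type=%u class=%u ttl=%u rdlength=%u\n",
          (unsigned int) ntohs (metadata.rtype),
          (unsigned int) ntohs (metadata.rclass),
          (unsigned int) ntohl (metadata.ttl),
          (unsigned int) ntohs (metadata.rdlength));
  return 0;
}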
--- /dev/null
+/* Compare two binary domain names for equality.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <stdbool.h>
+
+/* Convert ASCII letters to upper case. */
+static inline int
+ascii_toupper (unsigned char ch)
+{
+ if (ch >= 'a' && ch <= 'z')
+ return ch - 'a' + 'A';
+ else
+ return ch;
+}
+
+bool
+__ns_samebinaryname (const unsigned char *a, const unsigned char *b)
+{
+ while (*a != 0 && *b != 0)
+ {
+ if (*a != *b)
+ /* Different label length. */
+ return false;
+ int labellen = *a;
+ ++a;
+ ++b;
+ for (int i = 0; i < labellen; ++i)
+ {
+ if (*a != *b && ascii_toupper (*a) != ascii_toupper (*b))
+ /* Different character in label. */
+ return false;
+ ++a;
+ ++b;
+ }
+ }
+
+ /* Match if both names are at the root label. */
+ return *a == 0 && *b == 0;
+}
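Editorial aside, not part of the patch: the comparison above works label by label on wire-format names, treating ASCII letters case-insensitively and never allocating.  A standalone demonstration using a simplified copy of that logic:

#include <stdbool.h>
#include <stdio.h>

static int
ascii_toupper (unsigned char ch)
{
  return (ch >= 'a' && ch <= 'z') ? ch - 'a' + 'A' : ch;
}

/* Simplified copy of the label-by-label comparison above.  */
static bool
samebinaryname (const unsigned char *a, const unsigned char *b)
{
  while (*a != 0 && *b != 0)
    {
      if (*a != *b)
        return false;                  /* Different label length.  */
      int labellen = *a;
      ++a, ++b;
      for (int i = 0; i < labellen; ++i, ++a, ++b)
        if (ascii_toupper (*a) != ascii_toupper (*b))
          return false;                /* Different character in label.  */
    }
  return *a == 0 && *b == 0;           /* Match only at the root label.  */
}

int
main (void)
{
  /* "www.Example.COM" and "www.example.com" in wire format.  */
  static const unsigned char n1[] = "\3www\7Example\3COM";
  static const unsigned char n2[] = "\3www\7example\3com";
  printf ("%d\n", samebinaryname (n1, n2));   /* Prints 1.  */
  return 0;
}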
* --Copyright--
*/
+#include <alloc_buffer.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <resolv/resolv-internal.h>
#include <resolv/resolv_context.h>
-/* Get implementations of some internal functions. */
-#include <resolv/mapv4v6addr.h>
-#include <resolv/mapv4v6hostent.h>
-
#define RESOLVSORT
#if PACKETSZ > 65536
#endif
#define MAXHOSTNAMELEN 256
-/* We need this time later. */
-typedef union querybuf
-{
- HEADER hdr;
- u_char buf[MAXPACKET];
-} querybuf;
-
-static enum nss_status getanswer_r (struct resolv_context *ctx,
- const querybuf *answer, int anslen,
- const char *qname, int qtype,
- struct hostent *result, char *buffer,
- size_t buflen, int *errnop, int *h_errnop,
- int map, int32_t *ttlp, char **canonp);
-
-static enum nss_status gaih_getanswer (const querybuf *answer1, int anslen1,
- const querybuf *answer2, int anslen2,
- const char *qname,
+/* For historic reasons, pointers to IP addresses are char *, so use a
+ single list type for addresses and host names. */
+#define DYNARRAY_STRUCT ptrlist
+#define DYNARRAY_ELEMENT char *
+#define DYNARRAY_PREFIX ptrlist_
+#include <malloc/dynarray-skeleton.c>
+
+static enum nss_status getanswer_r (unsigned char *packet, size_t packetlen,
+ uint16_t qtype, struct alloc_buffer *abuf,
+ struct ptrlist *addresses,
+ struct ptrlist *aliases,
+ int *errnop, int *h_errnop, int32_t *ttlp);
+static void addrsort (struct resolv_context *ctx, char **ap, int num);
+static enum nss_status getanswer_ptr (unsigned char *packet, size_t packetlen,
+ struct alloc_buffer *abuf,
+ char **hnamep, int *errnop,
+ int *h_errnop, int32_t *ttlp);
+
+static enum nss_status gaih_getanswer (unsigned char *packet1,
+ size_t packet1len,
+ unsigned char *packet2,
+ size_t packet2len,
+ struct alloc_buffer *abuf,
struct gaih_addrtuple **pat,
- char *buffer, size_t buflen,
int *errnop, int *h_errnop,
int32_t *ttlp);
char *buffer, size_t buflen, int *errnop,
int *h_errnop, int32_t *ttlp, char **canonp)
{
- union
- {
- querybuf *buf;
- u_char *ptr;
- } host_buffer;
- querybuf *orig_host_buffer;
char tmp[NS_MAXDNAME];
int size, type, n;
const char *cp;
- int map = 0;
int olderr = errno;
enum nss_status status;
&& (cp = __res_context_hostalias (ctx, name, tmp, sizeof (tmp))) != NULL)
name = cp;
- host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024);
+ unsigned char dns_packet_buffer[1024];
+ unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
- n = __res_context_search (ctx, name, C_IN, type, host_buffer.buf->buf,
- 1024, &host_buffer.ptr, NULL, NULL, NULL, NULL);
+ n = __res_context_search (ctx, name, C_IN, type,
+ dns_packet_buffer, sizeof (dns_packet_buffer),
+ &alt_dns_packet_buffer, NULL, NULL, NULL, NULL);
if (n < 0)
{
switch (errno)
*errnop = EAGAIN;
else
__set_errno (olderr);
+ }
+ else
+ {
+ struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
- /* If we are looking for an IPv6 address and mapping is enabled
- by having the RES_USE_INET6 bit in _res.options set, we try
- another lookup. */
- if (af == AF_INET6 && res_use_inet6 ())
- n = __res_context_search (ctx, name, C_IN, T_A, host_buffer.buf->buf,
- host_buffer.buf != orig_host_buffer
- ? MAXPACKET : 1024, &host_buffer.ptr,
- NULL, NULL, NULL, NULL);
+ struct ptrlist addresses;
+ ptrlist_init (&addresses);
+ struct ptrlist aliases;
+ ptrlist_init (&aliases);
- if (n < 0)
+ status = getanswer_r (alt_dns_packet_buffer, n, type,
+ &abuf, &addresses, &aliases,
+ errnop, h_errnop, ttlp);
+ if (status == NSS_STATUS_SUCCESS)
{
- if (host_buffer.buf != orig_host_buffer)
- free (host_buffer.buf);
- return status;
- }
+ if (ptrlist_has_failed (&addresses)
+ || ptrlist_has_failed (&aliases))
+ {
+ /* malloc failure. Do not retry using the ERANGE protocol. */
+ *errnop = ENOMEM;
+ *h_errnop = NETDB_INTERNAL;
+ status = NSS_STATUS_UNAVAIL;
+ }
- map = 1;
+ /* Reserve the address and alias arrays in the result
+ buffer.  Both are NULL-terminated, but one element
+ of the alias list is stored in h_name, so no extra space
+ for the NULL terminator is needed there. */
+ result->h_addr_list
+ = alloc_buffer_alloc_array (&abuf, char *,
+ ptrlist_size (&addresses) + 1);
+ result->h_aliases
+ = alloc_buffer_alloc_array (&abuf, char *,
+ ptrlist_size (&aliases));
+ if (alloc_buffer_has_failed (&abuf))
+ {
+ /* Retry using the ERANGE protocol. */
+ *errnop = ERANGE;
+ *h_errnop = NETDB_INTERNAL;
+ status = NSS_STATUS_TRYAGAIN;
+ }
+ else
+ {
+ /* Copy the address list and NULL-terminate it. */
+ memcpy (result->h_addr_list, ptrlist_begin (&addresses),
+ ptrlist_size (&addresses) * sizeof (char *));
+ result->h_addr_list[ptrlist_size (&addresses)] = NULL;
+
+ /* Sort the address list if requested. */
+ if (type == T_A && __resolv_context_sort_count (ctx) > 0)
+ addrsort (ctx, result->h_addr_list, ptrlist_size (&addresses));
- result->h_addrtype = AF_INET;
- result->h_length = INADDRSZ;
+ /* Copy the aliases, excluding the last one. */
+ memcpy (result->h_aliases, ptrlist_begin (&aliases),
+ (ptrlist_size (&aliases) - 1) * sizeof (char *));
+ result->h_aliases[ptrlist_size (&aliases) - 1] = NULL;
+
+ /* The last alias goes into h_name. */
+ assert (ptrlist_size (&aliases) >= 1);
+ result->h_name = ptrlist_end (&aliases)[-1];
+
+ /* This is also the canonical name. */
+ if (canonp != NULL)
+ *canonp = result->h_name;
+ }
+ }
+
+ ptrlist_free (&aliases);
+ ptrlist_free (&addresses);
}
- status = getanswer_r
- (ctx, host_buffer.buf, n, name, type, result, buffer, buflen,
- errnop, h_errnop, map, ttlp, canonp);
- if (host_buffer.buf != orig_host_buffer)
- free (host_buffer.buf);
+ if (alt_dns_packet_buffer != dns_packet_buffer)
+ free (alt_dns_packet_buffer);
return status;
}
*h_errnop = NETDB_INTERNAL;
return NSS_STATUS_UNAVAIL;
}
- status = NSS_STATUS_NOTFOUND;
- if (res_use_inet6 ())
- status = gethostbyname3_context (ctx, name, AF_INET6, result, buffer,
- buflen, errnop, h_errnop, NULL, NULL);
- if (status == NSS_STATUS_NOTFOUND)
- status = gethostbyname3_context (ctx, name, AF_INET, result, buffer,
- buflen, errnop, h_errnop, NULL, NULL);
+ status = gethostbyname3_context (ctx, name, AF_INET, result, buffer,
+ buflen, errnop, h_errnop, NULL, NULL);
__resolv_context_put (ctx);
return status;
}
name = cp;
}
- union
- {
- querybuf *buf;
- u_char *ptr;
- } host_buffer;
- querybuf *orig_host_buffer;
- host_buffer.buf = orig_host_buffer = (querybuf *) alloca (2048);
+ unsigned char dns_packet_buffer[2048];
+ unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
u_char *ans2p = NULL;
int nans2p = 0;
int resplen2 = 0;
int ans2p_malloced = 0;
+ struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
int olderr = errno;
int n = __res_context_search (ctx, name, C_IN, T_QUERY_A_AND_AAAA,
- host_buffer.buf->buf, 2048, &host_buffer.ptr,
- &ans2p, &nans2p, &resplen2, &ans2p_malloced);
+ dns_packet_buffer, sizeof (dns_packet_buffer),
+ &alt_dns_packet_buffer, &ans2p, &nans2p,
+ &resplen2, &ans2p_malloced);
if (n >= 0)
{
- status = gaih_getanswer (host_buffer.buf, n, (const querybuf *) ans2p,
- resplen2, name, pat, buffer, buflen,
- errnop, herrnop, ttlp);
+ status = gaih_getanswer (alt_dns_packet_buffer, n, ans2p, resplen2,
+ &abuf, pat, errnop, herrnop, ttlp);
}
else
{
__set_errno (olderr);
}
+ /* Implement the buffer resizing protocol. */
+ if (alloc_buffer_has_failed (&abuf))
+ {
+ *errnop = ERANGE;
+ *herrnop = NETDB_INTERNAL;
+ status = NSS_STATUS_TRYAGAIN;
+ }
+
/* Check whether ans2p was separately allocated. */
if (ans2p_malloced)
free (ans2p);
- if (host_buffer.buf != orig_host_buffer)
- free (host_buffer.buf);
+ if (alt_dns_packet_buffer != dns_packet_buffer)
+ free (alt_dns_packet_buffer);
__resolv_context_put (ctx);
return status;
static const u_char tunnelled[] = { 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
static const u_char v6local[] = { 0,0, 0,1 };
const u_char *uaddr = (const u_char *)addr;
- struct host_data
- {
- char *aliases[MAX_NR_ALIASES];
- unsigned char host_addr[16]; /* IPv4 or IPv6 */
- char *h_addr_ptrs[MAX_NR_ADDRS + 1];
- char linebuffer[0];
- } *host_data = (struct host_data *) buffer;
- union
- {
- querybuf *buf;
- u_char *ptr;
- } host_buffer;
- querybuf *orig_host_buffer;
char qbuf[MAXDNAME+1], *qp = NULL;
size_t size;
int n, status;
int olderr = errno;
- uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data);
- buffer += pad;
- buflen = buflen > pad ? buflen - pad : 0;
-
- if (__glibc_unlikely (buflen < sizeof (struct host_data)))
- {
- *errnop = ERANGE;
- *h_errnop = NETDB_INTERNAL;
- return NSS_STATUS_TRYAGAIN;
- }
-
- host_data = (struct host_data *) buffer;
+ /* Prepare the allocation buffer. Store the pointer array first, to
+ benefit from buffer alignment. */
+ struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen);
+ char **address_array = alloc_buffer_alloc_array (&abuf, char *, 2);
+ if (address_array == NULL)
+ {
+ *errnop = ERANGE;
+ *h_errnop = NETDB_INTERNAL;
+ return NSS_STATUS_TRYAGAIN;
+ }
struct resolv_context *ctx = __resolv_context_get ();
if (ctx == NULL)
return NSS_STATUS_UNAVAIL;
}
- host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024);
-
switch (af)
{
case AF_INET:
break;
}
- n = __res_context_query (ctx, qbuf, C_IN, T_PTR, host_buffer.buf->buf,
- 1024, &host_buffer.ptr, NULL, NULL, NULL, NULL);
+ unsigned char dns_packet_buffer[1024];
+ unsigned char *alt_dns_packet_buffer = dns_packet_buffer;
+ n = __res_context_query (ctx, qbuf, C_IN, T_PTR,
+ dns_packet_buffer, sizeof (dns_packet_buffer),
+ &alt_dns_packet_buffer,
+ NULL, NULL, NULL, NULL);
if (n < 0)
{
*h_errnop = h_errno;
__set_errno (olderr);
- if (host_buffer.buf != orig_host_buffer)
- free (host_buffer.buf);
+ if (alt_dns_packet_buffer != dns_packet_buffer)
+ free (alt_dns_packet_buffer);
__resolv_context_put (ctx);
return errno == ECONNREFUSED ? NSS_STATUS_UNAVAIL : NSS_STATUS_NOTFOUND;
}
- status = getanswer_r
- (ctx, host_buffer.buf, n, qbuf, T_PTR, result, buffer, buflen,
- errnop, h_errnop, 0 /* XXX */, ttlp, NULL);
- if (host_buffer.buf != orig_host_buffer)
- free (host_buffer.buf);
+ status = getanswer_ptr (alt_dns_packet_buffer, n,
+ &abuf, &result->h_name, errnop, h_errnop, ttlp);
+
+ if (alt_dns_packet_buffer != dns_packet_buffer)
+ free (alt_dns_packet_buffer);
+ __resolv_context_put (ctx);
+
if (status != NSS_STATUS_SUCCESS)
- {
- __resolv_context_put (ctx);
- return status;
- }
+ return status;
+ /* result->h_name has already been set by getanswer_ptr. */
result->h_addrtype = af;
result->h_length = len;
- memcpy (host_data->host_addr, addr, len);
- host_data->h_addr_ptrs[0] = (char *) host_data->host_addr;
- host_data->h_addr_ptrs[1] = NULL;
+ /* Increase the alignment to 4, in case there are applications out
+ there that expect at least this level of address alignment. */
+ address_array[0] = (char *) alloc_buffer_next (&abuf, uint32_t);
+ alloc_buffer_copy_bytes (&abuf, uaddr, len);
+ address_array[1] = NULL;
+
+ /* This check also covers allocation failure in getanswer_ptr. */
+ if (alloc_buffer_has_failed (&abuf))
+ {
+ *errnop = ERANGE;
+ *h_errnop = NETDB_INTERNAL;
+ return NSS_STATUS_TRYAGAIN;
+ }
+ result->h_addr_list = address_array;
+ result->h_aliases = &address_array[1]; /* Points to NULL. */
+
*h_errnop = NETDB_SUCCESS;
- __resolv_context_put (ctx);
return NSS_STATUS_SUCCESS;
}
libc_hidden_def (_nss_dns_gethostbyaddr2_r)
break;
}
-static enum nss_status
-getanswer_r (struct resolv_context *ctx,
- const querybuf *answer, int anslen, const char *qname, int qtype,
- struct hostent *result, char *buffer, size_t buflen,
- int *errnop, int *h_errnop, int map, int32_t *ttlp, char **canonp)
+/* Convert the uncompressed, binary domain name CDNAME into its
+ textual representation and add it to the end of ALIASES, allocating
+ space for a copy of the name from ABUF. Skip adding the name if it
+ is not a valid host name, and return false in that case, otherwise
+ true. */
+static bool
+getanswer_r_store_alias (const unsigned char *cdname,
+ struct alloc_buffer *abuf,
+ struct ptrlist *aliases)
{
- struct host_data
- {
- char *aliases[MAX_NR_ALIASES];
- unsigned char host_addr[16]; /* IPv4 or IPv6 */
- char *h_addr_ptrs[0];
- } *host_data;
- int linebuflen;
- const HEADER *hp;
- const u_char *end_of_message, *cp;
- int n, ancount, qdcount;
- int haveanswer, had_error;
- char *bp, **ap, **hap;
- char tbuf[MAXDNAME];
- const char *tname;
- int (*name_ok) (const char *);
- u_char packtmp[NS_MAXCDNAME];
- int have_to_map = 0;
- uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data);
- buffer += pad;
- buflen = buflen > pad ? buflen - pad : 0;
- if (__glibc_unlikely (buflen < sizeof (struct host_data)))
- {
- /* The buffer is too small. */
- too_small:
- *errnop = ERANGE;
- *h_errnop = NETDB_INTERNAL;
- return NSS_STATUS_TRYAGAIN;
- }
- host_data = (struct host_data *) buffer;
- linebuflen = buflen - sizeof (struct host_data);
- if (buflen - sizeof (struct host_data) != linebuflen)
- linebuflen = INT_MAX;
-
- tname = qname;
- result->h_name = NULL;
- end_of_message = answer->buf + anslen;
- switch (qtype)
- {
- case T_A:
- case T_AAAA:
- name_ok = __libc_res_hnok;
- break;
- case T_PTR:
- name_ok = __libc_res_dnok;
- break;
- default:
- *errnop = ENOENT;
- return NSS_STATUS_UNAVAIL; /* XXX should be abort(); */
- }
+ /* Filter out domain names that are not host names. */
+ if (!__res_binary_hnok (cdname))
+ return false;
+
+ /* Note: Not NS_MAXCDNAME, so that __ns_name_ntop implicitly checks
+ for length. */
+ char dname[MAXHOSTNAMELEN + 1];
+ if (__ns_name_ntop (cdname, dname, sizeof (dname)) < 0)
+ return false;
+ /* Do not report an error on allocation failure, instead store NULL
+ or do nothing. getanswer_r's caller will see NSS_STATUS_SUCCESS
+ and detect the memory allocation failure or buffer space
+ exhaustion, and report it accordingly. */
+ ptrlist_add (aliases, alloc_buffer_copy_string (abuf, dname));
+ return true;
+}
- /*
- * find first satisfactory answer
- */
- hp = &answer->hdr;
- ancount = ntohs (hp->ancount);
- qdcount = ntohs (hp->qdcount);
- cp = answer->buf + HFIXEDSZ;
- if (__glibc_unlikely (qdcount != 1))
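+/* Parse the A/AAAA response in the PACKETLEN bytes at PACKET.  Host
+   names from the QNAME/CNAME chain are appended to *ALIASES and
+   matching addresses to *ADDRESSES, with storage for both taken
+   from *ABUF.  */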
+static enum nss_status __attribute__ ((noinline))
+getanswer_r (unsigned char *packet, size_t packetlen, uint16_t qtype,
+ struct alloc_buffer *abuf,
+ struct ptrlist *addresses, struct ptrlist *aliases,
+ int *errnop, int *h_errnop, int32_t *ttlp)
+{
+ struct ns_rr_cursor c;
+ if (!__ns_rr_cursor_init (&c, packet, packetlen))
{
+ /* This should not happen because __res_context_query already
+         performs response validation. */
*h_errnop = NO_RECOVERY;
return NSS_STATUS_UNAVAIL;
}
- if (sizeof (struct host_data) + (ancount + 1) * sizeof (char *) >= buflen)
- goto too_small;
- bp = (char *) &host_data->h_addr_ptrs[ancount + 1];
- linebuflen -= (ancount + 1) * sizeof (char *);
-
- n = __ns_name_unpack (answer->buf, end_of_message, cp,
- packtmp, sizeof packtmp);
- if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
- {
- if (__glibc_unlikely (errno == EMSGSIZE))
- goto too_small;
- n = -1;
- }
-
- if (__glibc_unlikely (n < 0))
+ /* Treat the QNAME just like an alias. Error out if it is not a
+ valid host name. */
+ if (ns_rr_cursor_rcode (&c) == NXDOMAIN
+ || !getanswer_r_store_alias (ns_rr_cursor_qname (&c), abuf, aliases))
{
- *errnop = errno;
- *h_errnop = NO_RECOVERY;
- return NSS_STATUS_UNAVAIL;
- }
- if (__glibc_unlikely (name_ok (bp) == 0))
- {
- errno = EBADMSG;
- *errnop = EBADMSG;
- *h_errnop = NO_RECOVERY;
- return NSS_STATUS_UNAVAIL;
+ if (ttlp != NULL)
+ /* No negative caching. */
+ *ttlp = 0;
+ *h_errnop = HOST_NOT_FOUND;
+ *errnop = ENOENT;
+ return NSS_STATUS_NOTFOUND;
}
- cp += n + QFIXEDSZ;
- if (qtype == T_A || qtype == T_AAAA)
+ int ancount = ns_rr_cursor_ancount (&c);
+ const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+ /* expected_name may be updated to point into this buffer. */
+ unsigned char name_buffer[NS_MAXCDNAME];
+
+ for (; ancount > 0; --ancount)
{
- /* res_send() has already verified that the query name is the
- * same as the one we sent; this just gets the expanded name
- * (i.e., with the succeeding search-domain tacked on).
- */
- n = strlen (bp) + 1; /* for the \0 */
- if (n >= MAXHOSTNAMELEN)
+ struct ns_rr_wire rr;
+ if (!__ns_rr_cursor_next (&c, &rr))
{
*h_errnop = NO_RECOVERY;
- *errnop = ENOENT;
- return NSS_STATUS_TRYAGAIN;
+ return NSS_STATUS_UNAVAIL;
}
- result->h_name = bp;
- bp += n;
- linebuflen -= n;
- if (linebuflen < 0)
- goto too_small;
- /* The qname can be abbreviated, but h_name is now absolute. */
- qname = result->h_name;
- }
- ap = host_data->aliases;
- *ap = NULL;
- result->h_aliases = host_data->aliases;
- hap = host_data->h_addr_ptrs;
- *hap = NULL;
- result->h_addr_list = host_data->h_addr_ptrs;
- haveanswer = 0;
- had_error = 0;
+ /* Skip over records with the wrong class. */
+ if (rr.rclass != C_IN)
+ continue;
- while (ancount-- > 0 && cp < end_of_message && had_error == 0)
- {
- int type, class;
+ /* Update TTL for recognized record types. */
+ if ((rr.rtype == T_CNAME || rr.rtype == qtype)
+ && ttlp != NULL && *ttlp > rr.ttl)
+ *ttlp = rr.ttl;
- n = __ns_name_unpack (answer->buf, end_of_message, cp,
- packtmp, sizeof packtmp);
- if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
+ if (rr.rtype == T_CNAME)
{
- if (__glibc_unlikely (errno == EMSGSIZE))
- goto too_small;
-
- n = -1;
+ /* NB: No check for owner name match, based on historic
+ precedent. Record the CNAME target as the new expected
+ name. */
+ int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+ name_buffer, sizeof (name_buffer));
+ if (n < 0)
+ {
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
+ }
+ /* And store the new name as an alias. */
+ getanswer_r_store_alias (name_buffer, abuf, aliases);
+ expected_name = name_buffer;
}
-
- if (__glibc_unlikely (n < 0 || (*name_ok) (bp) == 0))
+ else if (rr.rtype == qtype
+ && __ns_samebinaryname (rr.rname, expected_name)
+ && rr.rdlength == rrtype_to_rdata_length (qtype))
{
- ++had_error;
- continue;
+ /* Make a copy of the address and store it. Increase the
+ alignment to 4, in case there are applications out there
+ that expect at least this level of address alignment. */
+ ptrlist_add (addresses, (char *) alloc_buffer_next (abuf, uint32_t));
+ alloc_buffer_copy_bytes (abuf, rr.rdata, rr.rdlength);
}
- cp += n; /* name */
+ }
- if (__glibc_unlikely (cp + 10 > end_of_message))
- {
- ++had_error;
- continue;
- }
+ if (ptrlist_size (addresses) == 0)
+ {
+ /* No address record found. */
+ if (ttlp != NULL)
+ /* No caching of negative responses. */
+ *ttlp = 0;
- NS_GET16 (type, cp);
- NS_GET16 (class, cp);
- int32_t ttl;
- NS_GET32 (ttl, cp);
- NS_GET16 (n, cp); /* RDATA length. */
+ *h_errnop = NO_RECOVERY;
+ *errnop = ENOENT;
+ return NSS_STATUS_TRYAGAIN;
+ }
+ else
+ {
+ *h_errnop = NETDB_SUCCESS;
+ return NSS_STATUS_SUCCESS;
+ }
+}
- if (end_of_message - cp < n)
- {
- /* RDATA extends beyond the end of the packet. */
- ++had_error;
- continue;
- }
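+/* Parse the PTR response in the PACKETLEN bytes at PACKET, following
+   any CNAME chain.  The target of the first matching PTR record is
+   copied into *ABUF and stored in *HNAMEP; a syntactically invalid
+   host name causes the lookup to fail.  */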
+static enum nss_status
+getanswer_ptr (unsigned char *packet, size_t packetlen,
+ struct alloc_buffer *abuf, char **hnamep,
+ int *errnop, int *h_errnop, int32_t *ttlp)
+{
+ struct ns_rr_cursor c;
+ if (!__ns_rr_cursor_init (&c, packet, packetlen))
+ {
+ /* This should not happen because __res_context_query already
+         performs response validation. */
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
+ }
+ int ancount = ns_rr_cursor_ancount (&c);
+ const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+ /* expected_name may be updated to point into this buffer. */
+ unsigned char name_buffer[NS_MAXCDNAME];
- if (__glibc_unlikely (class != C_IN))
+ while (ancount > 0)
+ {
+ struct ns_rr_wire rr;
+ if (!__ns_rr_cursor_next (&c, &rr))
{
- /* XXX - debug? syslog? */
- cp += n;
- continue; /* XXX - had_error++ ? */
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
}
- if ((qtype == T_A || qtype == T_AAAA) && type == T_CNAME)
- {
- /* A CNAME could also have a TTL entry. */
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
-
- if (ap >= &host_data->aliases[MAX_NR_ALIASES - 1])
- continue;
- n = __libc_dn_expand (answer->buf, end_of_message, cp,
- tbuf, sizeof tbuf);
- if (__glibc_unlikely (n < 0 || (*name_ok) (tbuf) == 0))
- {
- ++had_error;
- continue;
- }
- cp += n;
- /* Store alias. */
- *ap++ = bp;
- n = strlen (bp) + 1; /* For the \0. */
- if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
- {
- ++had_error;
- continue;
- }
- bp += n;
- linebuflen -= n;
- /* Get canonical name. */
- n = strlen (tbuf) + 1; /* For the \0. */
- if (__glibc_unlikely (n > linebuflen))
- goto too_small;
- if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
- {
- ++had_error;
- continue;
- }
- result->h_name = bp;
- bp = __mempcpy (bp, tbuf, n); /* Cannot overflow. */
- linebuflen -= n;
- continue;
- }
+ /* Skip over records with the wrong class. */
+ if (rr.rclass != C_IN)
+ continue;
- if (qtype == T_PTR && type == T_CNAME)
- {
- /* A CNAME could also have a TTL entry. */
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
+ /* Update TTL for known record types. */
+ if ((rr.rtype == T_CNAME || rr.rtype == T_PTR)
+ && ttlp != NULL && *ttlp > rr.ttl)
+ *ttlp = rr.ttl;
- n = __libc_dn_expand (answer->buf, end_of_message, cp,
- tbuf, sizeof tbuf);
- if (__glibc_unlikely (n < 0 || __libc_res_dnok (tbuf) == 0))
- {
- ++had_error;
- continue;
- }
- cp += n;
- /* Get canonical name. */
- n = strlen (tbuf) + 1; /* For the \0. */
- if (__glibc_unlikely (n > linebuflen))
- goto too_small;
- if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
+ if (rr.rtype == T_CNAME)
+ {
+ /* NB: No check for owner name match, based on historic
+ precedent. Record the CNAME target as the new expected
+ name. */
+ int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+ name_buffer, sizeof (name_buffer));
+ if (n < 0)
{
- ++had_error;
- continue;
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
}
- tname = bp;
- bp = __mempcpy (bp, tbuf, n); /* Cannot overflow. */
- linebuflen -= n;
- continue;
+ expected_name = name_buffer;
}
-
- if (type == T_A && qtype == T_AAAA && map)
- have_to_map = 1;
- else if (__glibc_unlikely (type != qtype))
+ else if (rr.rtype == T_PTR
+ && __ns_samebinaryname (rr.rname, expected_name))
{
- cp += n;
- continue; /* XXX - had_error++ ? */
- }
-
- switch (type)
- {
- case T_PTR:
- if (__glibc_unlikely (__strcasecmp (tname, bp) != 0))
+ /* Decompress the target of the PTR record. This is the
+ host name we are looking for. We can only use it if it
+ is syntactically valid. Historically, only one host name
+ is returned here. If the recursive resolver performs DNS
+ record rotation, the returned host name is essentially
+ random, which is why multiple PTR records are rarely
+ used. Use MAXHOSTNAMELEN instead of NS_MAXCDNAME for
+ additional length checking. */
+ char hname[MAXHOSTNAMELEN + 1];
+ if (__ns_name_unpack (c.begin, c.end, rr.rdata,
+ name_buffer, sizeof (name_buffer)) < 0
+	      || !__res_binary_hnok (name_buffer)
+ || __ns_name_ntop (name_buffer, hname, sizeof (hname)) < 0)
{
- cp += n;
- continue; /* XXX - had_error++ ? */
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
}
-
- n = __ns_name_unpack (answer->buf, end_of_message, cp,
- packtmp, sizeof packtmp);
- if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1)
- {
- if (__glibc_unlikely (errno == EMSGSIZE))
- goto too_small;
-
- n = -1;
- }
-
- if (__glibc_unlikely (n < 0 || __libc_res_hnok (bp) == 0))
- {
- ++had_error;
- break;
- }
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
- /* bind would put multiple PTR records as aliases, but we don't do
- that. */
- result->h_name = bp;
- *h_errnop = NETDB_SUCCESS;
+ /* Successful allocation is checked by the caller. */
+ *hnamep = alloc_buffer_copy_string (abuf, hname);
return NSS_STATUS_SUCCESS;
- case T_A:
- case T_AAAA:
- if (__glibc_unlikely (__strcasecmp (result->h_name, bp) != 0))
- {
- cp += n;
- continue; /* XXX - had_error++ ? */
- }
-
- /* Stop parsing at a record whose length is incorrect. */
- if (n != rrtype_to_rdata_length (type))
- {
- ++had_error;
- break;
- }
-
- /* Skip records of the wrong type. */
- if (n != result->h_length)
- {
- cp += n;
- continue;
- }
- if (!haveanswer)
- {
- int nn;
-
- /* We compose a single hostent out of the entire chain of
- entries, so the TTL of the hostent is essentially the lowest
- TTL in the chain. */
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
- if (canonp != NULL)
- *canonp = bp;
- result->h_name = bp;
- nn = strlen (bp) + 1; /* for the \0 */
- bp += nn;
- linebuflen -= nn;
- }
-
- /* Provide sufficient alignment for both address
- families. */
- enum { align = 4 };
- _Static_assert ((align % __alignof__ (struct in_addr)) == 0,
- "struct in_addr alignment");
- _Static_assert ((align % __alignof__ (struct in6_addr)) == 0,
- "struct in6_addr alignment");
- {
- char *new_bp = PTR_ALIGN_UP (bp, align);
- linebuflen -= new_bp - bp;
- bp = new_bp;
- }
-
- if (__glibc_unlikely (n > linebuflen))
- goto too_small;
- bp = __mempcpy (*hap++ = bp, cp, n);
- cp += n;
- linebuflen -= n;
- break;
- default:
- abort ();
}
- if (had_error == 0)
- ++haveanswer;
}
- if (haveanswer > 0)
- {
- *ap = NULL;
- *hap = NULL;
- /*
- * Note: we sort even if host can take only one address
- * in its return structures - should give it the "best"
- * address in that case, not some random one
- */
- if (haveanswer > 1 && qtype == T_A
- && __resolv_context_sort_count (ctx) > 0)
- addrsort (ctx, host_data->h_addr_ptrs, haveanswer);
-
- if (result->h_name == NULL)
- {
- n = strlen (qname) + 1; /* For the \0. */
- if (n > linebuflen)
- goto too_small;
- if (n >= MAXHOSTNAMELEN)
- goto no_recovery;
- result->h_name = bp;
- bp = __mempcpy (bp, qname, n); /* Cannot overflow. */
- linebuflen -= n;
- }
+ /* No PTR record found. */
+ if (ttlp != NULL)
+ /* No caching of negative responses. */
+ *ttlp = 0;
- if (have_to_map)
- if (map_v4v6_hostent (result, &bp, &linebuflen))
- goto too_small;
- *h_errnop = NETDB_SUCCESS;
- return NSS_STATUS_SUCCESS;
- }
- no_recovery:
*h_errnop = NO_RECOVERY;
*errnop = ENOENT;
- /* Special case here: if the resolver sent a result but it only
- contains a CNAME while we are looking for a T_A or T_AAAA record,
- we fail with NOTFOUND instead of TRYAGAIN. */
- return ((qtype == T_A || qtype == T_AAAA) && ap != host_data->aliases
- ? NSS_STATUS_NOTFOUND : NSS_STATUS_TRYAGAIN);
+ return NSS_STATUS_TRYAGAIN;
}
-
+/* Parses DNS data found in PACKETLEN bytes at PACKET into struct
+ gaih_addrtuple address tuples. The new address tuples are linked
+ from **TAILP, with backing store allocated from ABUF, and *TAILP is
+ updated to point where the next tuple pointer should be stored. If
+ TTLP is not null, *TTLP is updated to reflect the minimum TTL. If
+ STORE_CANON is true, the canonical name is stored as part of the
+ first address tuple being written. */
static enum nss_status
-gaih_getanswer_slice (const querybuf *answer, int anslen, const char *qname,
- struct gaih_addrtuple ***patp,
- char **bufferp, size_t *buflenp,
- int *errnop, int *h_errnop, int32_t *ttlp, int *firstp)
+gaih_getanswer_slice (unsigned char *packet, size_t packetlen,
+ struct alloc_buffer *abuf,
+ struct gaih_addrtuple ***tailp,
+ int *errnop, int *h_errnop, int32_t *ttlp,
+ bool store_canon)
{
- char *buffer = *bufferp;
- size_t buflen = *buflenp;
-
- struct gaih_addrtuple **pat = *patp;
- const HEADER *hp = &answer->hdr;
- int ancount = ntohs (hp->ancount);
- int qdcount = ntohs (hp->qdcount);
- const u_char *cp = answer->buf + HFIXEDSZ;
- const u_char *end_of_message = answer->buf + anslen;
- if (__glibc_unlikely (qdcount != 1))
- {
- *h_errnop = NO_RECOVERY;
- return NSS_STATUS_UNAVAIL;
- }
-
- u_char packtmp[NS_MAXCDNAME];
- int n = __ns_name_unpack (answer->buf, end_of_message, cp,
- packtmp, sizeof packtmp);
- /* We unpack the name to check it for validity. But we do not need
- it later. */
- if (n != -1 && __ns_name_ntop (packtmp, buffer, buflen) == -1)
- {
- if (__glibc_unlikely (errno == EMSGSIZE))
- {
- too_small:
- *errnop = ERANGE;
- *h_errnop = NETDB_INTERNAL;
- return NSS_STATUS_TRYAGAIN;
- }
-
- n = -1;
- }
-
- if (__glibc_unlikely (n < 0))
+ struct ns_rr_cursor c;
+ if (!__ns_rr_cursor_init (&c, packet, packetlen))
{
- *errnop = errno;
+ /* This should not happen because __res_context_query already
+         performs response validation. */
*h_errnop = NO_RECOVERY;
return NSS_STATUS_UNAVAIL;
}
- if (__glibc_unlikely (__libc_res_hnok (buffer) == 0))
- {
- errno = EBADMSG;
- *errnop = EBADMSG;
- *h_errnop = NO_RECOVERY;
- return NSS_STATUS_UNAVAIL;
- }
- cp += n + QFIXEDSZ;
-
- int haveanswer = 0;
- int had_error = 0;
- char *canon = NULL;
- char *h_name = NULL;
- int h_namelen = 0;
-
- if (ancount == 0)
+ bool haveanswer = false; /* Set to true if at least one address. */
+ uint16_t qtype = ns_rr_cursor_qtype (&c);
+ int ancount = ns_rr_cursor_ancount (&c);
+ const unsigned char *expected_name = ns_rr_cursor_qname (&c);
+ /* expected_name may be updated to point into this buffer. */
+ unsigned char name_buffer[NS_MAXCDNAME];
+
+ /* This is a pointer to a possibly-compressed name in the packet.
+ Eventually it is equivalent to the canonical name. If needed, it
+ is uncompressed and translated to text form when the first
+ address tuple is encountered. */
+ const unsigned char *compressed_alias_name = expected_name;
+
+ if (ancount == 0 || !__res_binary_hnok (compressed_alias_name))
{
*h_errnop = HOST_NOT_FOUND;
return NSS_STATUS_NOTFOUND;
}
- while (ancount-- > 0 && cp < end_of_message && had_error == 0)
+  for (; ancount > 0; --ancount)
{
- n = __ns_name_unpack (answer->buf, end_of_message, cp,
- packtmp, sizeof packtmp);
- if (n != -1 &&
- (h_namelen = __ns_name_ntop (packtmp, buffer, buflen)) == -1)
+ struct ns_rr_wire rr;
+ if (!__ns_rr_cursor_next (&c, &rr))
{
- if (__glibc_unlikely (errno == EMSGSIZE))
- goto too_small;
-
- n = -1;
- }
- if (__glibc_unlikely (n < 0 || __libc_res_hnok (buffer) == 0))
- {
- ++had_error;
- continue;
- }
- if (*firstp && canon == NULL)
- {
- h_name = buffer;
- buffer += h_namelen;
- buflen -= h_namelen;
- }
-
- cp += n; /* name */
-
- if (__glibc_unlikely (cp + 10 > end_of_message))
- {
- ++had_error;
- continue;
- }
-
- uint16_t type;
- NS_GET16 (type, cp);
- uint16_t class;
- NS_GET16 (class, cp);
- int32_t ttl;
- NS_GET32 (ttl, cp);
- NS_GET16 (n, cp); /* RDATA length. */
-
- if (end_of_message - cp < n)
- {
- /* RDATA extends beyond the end of the packet. */
- ++had_error;
- continue;
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
}
- if (class != C_IN)
- {
- cp += n;
- continue;
- }
+ /* Update TTL for known record types. */
+ if ((rr.rtype == T_CNAME || rr.rtype == qtype)
+ && ttlp != NULL && *ttlp > rr.ttl)
+ *ttlp = rr.ttl;
- if (type == T_CNAME)
+ if (rr.rtype == T_CNAME)
{
- char tbuf[MAXDNAME];
-
- /* A CNAME could also have a TTL entry. */
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
-
- n = __libc_dn_expand (answer->buf, end_of_message, cp,
- tbuf, sizeof tbuf);
- if (__glibc_unlikely (n < 0 || __libc_res_hnok (tbuf) == 0))
- {
- ++had_error;
- continue;
- }
- cp += n;
-
- if (*firstp)
+ /* NB: No check for owner name match, based on historic
+ precedent. Record the CNAME target as the new expected
+ name. */
+ int n = __ns_name_unpack (c.begin, c.end, rr.rdata,
+ name_buffer, sizeof (name_buffer));
+ if (n < 0)
{
- /* Reclaim buffer space. */
- if (h_name + h_namelen == buffer)
- {
- buffer = h_name;
- buflen += h_namelen;
- }
-
- n = strlen (tbuf) + 1;
- if (__glibc_unlikely (n > buflen))
- goto too_small;
- if (__glibc_unlikely (n >= MAXHOSTNAMELEN))
- {
- ++had_error;
- continue;
- }
-
- canon = buffer;
- buffer = __mempcpy (buffer, tbuf, n);
- buflen -= n;
- h_namelen = 0;
+ *h_errnop = NO_RECOVERY;
+ return NSS_STATUS_UNAVAIL;
}
- continue;
+ expected_name = name_buffer;
+ if (store_canon && __res_binary_hnok (name_buffer))
+ /* This name can be used as a canonical name. Do not
+ translate to text form here to conserve buffer space.
+ Point to the compressed name because name_buffer can be
+ overwritten with an unusable name later. */
+ compressed_alias_name = rr.rdata;
}
-
- /* Stop parsing if we encounter a record with incorrect RDATA
- length. */
- if (type == T_A || type == T_AAAA)
+ else if (rr.rtype == qtype
+ && __ns_samebinaryname (rr.rname, expected_name)
+ && rr.rdlength == rrtype_to_rdata_length (qtype))
{
- if (n != rrtype_to_rdata_length (type))
+ struct gaih_addrtuple *ntup
+ = alloc_buffer_alloc (abuf, struct gaih_addrtuple);
+ /* Delay error reporting to the callers (they implement the
+ ERANGE buffer resizing handshake). */
+ if (ntup != NULL)
{
- ++had_error;
- continue;
+ ntup->next = NULL;
+ if (store_canon && compressed_alias_name != NULL)
+ {
+ /* This assumes that all the CNAME records come
+ first. Use MAXHOSTNAMELEN instead of
+ NS_MAXCDNAME for additional length checking.
+ However, these checks are not expected to fail
+		     because all names of size NS_MAXCDNAME should fit into
+ the hname buffer because no escaping is
+ needed. */
+		  unsigned char nbuf[NS_MAXCDNAME];
+ char hname[MAXHOSTNAMELEN + 1];
+ if (__ns_name_unpack (c.begin, c.end,
+ compressed_alias_name,
+ nbuf, sizeof (nbuf)) >= 0
+ && __ns_name_ntop (nbuf, hname, sizeof (hname)) >= 0)
+ /* Space checking is performed by the callers. */
+ ntup->name = alloc_buffer_copy_string (abuf, hname);
+ store_canon = false;
+ }
+ else
+ ntup->name = NULL;
+ if (rr.rdlength == 4)
+ ntup->family = AF_INET;
+ else
+ ntup->family = AF_INET6;
+ memcpy (ntup->addr, rr.rdata, rr.rdlength);
+ ntup->scopeid = 0;
+
+ /* Link in the new tuple, and update the tail pointer to
+ point to its next field. */
+ **tailp = ntup;
+ *tailp = &ntup->next;
+
+ haveanswer = true;
}
}
- else
- {
- /* Skip unknown records. */
- cp += n;
- continue;
- }
-
- assert (type == T_A || type == T_AAAA);
- if (*pat == NULL)
- {
- uintptr_t pad = (-(uintptr_t) buffer
- % __alignof__ (struct gaih_addrtuple));
- buffer += pad;
- buflen = buflen > pad ? buflen - pad : 0;
-
- if (__glibc_unlikely (buflen < sizeof (struct gaih_addrtuple)))
- goto too_small;
-
- *pat = (struct gaih_addrtuple *) buffer;
- buffer += sizeof (struct gaih_addrtuple);
- buflen -= sizeof (struct gaih_addrtuple);
- }
-
- (*pat)->name = NULL;
- (*pat)->next = NULL;
-
- if (*firstp)
- {
- /* We compose a single hostent out of the entire chain of
- entries, so the TTL of the hostent is essentially the lowest
- TTL in the chain. */
- if (ttlp != NULL && ttl < *ttlp)
- *ttlp = ttl;
-
- (*pat)->name = canon ?: h_name;
-
- *firstp = 0;
- }
-
- (*pat)->family = type == T_A ? AF_INET : AF_INET6;
- memcpy ((*pat)->addr, cp, n);
- cp += n;
- (*pat)->scopeid = 0;
-
- pat = &((*pat)->next);
-
- haveanswer = 1;
}
if (haveanswer)
{
- *patp = pat;
- *bufferp = buffer;
- *buflenp = buflen;
-
*h_errnop = NETDB_SUCCESS;
return NSS_STATUS_SUCCESS;
}
-
- /* Special case here: if the resolver sent a result but it only
- contains a CNAME while we are looking for a T_A or T_AAAA record,
- we fail with NOTFOUND instead of TRYAGAIN. */
- if (canon != NULL)
+ else
{
+ /* Special case here: if the resolver sent a result but it only
+ contains a CNAME while we are looking for a T_A or T_AAAA
+ record, we fail with NOTFOUND. */
*h_errnop = HOST_NOT_FOUND;
return NSS_STATUS_NOTFOUND;
}
-
- *h_errnop = NETDB_INTERNAL;
- return NSS_STATUS_TRYAGAIN;
}
static enum nss_status
-gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2,
- int anslen2, const char *qname,
- struct gaih_addrtuple **pat, char *buffer, size_t buflen,
+gaih_getanswer (unsigned char *packet1, size_t packet1len,
+ unsigned char *packet2, size_t packet2len,
+ struct alloc_buffer *abuf, struct gaih_addrtuple **pat,
int *errnop, int *h_errnop, int32_t *ttlp)
{
- int first = 1;
-
enum nss_status status = NSS_STATUS_NOTFOUND;
/* Combining the NSS status of two distinct queries requires some
between TRYAGAIN (recoverable) and TRYAGAIN' (not-recoverable).
A recoverable TRYAGAIN is almost always due to buffer size issues
and returns ERANGE in errno and the caller is expected to retry
- with a larger buffer.
+ with a larger buffer. (The caller, _nss_dns_gethostbyname4_r,
+ ignores the return status if it detects that the result buffer
+ has been exhausted and generates a TRYAGAIN failure with an
+ ERANGE code.)
Lastly, you may be tempted to make significant changes to the
conditions in this code to bring about symmetry between responses.
     is a recoverable error we now return TRYAGAIN even if the first
response was SUCCESS. */
- if (anslen1 > 0)
- status = gaih_getanswer_slice(answer1, anslen1, qname,
- &pat, &buffer, &buflen,
- errnop, h_errnop, ttlp,
- &first);
-
- if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND
- || (status == NSS_STATUS_TRYAGAIN
- /* We want to look at the second answer in case of an
- NSS_STATUS_TRYAGAIN only if the error is non-recoverable, i.e.
- *h_errnop is NO_RECOVERY. If not, and if the failure was due to
- an insufficient buffer (ERANGE), then we need to drop the results
- and pass on the NSS_STATUS_TRYAGAIN to the caller so that it can
- repeat the query with a larger buffer. */
- && (*errnop != ERANGE || *h_errnop == NO_RECOVERY)))
- && answer2 != NULL && anslen2 > 0)
+ if (packet1len > 0)
+ {
+ status = gaih_getanswer_slice (packet1, packet1len,
+ abuf, &pat, errnop, h_errnop, ttlp, true);
+ if (alloc_buffer_has_failed (abuf))
+ /* Do not try parsing the second packet if a larger result
+ buffer is needed. The caller implements the resizing
+ protocol because *abuf has been exhausted. */
+ return NSS_STATUS_TRYAGAIN; /* Ignored by the caller. */
+ }
+
+ if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND)
+ && packet2 != NULL && packet2len > 0)
{
- enum nss_status status2 = gaih_getanswer_slice(answer2, anslen2, qname,
- &pat, &buffer, &buflen,
- errnop, h_errnop, ttlp,
- &first);
+ enum nss_status status2
+ = gaih_getanswer_slice (packet2, packet2len,
+ abuf, &pat, errnop, h_errnop, ttlp,
+ /* Success means that data with a
+ canonical name has already been
+ stored. Do not store the name again. */
+ status != NSS_STATUS_SUCCESS);
/* Use the second response status in some cases. */
if (status != NSS_STATUS_SUCCESS && status2 != NSS_STATUS_NOTFOUND)
status = status2;
- /* Do not return a truncated second response (unless it was
- unavoidable e.g. unrecoverable TRYAGAIN). */
- if (status == NSS_STATUS_SUCCESS
- && (status2 == NSS_STATUS_TRYAGAIN
- && *errnop == ERANGE && *h_errnop != NO_RECOVERY))
- status = NSS_STATUS_TRYAGAIN;
}
return status;
return dn[0] > 0 && dn[1] == '-';
}
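+/* Return true if the wire-format domain name DN is acceptable as a
+   host name.  */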
+bool
+__res_binary_hnok (const unsigned char *dn)
+{
+ return !binary_leading_dash (dn) && binary_hnok (dn);
+}
+
/* Return 1 if res_hnok is a valid host name. Labels must only
contain [0-9a-zA-Z_-] characters, and the name must not start with
a '-'. The latter is to avoid confusion with program options. */
___res_hnok (const char *dn)
{
unsigned char buf[NS_MAXCDNAME];
- if (!printable_string (dn)
- || __ns_name_pton (dn, buf, sizeof (buf)) < 0
- || binary_leading_dash (buf))
- return 0;
- return binary_hnok (buf);
+ return (printable_string (dn)
+ && __ns_name_pton (dn, buf, sizeof (buf)) >= 0
+ && __res_binary_hnok (buf));
}
versioned_symbol (libc, ___res_hnok, res_hnok, GLIBC_2_34);
versioned_symbol (libc, ___res_hnok, __libc_res_hnok, GLIBC_PRIVATE);
--- /dev/null
+/* Test __ns_name_length_uncompressed.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <array_length.h>
+#include <errno.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/next_to_fault.h>
+
+/* Reference implementation based on other building blocks. */
+static int
+reference_length (const unsigned char *p, const unsigned char *eom)
+{
+ unsigned char buf[NS_MAXCDNAME];
+ int n = __ns_name_unpack (p, eom, p, buf, sizeof (buf));
+ if (n < 0)
+ return n;
+ const unsigned char *q = buf;
+ if (__ns_name_skip (&q, array_end (buf)) < 0)
+ return -1;
+ if (q - buf != n)
+ /* Compressed name. */
+ return -1;
+ return n;
+}
+
+static int
+do_test (void)
+{
+ {
+ unsigned char buf[] = { 3, 'w', 'w', 'w', 0, 0, 0 };
+ TEST_COMPARE (reference_length (buf, array_end (buf)), sizeof (buf) - 2);
+ TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)),
+ sizeof (buf) - 2);
+ TEST_COMPARE (reference_length (array_end (buf) - 1, array_end (buf)), 1);
+ TEST_COMPARE (__ns_name_length_uncompressed (array_end (buf) - 1,
+ array_end (buf)), 1);
+ buf[4] = 0xc0; /* Forward compression reference. */
+ buf[5] = 0x06;
+ TEST_COMPARE (reference_length (buf, array_end (buf)), -1);
+ TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)), -1);
+ }
+
+ struct support_next_to_fault ntf = support_next_to_fault_allocate (300);
+
+ /* Buffer region with all possible bytes at start and end. */
+ for (int length = 1; length <= 300; ++length)
+ {
+ unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+ unsigned char *start = end - length;
+ memset (start, 'X', length);
+ for (int first = 0; first <= 255; ++first)
+ {
+ *start = first;
+ for (int last = 0; last <= 255; ++last)
+ {
+ start[length - 1] = last;
+ TEST_COMPARE (reference_length (start, end),
+ __ns_name_length_uncompressed (start, end));
+ }
+ }
+ }
+
+ /* Poor man's fuzz testing: patch two bytes. */
+ {
+ unsigned char ref[] =
+ {
+ 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 3, 'n', 'e', 't', 0, 0, 0
+ };
+ TEST_COMPARE (reference_length (ref, array_end (ref)), 13);
+ TEST_COMPARE (__ns_name_length_uncompressed (ref, array_end (ref)), 13);
+
+ int good = 0;
+ int bad = 0;
+ for (int length = 1; length <= sizeof (ref); ++length)
+ {
+ unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+ unsigned char *start = end - length;
+ memcpy (start, ref, length);
+
+ for (int patch1_pos = 0; patch1_pos < length; ++patch1_pos)
+ {
+ for (int patch1_value = 0; patch1_value <= 255; ++patch1_value)
+ {
+ start[patch1_pos] = patch1_value;
+ for (int patch2_pos = 0; patch2_pos < length; ++patch2_pos)
+ {
+ for (int patch2_value = 0; patch2_value <= 255;
+ ++patch2_value)
+ {
+ start[patch2_pos] = patch2_value;
+ int expected = reference_length (start, end);
+ errno = EINVAL;
+ int actual
+ = __ns_name_length_uncompressed (start, end);
+ if (actual > 0)
+ ++good;
+ else
+ {
+ TEST_COMPARE (errno, EMSGSIZE);
+ ++bad;
+ }
+ TEST_COMPARE (expected, actual);
+ }
+ start[patch2_pos] = ref[patch2_pos];
+ }
+ }
+ start[patch1_pos] = ref[patch1_pos];
+ }
+ }
+ printf ("info: patched inputs with success: %d\n", good);
+ printf ("info: patched inputs with failure: %d\n", bad);
+ }
+
+ support_next_to_fault_free (&ntf);
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Tests for resource record parsing.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/next_to_fault.h>
+
+/* Reference packet for packet parsing. */
+static const unsigned char valid_packet[] =
+ { 0x11, 0x12, 0x13, 0x14,
+ 0x00, 0x01, /* Question count. */
+ 0x00, 0x02, /* Answer count. */
+ 0x21, 0x22, 0x23, 0x24, /* Other counts (not actually in packet). */
+ 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0,
+ 0x00, 0x1c, /* Question type: AAAA. */
+ 0x00, 0x01, /* Question class: IN. */
+ 0xc0, 0x0c, /* Compression reference to QNAME. */
+ 0x00, 0x1c, /* Record type: AAAA. */
+ 0x00, 0x01, /* Record class: IN. */
+ 0x12, 0x34, 0x56, 0x78, /* Record TTL. */
+ 0x00, 0x10, /* Record data length (16 bytes). */
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* IPv6 address. */
+ 0xc0, 0x0c, /* Compression reference to QNAME. */
+ 0x00, 0x1c, /* Record type: AAAA. */
+ 0x00, 0x01, /* Record class: IN. */
+ 0x11, 0x33, 0x55, 0x77, /* Record TTL. */
+ 0x00, 0x10, /* Record data length (16 bytes). */
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* IPv6 address. */
+ };
+
+/* Special offsets in valid_packet. */
+enum
+ {
+ offset_of_first_record = 29,
+ offset_of_second_record = 57,
+ };
+
+/* Check that parsing valid_packet succeeds. */
+static void
+test_valid (void)
+{
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, valid_packet,
+ sizeof (valid_packet)));
+ TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+ TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+ TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+ TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+ TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+ TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+ TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+ TEST_COMPARE (c.current - valid_packet, offset_of_first_record);
+
+ struct ns_rr_wire r;
+ TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+ TEST_COMPARE (r.rtype, T_AAAA);
+ TEST_COMPARE (r.rclass, C_IN);
+ TEST_COMPARE (r.ttl, 0x12345678);
+ TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+ "\x90\x91\x92\x93\x94\x95\x96\x97"
+ "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16);
+ TEST_COMPARE (c.current - valid_packet, offset_of_second_record);
+ TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+ TEST_COMPARE (r.rtype, T_AAAA);
+ TEST_COMPARE (r.rclass, C_IN);
+ TEST_COMPARE (r.ttl, 0x11335577);
+ TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+ "\xa8\xa9\xaa\xab\xac\xad\xae\xaf", 16);
+ TEST_VERIFY (c.current == c.end);
+}
+
+/* Check that trying to parse a packet with a compressed QNAME fails. */
+static void
+test_compressed_qname (void)
+{
+ static const unsigned char packet[] =
+ { 0x11, 0x12, 0x13, 0x14,
+ 0x00, 0x01, /* Question count. */
+ 0x00, 0x00, /* Answer count. */
+ 0x00, 0x00, 0x00, 0x00, /* Other counts. */
+ 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+ 0x00, 0x01, /* Question type: A. */
+ 0x00, 0x01, /* Question class: IN. */
+ };
+
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet)));
+}
+
+/* Check that trying to parse a packet with two questions fails. */
+static void
+test_two_questions (void)
+{
+ static const unsigned char packet[] =
+ { 0x11, 0x12, 0x13, 0x14,
+ 0x00, 0x02, /* Question count. */
+ 0x00, 0x00, /* Answer count. */
+ 0x00, 0x00, 0x00, 0x00, /* Other counts. */
+ 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+ 0x00, 0x01, /* Question type: A. */
+ 0x00, 0x01, /* Question class: IN. */
+ 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04,
+ 0x00, 0x1c, /* Question type: AAAA. */
+ 0x00, 0x01, /* Question class: IN. */
+ };
+
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet)));
+}
+
+/* Used to check that parsing truncated packets does not over-read. */
+static struct support_next_to_fault ntf;
+
+/* Truncated packet in the second resource record. */
+static void
+test_truncated_one_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+ unsigned char *start = end - length;
+
+ /* Produce the truncated packet. */
+ memcpy (start, valid_packet, length);
+
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length));
+ TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+ TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+ TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+ TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+ TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+ TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+ TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+ TEST_COMPARE (c.current - start, offset_of_first_record);
+
+ struct ns_rr_wire r;
+ TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r));
+ TEST_COMPARE (r.rtype, T_AAAA);
+ TEST_COMPARE (r.rclass, C_IN);
+ TEST_COMPARE (r.ttl, 0x12345678);
+ TEST_COMPARE_BLOB (r.rdata, r.rdlength,
+ "\x90\x91\x92\x93\x94\x95\x96\x97"
+ "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16);
+ TEST_COMPARE (c.current - start, offset_of_second_record);
+ TEST_VERIFY (!__ns_rr_cursor_next (&c, &r));
+}
+
+/* Truncated packet in the first resource record. */
+static void
+test_truncated_no_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+ unsigned char *start = end - length;
+
+ /* Produce the truncated packet. */
+ memcpy (start, valid_packet, length);
+
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length));
+ TEST_COMPARE (ns_rr_cursor_rcode (&c), 4);
+ TEST_COMPARE (ns_rr_cursor_ancount (&c), 2);
+ TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122);
+ TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324);
+ TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13);
+ TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA);
+ TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN);
+ TEST_COMPARE (c.current - start, offset_of_first_record);
+
+ struct ns_rr_wire r;
+ TEST_VERIFY (!__ns_rr_cursor_next (&c, &r));
+}
+
+/* Truncated packet before first resource record. */
+static void
+test_truncated_before_rr (size_t length)
+{
+  unsigned char *end = (unsigned char *) ntf.buffer + ntf.length;
+ unsigned char *start = end - length;
+
+ /* Produce the truncated packet. */
+ memcpy (start, valid_packet, length);
+
+ struct ns_rr_cursor c;
+ TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, start, length));
+}
+
+static int
+do_test (void)
+{
+ ntf = support_next_to_fault_allocate (sizeof (valid_packet));
+
+ test_valid ();
+ test_compressed_qname ();
+ test_two_questions ();
+
+ for (int length = offset_of_second_record; length < sizeof (valid_packet);
+ ++length)
+ test_truncated_one_rr (length);
+ for (int length = offset_of_first_record; length < offset_of_second_record;
+ ++length)
+ test_truncated_no_rr (length);
+ for (int length = 0; length < offset_of_first_record; ++length)
+ test_truncated_before_rr (length);
+
+ support_next_to_fault_free (&ntf);
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Test the __ns_samebinaryname function.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/nameser.h>
+#include <array_length.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <support/check.h>
+
+/* First character denotes the comparison group: All names with the
+ same first character are expected to compare equal. */
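+/* Names are in DNS wire format: each label is prefixed with its
+   length byte, and the string's terminating NUL byte acts as the
+   root label, so "\003www\007example\003com" encodes
+   www.example.com.  */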
+static const char *const cases[] =
+ {
+ " ",
+ "1\001a", "1\001A",
+ "2\002ab", "2\002aB", "2\002Ab", "2\002AB",
+ "3\001a\002ab", "3\001A\002ab",
+ "w\003www\007example\003com", "w\003Www\007Example\003Com",
+ "w\003WWW\007EXAMPLE\003COM",
+ "W\003WWW", "W\003www",
+ };
+
+static int
+do_test (void)
+{
+ for (int i = 0; i < array_length (cases); ++i)
+ for (int j = 0; j < array_length (cases); ++j)
+ {
+ unsigned char *a = (unsigned char *) &cases[i][1];
+ unsigned char *b = (unsigned char *) &cases[j][1];
+ bool actual = __ns_samebinaryname (a, b);
+ bool expected = cases[i][0] == cases[j][0];
+ if (actual != expected)
+ {
+ char a1[NS_MAXDNAME];
+ TEST_VERIFY (ns_name_ntop (a, a1, sizeof (a1)) > 0);
+ char b1[NS_MAXDNAME];
+ TEST_VERIFY (ns_name_ntop (b, b1, sizeof (b1)) > 0);
+ printf ("error: \"%s\" \"%s\": expected %s\n",
+		    a1, b1, expected ? "equal" : "unequal");
+ support_record_failure ();
+ }
+ }
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Test alias handling (mainly for gethostbyname).
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+
+#include "tst-resolv-maybe_insert_sig.h"
+
+/* QNAME format:
+
+ aADDRESSES-cCNAMES.example.net
+
+ CNAMES is the length of the CNAME chain, ADDRESSES is the number of
+ addresses in the response. The special value 255 means that there
+ are no addresses, and the RCODE is NXDOMAIN. */
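+/* For example, a query for a2-c3.example.net is answered with a
+   chain of three CNAME records followed by two address records.  */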
+static void
+response (const struct resolv_response_context *ctx,
+ struct resolv_response_builder *b,
+ const char *qname, uint16_t qclass, uint16_t qtype)
+{
+ TEST_COMPARE (qclass, C_IN);
+ if (qtype != T_A)
+ TEST_COMPARE (qtype, T_AAAA);
+
+ unsigned int addresses, cnames;
+ char *tail;
+ if (sscanf (qname, "a%u-c%u%ms", &addresses, &cnames, &tail) == 3)
+ {
+ if (strcmp (tail, ".example.com") == 0
+ || strcmp (tail, ".example.net.example.net") == 0
+ || strcmp (tail, ".example.net.example.com") == 0)
+ /* These only happen after NXDOMAIN. */
+ TEST_VERIFY (addresses == 255);
+ else if (strcmp (tail, ".example.net") != 0)
+ FAIL_EXIT1 ("invalid QNAME: %s", qname);
+ }
+ free (tail);
+
+ int rcode;
+ if (addresses == 255)
+ {
+ /* Special case: Use no addresses with NXDOMAIN response. */
+ rcode = ns_r_nxdomain;
+ addresses = 0;
+ }
+ else
+ rcode = 0;
+
+ struct resolv_response_flags flags = { .rcode = rcode };
+ resolv_response_init (b, flags);
+ resolv_response_add_question (b, qname, qclass, qtype);
+ resolv_response_section (b, ns_s_an);
+ maybe_insert_sig (b, qname);
+
+ /* Provide the requested number of CNAME records. */
+ char *previous_name = (char *) qname;
+ for (int unique = 0; unique < cnames; ++unique)
+ {
+ resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+ char *new_name = xasprintf ("%d.alias.example", unique);
+ resolv_response_add_name (b, new_name);
+ resolv_response_close_record (b);
+
+ maybe_insert_sig (b, qname);
+
+ if (previous_name != qname)
+ free (previous_name);
+ previous_name = new_name;
+ }
+
+ for (int unique = 0; unique < addresses; ++unique)
+ {
+ resolv_response_open_record (b, previous_name, qclass, qtype, 60);
+
+ if (qtype == T_A)
+ {
+ char ipv4[4] = {192, 0, 2, 1 + unique};
+ resolv_response_add_data (b, &ipv4, sizeof (ipv4));
+ }
+ else if (qtype == T_AAAA)
+ {
+ char ipv6[16] =
+ {
+ 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1 + unique
+ };
+ resolv_response_add_data (b, &ipv6, sizeof (ipv6));
+ }
+ resolv_response_close_record (b);
+ }
+
+ if (previous_name != qname)
+ free (previous_name);
+}
+
+static char *
+make_qname (bool do_search, int cnames, int addresses)
+{
+ return xasprintf ("a%d-c%d%s",
+ addresses, cnames, do_search ? "" : ".example.net");
+}
+
+static void
+check_cnames_failure (int af, bool do_search, int cnames, int addresses)
+{
+ char *qname = make_qname (do_search, cnames, addresses);
+
+ struct hostent *e;
+ if (af == AF_UNSPEC)
+ e = gethostbyname (qname);
+ else
+ e = gethostbyname2 (qname, af);
+
+ if (addresses == 0)
+ check_hostent (qname, e, "error: NO_RECOVERY\n");
+ else
+ check_hostent (qname, e, "error: HOST_NOT_FOUND\n");
+
+ free (qname);
+}
+
+static void
+check (int af, bool do_search, int cnames, int addresses)
+{
+ char *qname = make_qname (do_search, cnames, addresses);
+ char *fqdn = make_qname (false, cnames, addresses);
+
+ struct hostent *e;
+ if (af == AF_UNSPEC)
+ e = gethostbyname (qname);
+ else
+ e = gethostbyname2 (qname, af);
+ if (e == NULL)
+ FAIL_EXIT1 ("unexpected failure for %d, %d, %d", af, cnames, addresses);
+
+ if (af == AF_UNSPEC || af == AF_INET)
+ {
+ TEST_COMPARE (e->h_addrtype, AF_INET);
+ TEST_COMPARE (e->h_length, 4);
+ }
+ else
+ {
+ TEST_COMPARE (e->h_addrtype, AF_INET6);
+ TEST_COMPARE (e->h_length, 16);
+ }
+
+ for (int i = 0; i < addresses; ++i)
+ {
+ char ipv4[4] = {192, 0, 2, 1 + i};
+ char ipv6[16] =
+ { 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + i };
+ char *expected = e->h_addrtype == AF_INET ? ipv4 : ipv6;
+ TEST_COMPARE_BLOB (e->h_addr_list[i], e->h_length,
+ expected, e->h_length);
+ }
+ TEST_VERIFY (e->h_addr_list[addresses] == NULL);
+
+
+ if (cnames == 0)
+ {
+ /* QNAME is fully qualified. */
+ TEST_COMPARE_STRING (e->h_name, fqdn);
+ TEST_VERIFY (e->h_aliases[0] == NULL);
+ }
+ else
+ {
+      /* Fully-qualified QNAME is demoted to an alias. */
+ TEST_COMPARE_STRING (e->h_aliases[0], fqdn);
+
+ for (int i = 1; i <= cnames; ++i)
+ {
+ char *expected = xasprintf ("%d.alias.example", i - 1);
+ if (i == cnames)
+ TEST_COMPARE_STRING (e->h_name, expected);
+ else
+ TEST_COMPARE_STRING (e->h_aliases[i], expected);
+ free (expected);
+ }
+ TEST_VERIFY (e->h_aliases[cnames] == NULL);
+ }
+
+ free (fqdn);
+ free (qname);
+}
+
+static int
+do_test (void)
+{
+ struct resolv_test *obj = resolv_test_start
+ ((struct resolv_redirect_config)
+ {
+ .response_callback = response,
+ .search = { "example.net", "example.com" },
+ });
+
+ static const int families[] = { AF_UNSPEC, AF_INET, AF_INET6 };
+
+ for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig)
+ {
+ insert_sig = do_insert_sig;
+
+ /* If do_search is true, a bare host name (for example, a1-c1)
+ is used. This exercises search path processing and FQDN
+ qualification. */
+ for (int do_search = 0; do_search < 2; ++do_search)
+ for (const int *paf = families; paf != array_end (families); ++paf)
+ {
+ for (int cnames = 0; cnames <= 100; ++cnames)
+ {
+ check_cnames_failure (*paf, do_search, cnames, 0);
+ /* Now with NXDOMAIN responses. */
+ check_cnames_failure (*paf, do_search, cnames, 255);
+ }
+
+ for (int cnames = 0; cnames <= 10; ++cnames)
+ for (int addresses = 1; addresses <= 10; ++addresses)
+ check (*paf, do_search, cnames, addresses);
+
+ /* The current implementation is limited to 47 aliases.
+ Addresses do not have such a limit. */
+ check (*paf, do_search, 47, 60);
+ }
+ }
+
+ resolv_test_end (obj);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Test reverse DNS lookup.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/next_to_fault.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+
+#include "tst-resolv-maybe_insert_sig.h"
+
+/* QNAME format:
+
+ ADDRESSES.CNAMES...(lots of 0s)...8.b.d.0.1.0.0.2.ip6.arpa.
+ CNAMES|ADDRESSES.2.0.192.in-addr.arpa.
+
+ For the IPv4 reverse lookup, the address count is in the lower
+ bits.
+
+ CNAMES is the length of the CNAME chain, ADDRESSES is the number of
+ addresses in the response. The special value 15 means that there
+ are no addresses, and the RCODE is NXDOMAIN. */
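+
+/* For example (illustrative, not part of the encoding description
+   above): the IPv4 QNAME 18.2.0.192.in-addr.arpa decodes to
+   ADDRESSES = 18 & 0x0f = 2 and CNAMES = 18 >> 4 = 1, so the response
+   below carries one CNAME record followed by two PTR records.  */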
+static void
+response (const struct resolv_response_context *ctx,
+ struct resolv_response_builder *b,
+ const char *qname, uint16_t qclass, uint16_t qtype)
+{
+ TEST_COMPARE (qclass, C_IN);
+ TEST_COMPARE (qtype, T_PTR);
+
+ unsigned int addresses, cnames, bits;
+ char *tail;
+ if (strstr (qname, "ip6.arpa") != NULL
+ && sscanf (qname, "%x.%x.%ms", &addresses, &cnames, &tail) == 3)
+ TEST_COMPARE_STRING (tail, "\
+0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa");
+ else if (sscanf (qname, "%u.%ms", &bits, &tail) == 2)
+ {
+ TEST_COMPARE_STRING (tail, "2.0.192.in-addr.arpa");
+ addresses = bits & 0x0f;
+ cnames = bits >> 4;
+ }
+ else
+ FAIL_EXIT1 ("invalid QNAME: %s", qname);
+ free (tail);
+
+ int rcode;
+ if (addresses == 15)
+ {
+ /* Special case: Use no addresses with NXDOMAIN response. */
+ rcode = ns_r_nxdomain;
+ addresses = 0;
+ }
+ else
+ rcode = 0;
+
+ struct resolv_response_flags flags = { .rcode = rcode };
+ resolv_response_init (b, flags);
+ resolv_response_add_question (b, qname, qclass, qtype);
+ resolv_response_section (b, ns_s_an);
+ maybe_insert_sig (b, qname);
+
+ /* Provide the requested number of CNAME records. */
+ char *previous_name = (char *) qname;
+ for (int unique = 0; unique < cnames; ++unique)
+ {
+ resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+ char *new_name = xasprintf ("%d.alias.example", unique);
+ resolv_response_add_name (b, new_name);
+ resolv_response_close_record (b);
+
+ maybe_insert_sig (b, qname);
+
+ if (previous_name != qname)
+ free (previous_name);
+ previous_name = new_name;
+ }
+
+ for (int unique = 0; unique < addresses; ++unique)
+ {
+ resolv_response_open_record (b, previous_name, qclass, T_PTR, 60);
+ char *ptr = xasprintf ("unique-%d.cnames-%u.addresses-%u.example",
+ unique, cnames, addresses);
+ resolv_response_add_name (b, ptr);
+ free (ptr);
+ resolv_response_close_record (b);
+ }
+
+ if (previous_name != qname)
+ free (previous_name);
+}
+
+/* Used to check that gethostbyaddr_r does not write past the buffer
+ end. */
+static struct support_next_to_fault ntf;
+
+/* Perform a gethostbyaddr call and check the result. */
+static void
+check_gethostbyaddr (const char *address, const char *expected)
+{
+ unsigned char bytes[16];
+ unsigned int byteslen;
+ int family;
+ if (strchr (address, ':') != NULL)
+ {
+ family = AF_INET6;
+ byteslen = 16;
+ }
+ else
+ {
+ family = AF_INET;
+ byteslen = 4;
+ }
+ TEST_COMPARE (inet_pton (family, address, bytes), 1);
+
+ struct hostent *e = gethostbyaddr (bytes, byteslen, family);
+ check_hostent (address, e, expected);
+
+ if (e == NULL)
+ return;
+
+ /* Try gethostbyaddr_r with increasing sizes until success. First
+ compute a reasonable minimum buffer size, to avoid many pointless
+ attempts. */
+ size_t minimum_size = strlen (e->h_name);
+ for (int i = 0; e->h_addr_list[i] != NULL; ++i)
+ minimum_size += e->h_length + sizeof (char *);
+ for (int i = 0; e->h_aliases[i] != NULL; ++i)
+ minimum_size += strlen (e->h_aliases[i]) + 1 + sizeof (char *);
+
+ /* Gradually increase the size until success. */
+ for (size_t size = minimum_size; size < ntf.length; ++size)
+ {
+ struct hostent result;
+ int herrno;
+ int ret = gethostbyaddr_r (bytes, byteslen, family, &result,
+ ntf.buffer + ntf.length - size, size,
+ &e, &herrno);
+ if (ret == ERANGE)
+ /* Retry with larger size. */
+ TEST_COMPARE (herrno, NETDB_INTERNAL);
+ else if (ret == 0)
+ {
+ TEST_VERIFY (size > minimum_size);
+ check_hostent (address, e, expected);
+ return;
+ }
+ else
+ FAIL_EXIT1 ("Unexpected gethostbyaddr_r failure: %d", ret);
+ }
+
+ FAIL_EXIT1 ("gethostbyaddr_r always failed for: %s", address);
+}
+
+/* Perform a getnameinfo call and check the result. */
+static void
+check_getnameinfo (const char *address, const char *expected)
+{
+ struct sockaddr_in sin = { };
+ struct sockaddr_in6 sin6 = { };
+ void *sa;
+ socklen_t salen;
+ if (strchr (address, ':') != NULL)
+ {
+ sin6.sin6_family = AF_INET6;
+ TEST_COMPARE (inet_pton (AF_INET6, address, &sin6.sin6_addr), 1);
+ sin6.sin6_port = htons (80);
+ sa = &sin6;
+ salen = sizeof (sin6);
+ }
+ else
+ {
+ sin.sin_family = AF_INET;
+ TEST_COMPARE (inet_pton (AF_INET, address, &sin.sin_addr), 1);
+ sin.sin_port = htons (80);
+ sa = &sin;
+ salen = sizeof (sin);
+ }
+
+ char host[64];
+ char service[64];
+ int ret = getnameinfo (sa, salen, host,
+ sizeof (host), service, sizeof (service),
+ NI_NAMEREQD | NI_NUMERICSERV);
+ switch (ret)
+ {
+ case 0:
+ TEST_COMPARE_STRING (host, expected);
+ TEST_COMPARE_STRING (service, "80");
+ break;
+ case EAI_SYSTEM:
+ TEST_COMPARE_STRING (strerror (errno), expected);
+ break;
+ default:
+ TEST_COMPARE_STRING (gai_strerror (ret), expected);
+ }
+}
+
+static int
+do_test (void)
+{
+ /* Some reasonably upper bound for the maximum response size. */
+ ntf = support_next_to_fault_allocate (4096);
+
+ struct resolv_test *obj = resolv_test_start
+ ((struct resolv_redirect_config)
+ {
+ .response_callback = response
+ });
+
+ for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig)
+ {
+ insert_sig = do_insert_sig;
+
+ /* No PTR record, RCODE=0. */
+ check_gethostbyaddr ("192.0.2.0", "error: NO_RECOVERY\n");
+ check_getnameinfo ("192.0.2.0", "Name or service not known");
+ check_gethostbyaddr ("192.0.2.16", "error: NO_RECOVERY\n");
+ check_getnameinfo ("192.0.2.16", "Name or service not known");
+ check_gethostbyaddr ("192.0.2.32", "error: NO_RECOVERY\n");
+ check_getnameinfo ("192.0.2.32", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::", "error: NO_RECOVERY\n");
+ check_getnameinfo ("2001:db8::", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::10", "error: NO_RECOVERY\n");
+ check_getnameinfo ("2001:db8::10", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::20", "error: NO_RECOVERY\n");
+ check_getnameinfo ("2001:db8::20", "Name or service not known");
+
+ /* No PTR record, NXDOMAIN. */
+ check_gethostbyaddr ("192.0.2.15", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("192.0.2.15", "Name or service not known");
+ check_gethostbyaddr ("192.0.2.31", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("192.0.2.31", "Name or service not known");
+ check_gethostbyaddr ("192.0.2.47", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("192.0.2.47", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::f", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("2001:db8::f", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::1f", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("2001:db8::1f", "Name or service not known");
+ check_gethostbyaddr ("2001:db8::2f", "error: HOST_NOT_FOUND\n");
+ check_getnameinfo ("2001:db8::2f", "Name or service not known");
+
+ /* Actual response data. Only the first PTR record is returned. */
+ check_gethostbyaddr ("192.0.2.1",
+ "name: unique-0.cnames-0.addresses-1.example\n"
+ "address: 192.0.2.1\n");
+ check_getnameinfo ("192.0.2.1",
+ "unique-0.cnames-0.addresses-1.example");
+ check_gethostbyaddr ("192.0.2.17",
+ "name: unique-0.cnames-1.addresses-1.example\n"
+ "address: 192.0.2.17\n");
+ check_getnameinfo ("192.0.2.17",
+ "unique-0.cnames-1.addresses-1.example");
+ check_gethostbyaddr ("192.0.2.18",
+ "name: unique-0.cnames-1.addresses-2.example\n"
+ "address: 192.0.2.18\n");
+ check_getnameinfo ("192.0.2.18",
+ "unique-0.cnames-1.addresses-2.example");
+ check_gethostbyaddr ("192.0.2.33",
+ "name: unique-0.cnames-2.addresses-1.example\n"
+ "address: 192.0.2.33\n");
+ check_getnameinfo ("192.0.2.33",
+ "unique-0.cnames-2.addresses-1.example");
+ check_gethostbyaddr ("192.0.2.34",
+ "name: unique-0.cnames-2.addresses-2.example\n"
+ "address: 192.0.2.34\n");
+ check_getnameinfo ("192.0.2.34",
+ "unique-0.cnames-2.addresses-2.example");
+
+ /* Same for IPv6 addresses. */
+ check_gethostbyaddr ("2001:db8::1",
+ "name: unique-0.cnames-0.addresses-1.example\n"
+ "address: 2001:db8::1\n");
+ check_getnameinfo ("2001:db8::1",
+ "unique-0.cnames-0.addresses-1.example");
+ check_gethostbyaddr ("2001:db8::11",
+ "name: unique-0.cnames-1.addresses-1.example\n"
+ "address: 2001:db8::11\n");
+ check_getnameinfo ("2001:db8::11",
+ "unique-0.cnames-1.addresses-1.example");
+ check_gethostbyaddr ("2001:db8::12",
+ "name: unique-0.cnames-1.addresses-2.example\n"
+ "address: 2001:db8::12\n");
+ check_getnameinfo ("2001:db8::12",
+ "unique-0.cnames-1.addresses-2.example");
+ check_gethostbyaddr ("2001:db8::21",
+ "name: unique-0.cnames-2.addresses-1.example\n"
+ "address: 2001:db8::21\n");
+ check_getnameinfo ("2001:db8::21",
+ "unique-0.cnames-2.addresses-1.example");
+ check_gethostbyaddr ("2001:db8::22",
+ "name: unique-0.cnames-2.addresses-2.example\n"
+ "address: 2001:db8::22\n");
+ check_getnameinfo ("2001:db8::22",
+ "unique-0.cnames-2.addresses-2.example");
+ }
+
+ resolv_test_end (obj);
+
+ support_next_to_fault_free (&ntf);
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Test handling of CNAMEs with non-host domain names (bug 12154).
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <netdb.h>
+#include <resolv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/check_nss.h>
+#include <support/resolv_test.h>
+#include <support/support.h>
+#include <support/xmemstream.h>
+
+/* Query strings describe the CNAME chain in the response. They have
+ the format "bitsBITS.countCOUNT.example.", where BITS and COUNT are
+ replaced by unsigned decimal numbers. COUNT is the number of CNAME
+ records in the response. BITS has two bits for each CNAME record,
+ describing a special prefix that is added to that CNAME.
+
+ 0: No special leading label.
+ 1: Starting with "*.".
+ 2: Starting with "-x.".
+ 3: Starting with "star.*.".
+
+ The first CNAME in the response uses the two least significant
+ bits.
+
+ For PTR queries, the QNAME format is different, it is either
+ BITS.COUNT.168.192.in-addr.arpa. (with BITS and COUNT still
+ decimal), or:
+
+BITS0.BITS1.COUNT.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa.
+
+ where BITS and COUNT are hexadecimal. */
+
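+/* A worked example (illustrative): the QNAME bits9.count2.example
+   requests two CNAME records; bits 9 is binary 1001, so the first
+   CNAME gets the prefix "*." (bits & 3 == 1) and the second gets the
+   prefix "-x." ((bits >> 2) & 3 == 2), yielding the chain
+   *.unique0.example -> -x.unique1.example.  */
+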
+static void
+response (const struct resolv_response_context *ctx,
+ struct resolv_response_builder *b,
+ const char *qname, uint16_t qclass, uint16_t qtype)
+{
+ TEST_COMPARE (qclass, C_IN);
+
+ /* The only other query type besides A and AAAA is PTR. */
+ if (qtype != T_A && qtype != T_AAAA)
+ TEST_COMPARE (qtype, T_PTR);
+
+ unsigned int bits, bits1, count;
+ char *tail = NULL;
+ if (sscanf (qname, "bits%u.count%u.%ms", &bits, &count, &tail) == 3)
+ TEST_COMPARE_STRING (tail, "example");
+ else if (strstr (qname, "in-addr.arpa") != NULL
+ && sscanf (qname, "%u.%u.%ms", &bits, &count, &tail) == 3)
+ TEST_COMPARE_STRING (tail, "168.192.in-addr.arpa");
+ else if (sscanf (qname, "%x.%x.%x.%ms", &bits, &bits1, &count, &tail) == 4)
+ {
+ TEST_COMPARE_STRING (tail, "\
+0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa");
+ bits |= bits1 << 4;
+ }
+ else
+ FAIL_EXIT1 ("invalid QNAME: %s\n", qname);
+ free (tail);
+
+ struct resolv_response_flags flags = {};
+ resolv_response_init (b, flags);
+ resolv_response_add_question (b, qname, qclass, qtype);
+ resolv_response_section (b, ns_s_an);
+
+ /* Provide the requested number of CNAME records. */
+ char *previous_name = (char *) qname;
+ unsigned int original_bits = bits;
+ for (int unique = 0; unique < count; ++unique)
+ {
+ resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60);
+
+ static const char bits_to_prefix[4][8] = { "", "*.", "-x.", "star.*." };
+ char *new_name = xasprintf ("%sunique%d.example",
+ bits_to_prefix[bits & 3], unique);
+ bits >>= 2;
+ resolv_response_add_name (b, new_name);
+ resolv_response_close_record (b);
+
+ if (previous_name != qname)
+ free (previous_name);
+ previous_name = new_name;
+ }
+
+ /* Actual answer record. */
+ resolv_response_open_record (b, previous_name, qclass, qtype, 60);
+ switch (qtype)
+ {
+ case T_A:
+ {
+ char ipv4[4] = {192, 168, count, original_bits};
+ resolv_response_add_data (b, &ipv4, sizeof (ipv4));
+ }
+ break;
+ case T_AAAA:
+ {
+ char ipv6[16] =
+ {
+ 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ count, original_bits
+ };
+ resolv_response_add_data (b, &ipv6, sizeof (ipv6));
+ }
+ break;
+
+ case T_PTR:
+ {
+ char *name = xasprintf ("bits%u.count%u.example",
+ original_bits, count);
+ resolv_response_add_name (b, name);
+ free (name);
+ }
+ break;
+ }
+ resolv_response_close_record (b);
+
+ if (previous_name != qname)
+ free (previous_name);
+}
+
+/* Controls which name resolution function is invoked. */
+enum test_mode
+ {
+ byname, /* gethostbyname. */
+ byname2, /* gethostbyname2. */
+ gai, /* getaddrinfo without AI_CANONNAME. */
+ gai_canon, /* getaddrinfo with AI_CANONNAME. */
+
+ test_mode_num /* Number of enum values. */
+ };
+
+static const char *
+test_mode_to_string (enum test_mode mode)
+{
+ switch (mode)
+ {
+ case byname:
+ return "byname";
+ case byname2:
+ return "byname2";
+ case gai:
+ return "gai";
+ case gai_canon:
+ return "gai_canon";
+ case test_mode_num:
+ break; /* Report error below. */
+ }
+ FAIL_EXIT1 ("invalid test_mode: %d", mode);
+}
+
+/* Append the name and aliases to OUT. */
+static void
+append_names (FILE *out, const char *qname, int bits, int count,
+ enum test_mode mode)
+{
+ /* Largest valid index which has a corresponding zero in bits
+ (meaning a syntactically valid CNAME). */
+ int last_valid_cname = -1;
+
+ for (int i = 0; i < count; ++i)
+ if ((bits & (3 << (i * 2))) == 0)
+ last_valid_cname = i;
+
+ if (mode != gai)
+ {
+ const char *label;
+ if (mode == gai_canon)
+ label = "canonname";
+ else
+ label = "name";
+ if (last_valid_cname >= 0)
+ fprintf (out, "%s: unique%d.example\n", label, last_valid_cname);
+ else
+ fprintf (out, "%s: %s\n", label, qname);
+ }
+
+ if (mode == byname || mode == byname2)
+ {
+ if (last_valid_cname >= 0)
+ fprintf (out, "alias: %s\n", qname);
+ for (int i = 0; i < count; ++i)
+ {
+ if ((bits & (3 << (i * 2))) == 0 && i != last_valid_cname)
+ fprintf (out, "alias: unique%d.example\n", i);
+ }
+ }
+}
+
+/* Append the address information to OUT. */
+static void
+append_addresses (FILE *out, int af, int bits, int count, enum test_mode mode)
+{
+ int last = count * 256 + bits;
+ if (mode == gai || mode == gai_canon)
+ {
+ if (af == AF_INET || af == AF_UNSPEC)
+ fprintf (out, "address: STREAM/TCP 192.168.%d.%d 80\n", count, bits);
+ if (af == AF_INET6 || af == AF_UNSPEC)
+ {
+ if (last == 0)
+ fprintf (out, "address: STREAM/TCP 2001:db8:: 80\n");
+ else
+ fprintf (out, "address: STREAM/TCP 2001:db8::%x 80\n", last);
+ }
+ }
+ else
+ {
+ TEST_VERIFY (af != AF_UNSPEC);
+ if (af == AF_INET)
+ fprintf (out, "address: 192.168.%d.%d\n", count, bits);
+ if (af == AF_INET6)
+ {
+ if (last == 0)
+ fprintf (out, "address: 2001:db8::\n");
+ else
+ fprintf (out, "address: 2001:db8::%x\n", last);
+ }
+ }
+}
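+
+/* Worked example (illustrative): for af=AF_INET, bits=2, count=1 and
+   mode=byname, the single CNAME carries the "-x." prefix and is
+   therefore not a valid host name, so the helpers above produce
+     name: bits2.count1.example
+     address: 192.168.1.2
+   as the expected output.  */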
+
+/* Perform one test using a forward lookup. */
+static void
+check_forward (int af, int bits, int count, enum test_mode mode)
+{
+ char *qname = xasprintf ("bits%d.count%d.example", bits, count);
+ char *label = xasprintf ("af=%d bits=%d count=%d mode=%s qname=%s",
+ af, bits, count, test_mode_to_string (mode), qname);
+
+ struct xmemstream expected;
+ xopen_memstream (&expected);
+ if (mode == gai_canon)
+ fprintf (expected.out, "flags: AI_CANONNAME\n");
+ append_names (expected.out, qname, bits, count, mode);
+ append_addresses (expected.out, af, bits, count, mode);
+ xfclose_memstream (&expected);
+
+ if (mode == gai || mode == gai_canon)
+ {
+ struct addrinfo *ai;
+ struct addrinfo hints =
+ {
+ .ai_family = af,
+ .ai_socktype = SOCK_STREAM,
+ };
+ if (mode == gai_canon)
+ hints.ai_flags |= AI_CANONNAME;
+ int ret = getaddrinfo (qname, "80", &hints, &ai);
+ check_addrinfo (label, ai, ret, expected.buffer);
+ if (ret == 0)
+ freeaddrinfo (ai);
+ }
+ else
+ {
+ struct hostent *e;
+ if (mode == byname)
+ {
+ TEST_COMPARE (af, AF_INET);
+ e = gethostbyname (qname);
+ }
+ else
+ {
+ if (af != AF_INET)
+ TEST_COMPARE (af, AF_INET6);
+ e = gethostbyname2 (qname, af);
+ }
+ check_hostent (label, e, expected.buffer);
+ }
+
+ free (expected.buffer);
+ free (label);
+ free (qname);
+}
+
+/* Perform one check using a reverse lookup. */
+
+static void
+check_reverse (int af, int bits, int count)
+{
+ TEST_VERIFY (af == AF_INET || af == AF_INET6);
+
+ char *label = xasprintf ("af=%d bits=%d count=%d", af, bits, count);
+ char *fqdn = xasprintf ("bits%d.count%d.example", bits, count);
+
+ struct xmemstream expected;
+ xopen_memstream (&expected);
+ fprintf (expected.out, "name: %s\n", fqdn);
+ append_addresses (expected.out, af, bits, count, byname);
+ xfclose_memstream (&expected);
+
+ char addr[16] = { 0 };
+ socklen_t addrlen;
+ if (af == AF_INET)
+ {
+ addr[0] = 192;
+ addr[1] = 168;
+ addr[2] = count;
+ addr[3] = bits;
+ addrlen = 4;
+ }
+ else
+ {
+ addr[0] = 0x20;
+ addr[1] = 0x01;
+ addr[2] = 0x0d;
+ addr[3] = 0xb8;
+ addr[14] = count;
+ addr[15] = bits;
+ addrlen = 16;
+ }
+
+ struct hostent *e = gethostbyaddr (addr, addrlen, af);
+ check_hostent (label, e, expected.buffer);
+
+ /* getnameinfo check is different. There is no generic check_*
+ function for it. */
+ {
+ struct sockaddr_in sin = { };
+ struct sockaddr_in6 sin6 = { };
+ void *sa;
+ socklen_t salen;
+ if (af == AF_INET)
+ {
+ sin.sin_family = AF_INET;
+ memcpy (&sin.sin_addr, addr, addrlen);
+ sin.sin_port = htons (80);
+ sa = &sin;
+ salen = sizeof (sin);
+ }
+ else
+ {
+ sin6.sin6_family = AF_INET6;
+ memcpy (&sin6.sin6_addr, addr, addrlen);
+ sin6.sin6_port = htons (80);
+ sa = &sin6;
+ salen = sizeof (sin6);
+ }
+
+ char host[64];
+ char service[64];
+ int ret = getnameinfo (sa, salen, host,
+ sizeof (host), service, sizeof (service),
+ NI_NAMEREQD | NI_NUMERICSERV);
+ TEST_COMPARE (ret, 0);
+ TEST_COMPARE_STRING (host, fqdn);
+ TEST_COMPARE_STRING (service, "80");
+ }
+
+ free (expected.buffer);
+ free (fqdn);
+ free (label);
+}
+
+static int
+do_test (void)
+{
+ struct resolv_test *obj = resolv_test_start
+ ((struct resolv_redirect_config)
+ {
+ .response_callback = response
+ });
+
+ for (int count = 0; count <= 3; ++count)
+ for (int bits = 0; bits <= 1 << (count * 2); ++bits)
+ {
+ if (count > 0 && bits == 1 << (count * 2))
+ /* The last bits value is only checked if count == 0. */
+ continue;
+
+ for (enum test_mode mode = 0; mode < test_mode_num; ++mode)
+ {
+ check_forward (AF_INET, bits, count, mode);
+ if (mode != byname)
+ check_forward (AF_INET6, bits, count, mode);
+ if (mode == gai || mode == gai_canon)
+ check_forward (AF_UNSPEC, bits, count, mode);
+ }
+
+ check_reverse (AF_INET, bits, count);
+ check_reverse (AF_INET6, bits, count);
+ }
+
+ resolv_test_end (obj);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Code snippet for optionally inserting ignored SIG records in resolver tests.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Set to true for an alternative pass that inserts (ignored) SIG
+ records. This does not alter the response, so this property is not
+ encoded in the QNAME. The variable needs to be volatile because
+ leaf attributes tell GCC that the response function is not
+ called. */
+static volatile bool insert_sig;
+
+static void
+maybe_insert_sig (struct resolv_response_builder *b, const char *owner)
+{
+ if (insert_sig)
+ {
+ resolv_response_open_record (b, owner, C_IN, T_SIG, 60);
+ resolv_response_add_data (b, "", 1);
+ resolv_response_close_record (b);
+ }
+}
"\t$(compile.c) $(OUTPUT_OPTION)\n")
makefile.write (rule)
- not_depended_objs = find_objs_not_depended_on(test_descr)
- if not_depended_objs:
- depstr = ""
- for dep in not_depended_objs:
- depstr += (" $(objpfx)" + test_subdir + "/"
- + test_name + "-" + dep + ".so")
- makefile.write("$(objpfx)%s.out:%s\n" % (base_test_name, depstr))
+ # Ensure that all shared objects are built before running the
+ # test, whether there are link-time dependencies or not.
+ depobjs = ["$(objpfx){}/{}-{}.so".format(test_subdir, test_name, dep)
+ for dep in test_descr.objs]
+ makefile.write("$(objpfx){}.out: {}\n".format(
+ base_test_name, " ".join(depobjs)))
# Add main executable to test-srcs
makefile.write("test-srcs += %s/%s\n" % (test_subdir, test_name))
--- /dev/null
+#!/usr/bin/python3
+# ELF support functionality for Python.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Basic ELF parser.
+
+Use Image.readfile(path) to read an ELF file into memory and begin
+parsing it.
+
+"""
+
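+# A minimal usage sketch (the path below is hypothetical, not part of
+# this module):
+#
+#     img = Image.readfile('/usr/lib/libc.so.6')
+#     for shdr in img.shdrs():
+#         print(shdr.sh_name, shdr.sh_type)
+#
+# Section names are resolved against the section header string table
+# by default; pass resolve=False to Image.shdrs to keep the raw
+# sh_name offsets.
+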
+import collections
+import enum
+import struct
+
+if not hasattr(enum, 'IntFlag'):
+ import sys
+ sys.stdout.write(
+ 'warning: glibcelf.py needs Python 3.6 for enum support\n')
+ sys.exit(77)
+
+class _OpenIntEnum(enum.IntEnum):
+ """Integer enumeration that supports arbitrary int values."""
+ @classmethod
+ def _missing_(cls, value):
+ # See enum.IntFlag._create_pseudo_member_. This allows
+ # creating enum constants with arbitrary integer values.
+ pseudo_member = int.__new__(cls, value)
+ pseudo_member._name_ = None
+ pseudo_member._value_ = value
+ return pseudo_member
+
+ def __repr__(self):
+ name = self._name_
+ if name is not None:
+ # The names have prefixes like SHT_, implying their type.
+ return name
+ return '{}({})'.format(self.__class__.__name__, self._value_)
+
+ def __str__(self):
+ name = self._name_
+ if name is not None:
+ return name
+ return str(self._value_)
+
+class ElfClass(_OpenIntEnum):
+ """ELF word size. Type of EI_CLASS values."""
+ ELFCLASSNONE = 0
+ ELFCLASS32 = 1
+ ELFCLASS64 = 2
+
+class ElfData(_OpenIntEnum):
+ """ELF endianess. Type of EI_DATA values."""
+ ELFDATANONE = 0
+ ELFDATA2LSB = 1
+ ELFDATA2MSB = 2
+
+class Machine(_OpenIntEnum):
+ """ELF machine type. Type of values in Ehdr.e_machine field."""
+ EM_NONE = 0
+ EM_M32 = 1
+ EM_SPARC = 2
+ EM_386 = 3
+ EM_68K = 4
+ EM_88K = 5
+ EM_IAMCU = 6
+ EM_860 = 7
+ EM_MIPS = 8
+ EM_S370 = 9
+ EM_MIPS_RS3_LE = 10
+ EM_PARISC = 15
+ EM_VPP500 = 17
+ EM_SPARC32PLUS = 18
+ EM_960 = 19
+ EM_PPC = 20
+ EM_PPC64 = 21
+ EM_S390 = 22
+ EM_SPU = 23
+ EM_V800 = 36
+ EM_FR20 = 37
+ EM_RH32 = 38
+ EM_RCE = 39
+ EM_ARM = 40
+ EM_FAKE_ALPHA = 41
+ EM_SH = 42
+ EM_SPARCV9 = 43
+ EM_TRICORE = 44
+ EM_ARC = 45
+ EM_H8_300 = 46
+ EM_H8_300H = 47
+ EM_H8S = 48
+ EM_H8_500 = 49
+ EM_IA_64 = 50
+ EM_MIPS_X = 51
+ EM_COLDFIRE = 52
+ EM_68HC12 = 53
+ EM_MMA = 54
+ EM_PCP = 55
+ EM_NCPU = 56
+ EM_NDR1 = 57
+ EM_STARCORE = 58
+ EM_ME16 = 59
+ EM_ST100 = 60
+ EM_TINYJ = 61
+ EM_X86_64 = 62
+ EM_PDSP = 63
+ EM_PDP10 = 64
+ EM_PDP11 = 65
+ EM_FX66 = 66
+ EM_ST9PLUS = 67
+ EM_ST7 = 68
+ EM_68HC16 = 69
+ EM_68HC11 = 70
+ EM_68HC08 = 71
+ EM_68HC05 = 72
+ EM_SVX = 73
+ EM_ST19 = 74
+ EM_VAX = 75
+ EM_CRIS = 76
+ EM_JAVELIN = 77
+ EM_FIREPATH = 78
+ EM_ZSP = 79
+ EM_MMIX = 80
+ EM_HUANY = 81
+ EM_PRISM = 82
+ EM_AVR = 83
+ EM_FR30 = 84
+ EM_D10V = 85
+ EM_D30V = 86
+ EM_V850 = 87
+ EM_M32R = 88
+ EM_MN10300 = 89
+ EM_MN10200 = 90
+ EM_PJ = 91
+ EM_OPENRISC = 92
+ EM_ARC_COMPACT = 93
+ EM_XTENSA = 94
+ EM_VIDEOCORE = 95
+ EM_TMM_GPP = 96
+ EM_NS32K = 97
+ EM_TPC = 98
+ EM_SNP1K = 99
+ EM_ST200 = 100
+ EM_IP2K = 101
+ EM_MAX = 102
+ EM_CR = 103
+ EM_F2MC16 = 104
+ EM_MSP430 = 105
+ EM_BLACKFIN = 106
+ EM_SE_C33 = 107
+ EM_SEP = 108
+ EM_ARCA = 109
+ EM_UNICORE = 110
+ EM_EXCESS = 111
+ EM_DXP = 112
+ EM_ALTERA_NIOS2 = 113
+ EM_CRX = 114
+ EM_XGATE = 115
+ EM_C166 = 116
+ EM_M16C = 117
+ EM_DSPIC30F = 118
+ EM_CE = 119
+ EM_M32C = 120
+ EM_TSK3000 = 131
+ EM_RS08 = 132
+ EM_SHARC = 133
+ EM_ECOG2 = 134
+ EM_SCORE7 = 135
+ EM_DSP24 = 136
+ EM_VIDEOCORE3 = 137
+ EM_LATTICEMICO32 = 138
+ EM_SE_C17 = 139
+ EM_TI_C6000 = 140
+ EM_TI_C2000 = 141
+ EM_TI_C5500 = 142
+ EM_TI_ARP32 = 143
+ EM_TI_PRU = 144
+ EM_MMDSP_PLUS = 160
+ EM_CYPRESS_M8C = 161
+ EM_R32C = 162
+ EM_TRIMEDIA = 163
+ EM_QDSP6 = 164
+ EM_8051 = 165
+ EM_STXP7X = 166
+ EM_NDS32 = 167
+ EM_ECOG1X = 168
+ EM_MAXQ30 = 169
+ EM_XIMO16 = 170
+ EM_MANIK = 171
+ EM_CRAYNV2 = 172
+ EM_RX = 173
+ EM_METAG = 174
+ EM_MCST_ELBRUS = 175
+ EM_ECOG16 = 176
+ EM_CR16 = 177
+ EM_ETPU = 178
+ EM_SLE9X = 179
+ EM_L10M = 180
+ EM_K10M = 181
+ EM_AARCH64 = 183
+ EM_AVR32 = 185
+ EM_STM8 = 186
+ EM_TILE64 = 187
+ EM_TILEPRO = 188
+ EM_MICROBLAZE = 189
+ EM_CUDA = 190
+ EM_TILEGX = 191
+ EM_CLOUDSHIELD = 192
+ EM_COREA_1ST = 193
+ EM_COREA_2ND = 194
+ EM_ARCV2 = 195
+ EM_OPEN8 = 196
+ EM_RL78 = 197
+ EM_VIDEOCORE5 = 198
+ EM_78KOR = 199
+ EM_56800EX = 200
+ EM_BA1 = 201
+ EM_BA2 = 202
+ EM_XCORE = 203
+ EM_MCHP_PIC = 204
+ EM_INTELGT = 205
+ EM_KM32 = 210
+ EM_KMX32 = 211
+ EM_EMX16 = 212
+ EM_EMX8 = 213
+ EM_KVARC = 214
+ EM_CDP = 215
+ EM_COGE = 216
+ EM_COOL = 217
+ EM_NORC = 218
+ EM_CSR_KALIMBA = 219
+ EM_Z80 = 220
+ EM_VISIUM = 221
+ EM_FT32 = 222
+ EM_MOXIE = 223
+ EM_AMDGPU = 224
+ EM_RISCV = 243
+ EM_BPF = 247
+ EM_CSKY = 252
+ EM_NUM = 253
+ EM_ALPHA = 0x9026
+
+class Et(_OpenIntEnum):
+ """ELF file type. Type of ET_* values and the Ehdr.e_type field."""
+ ET_NONE = 0
+ ET_REL = 1
+ ET_EXEC = 2
+ ET_DYN = 3
+ ET_CORE = 4
+
+class Shn(_OpenIntEnum):
+ """ELF reserved section indices."""
+ SHN_UNDEF = 0
+ SHN_BEFORE = 0xff00
+ SHN_AFTER = 0xff01
+ SHN_ABS = 0xfff1
+ SHN_COMMON = 0xfff2
+ SHN_XINDEX = 0xffff
+
+class ShnMIPS(enum.Enum):
+ """Supplemental SHN_* constants for EM_MIPS."""
+ SHN_MIPS_ACOMMON = 0xff00
+ SHN_MIPS_TEXT = 0xff01
+ SHN_MIPS_DATA = 0xff02
+ SHN_MIPS_SCOMMON = 0xff03
+ SHN_MIPS_SUNDEFINED = 0xff04
+
+class ShnPARISC(enum.Enum):
+ """Supplemental SHN_* constants for EM_PARISC."""
+ SHN_PARISC_ANSI_COMMON = 0xff00
+ SHN_PARISC_HUGE_COMMON = 0xff01
+
+class Sht(_OpenIntEnum):
+ """ELF section types. Type of SHT_* values."""
+ SHT_NULL = 0
+ SHT_PROGBITS = 1
+ SHT_SYMTAB = 2
+ SHT_STRTAB = 3
+ SHT_RELA = 4
+ SHT_HASH = 5
+ SHT_DYNAMIC = 6
+ SHT_NOTE = 7
+ SHT_NOBITS = 8
+ SHT_REL = 9
+ SHT_SHLIB = 10
+ SHT_DYNSYM = 11
+ SHT_INIT_ARRAY = 14
+ SHT_FINI_ARRAY = 15
+ SHT_PREINIT_ARRAY = 16
+ SHT_GROUP = 17
+ SHT_SYMTAB_SHNDX = 18
+ SHT_GNU_ATTRIBUTES = 0x6ffffff5
+ SHT_GNU_HASH = 0x6ffffff6
+ SHT_GNU_LIBLIST = 0x6ffffff7
+ SHT_CHECKSUM = 0x6ffffff8
+ SHT_SUNW_move = 0x6ffffffa
+ SHT_SUNW_COMDAT = 0x6ffffffb
+ SHT_SUNW_syminfo = 0x6ffffffc
+ SHT_GNU_verdef = 0x6ffffffd
+ SHT_GNU_verneed = 0x6ffffffe
+ SHT_GNU_versym = 0x6fffffff
+
+class ShtALPHA(enum.Enum):
+ """Supplemental SHT_* constants for EM_ALPHA."""
+ SHT_ALPHA_DEBUG = 0x70000001
+ SHT_ALPHA_REGINFO = 0x70000002
+
+class ShtARM(enum.Enum):
+ """Supplemental SHT_* constants for EM_ARM."""
+ SHT_ARM_EXIDX = 0x70000001
+ SHT_ARM_PREEMPTMAP = 0x70000002
+ SHT_ARM_ATTRIBUTES = 0x70000003
+
+class ShtCSKY(enum.Enum):
+ """Supplemental SHT_* constants for EM_CSKY."""
+ SHT_CSKY_ATTRIBUTES = 0x70000001
+
+class ShtIA_64(enum.Enum):
+ """Supplemental SHT_* constants for EM_IA_64."""
+ SHT_IA_64_EXT = 0x70000000
+ SHT_IA_64_UNWIND = 0x70000001
+
+class ShtMIPS(enum.Enum):
+ """Supplemental SHT_* constants for EM_MIPS."""
+ SHT_MIPS_LIBLIST = 0x70000000
+ SHT_MIPS_MSYM = 0x70000001
+ SHT_MIPS_CONFLICT = 0x70000002
+ SHT_MIPS_GPTAB = 0x70000003
+ SHT_MIPS_UCODE = 0x70000004
+ SHT_MIPS_DEBUG = 0x70000005
+ SHT_MIPS_REGINFO = 0x70000006
+ SHT_MIPS_PACKAGE = 0x70000007
+ SHT_MIPS_PACKSYM = 0x70000008
+ SHT_MIPS_RELD = 0x70000009
+ SHT_MIPS_IFACE = 0x7000000b
+ SHT_MIPS_CONTENT = 0x7000000c
+ SHT_MIPS_OPTIONS = 0x7000000d
+ SHT_MIPS_SHDR = 0x70000010
+ SHT_MIPS_FDESC = 0x70000011
+ SHT_MIPS_EXTSYM = 0x70000012
+ SHT_MIPS_DENSE = 0x70000013
+ SHT_MIPS_PDESC = 0x70000014
+ SHT_MIPS_LOCSYM = 0x70000015
+ SHT_MIPS_AUXSYM = 0x70000016
+ SHT_MIPS_OPTSYM = 0x70000017
+ SHT_MIPS_LOCSTR = 0x70000018
+ SHT_MIPS_LINE = 0x70000019
+ SHT_MIPS_RFDESC = 0x7000001a
+ SHT_MIPS_DELTASYM = 0x7000001b
+ SHT_MIPS_DELTAINST = 0x7000001c
+ SHT_MIPS_DELTACLASS = 0x7000001d
+ SHT_MIPS_DWARF = 0x7000001e
+ SHT_MIPS_DELTADECL = 0x7000001f
+ SHT_MIPS_SYMBOL_LIB = 0x70000020
+ SHT_MIPS_EVENTS = 0x70000021
+ SHT_MIPS_TRANSLATE = 0x70000022
+ SHT_MIPS_PIXIE = 0x70000023
+ SHT_MIPS_XLATE = 0x70000024
+ SHT_MIPS_XLATE_DEBUG = 0x70000025
+ SHT_MIPS_WHIRL = 0x70000026
+ SHT_MIPS_EH_REGION = 0x70000027
+ SHT_MIPS_XLATE_OLD = 0x70000028
+ SHT_MIPS_PDR_EXCEPTION = 0x70000029
+ SHT_MIPS_XHASH = 0x7000002b
+
+class ShtPARISC(enum.Enum):
+ """Supplemental SHT_* constants for EM_PARISC."""
+ SHT_PARISC_EXT = 0x70000000
+ SHT_PARISC_UNWIND = 0x70000001
+ SHT_PARISC_DOC = 0x70000002
+
+class Pf(enum.IntFlag):
+ """Program header flags. Type of Phdr.p_flags values."""
+ PF_X = 1
+ PF_W = 2
+ PF_R = 4
+
+class PfARM(enum.IntFlag):
+ """Supplemental PF_* flags for EM_ARM."""
+ PF_ARM_SB = 0x10000000
+ PF_ARM_PI = 0x20000000
+ PF_ARM_ABS = 0x40000000
+
+class PfPARISC(enum.IntFlag):
+ """Supplemental PF_* flags for EM_PARISC."""
+ PF_HP_PAGE_SIZE = 0x00100000
+ PF_HP_FAR_SHARED = 0x00200000
+ PF_HP_NEAR_SHARED = 0x00400000
+ PF_HP_CODE = 0x01000000
+ PF_HP_MODIFY = 0x02000000
+ PF_HP_LAZYSWAP = 0x04000000
+ PF_HP_SBP = 0x08000000
+
+class PfIA_64(enum.IntFlag):
+ """Supplemental PF_* flags for EM_IA_64."""
+ PF_IA_64_NORECOV = 0x80000000
+
+class PfMIPS(enum.IntFlag):
+ """Supplemental PF_* flags for EM_MIPS."""
+ PF_MIPS_LOCAL = 0x10000000
+
+class Shf(enum.IntFlag):
+ """Section flags. Type of Shdr.sh_type values."""
+ SHF_WRITE = 1 << 0
+ SHF_ALLOC = 1 << 1
+ SHF_EXECINSTR = 1 << 2
+ SHF_MERGE = 1 << 4
+ SHF_STRINGS = 1 << 5
+ SHF_INFO_LINK = 1 << 6
+ SHF_LINK_ORDER = 1 << 7
+ SHF_OS_NONCONFORMING = 1 << 8
+ SHF_GROUP = 1 << 9
+ SHF_TLS = 1 << 10
+ SHF_COMPRESSED = 1 << 11
+ SHF_GNU_RETAIN = 1 << 21
+ SHF_ORDERED = 1 << 30
+ SHF_EXCLUDE = 1 << 31
+
+class ShfALPHA(enum.IntFlag):
+ """Supplemental SHF_* constants for EM_ALPHA."""
+ SHF_ALPHA_GPREL = 0x10000000
+
+class ShfARM(enum.IntFlag):
+ """Supplemental SHF_* constants for EM_ARM."""
+ SHF_ARM_ENTRYSECT = 0x10000000
+ SHF_ARM_COMDEF = 0x80000000
+
+class ShfIA_64(enum.IntFlag):
+ """Supplemental SHF_* constants for EM_IA_64."""
+ SHF_IA_64_SHORT = 0x10000000
+ SHF_IA_64_NORECOV = 0x20000000
+
+class ShfMIPS(enum.IntFlag):
+ """Supplemental SHF_* constants for EM_MIPS."""
+ SHF_MIPS_GPREL = 0x10000000
+ SHF_MIPS_MERGE = 0x20000000
+ SHF_MIPS_ADDR = 0x40000000
+ SHF_MIPS_STRINGS = 0x80000000
+ SHF_MIPS_NOSTRIP = 0x08000000
+ SHF_MIPS_LOCAL = 0x04000000
+ SHF_MIPS_NAMES = 0x02000000
+ SHF_MIPS_NODUPE = 0x01000000
+
+class ShfPARISC(enum.IntFlag):
+ """Supplemental SHF_* constants for EM_PARISC."""
+ SHF_PARISC_SHORT = 0x20000000
+ SHF_PARISC_HUGE = 0x40000000
+ SHF_PARISC_SBP = 0x80000000
+
+class Stb(_OpenIntEnum):
+ """ELF symbol binding type."""
+ STB_LOCAL = 0
+ STB_GLOBAL = 1
+ STB_WEAK = 2
+ STB_GNU_UNIQUE = 10
+ STB_MIPS_SPLIT_COMMON = 13
+
+class Stt(_OpenIntEnum):
+ """ELF symbol type."""
+ STT_NOTYPE = 0
+ STT_OBJECT = 1
+ STT_FUNC = 2
+ STT_SECTION = 3
+ STT_FILE = 4
+ STT_COMMON = 5
+ STT_TLS = 6
+ STT_GNU_IFUNC = 10
+
+class SttARM(enum.Enum):
+ """Supplemental STT_* constants for EM_ARM."""
+ STT_ARM_TFUNC = 13
+ STT_ARM_16BIT = 15
+
+class SttPARISC(enum.Enum):
+ """Supplemental STT_* constants for EM_PARISC."""
+ STT_HP_OPAQUE = 11
+ STT_HP_STUB = 12
+ STT_PARISC_MILLICODE = 13
+
+class SttSPARC(enum.Enum):
+ """Supplemental STT_* constants for EM_SPARC."""
+ STT_SPARC_REGISTER = 13
+
+class SttX86_64(enum.Enum):
+ """Supplemental STT_* constants for EM_X86_64."""
+ SHT_X86_64_UNWIND = 0x70000001
+
+class Pt(_OpenIntEnum):
+ """ELF program header types. Type of Phdr.p_type."""
+ PT_NULL = 0
+ PT_LOAD = 1
+ PT_DYNAMIC = 2
+ PT_INTERP = 3
+ PT_NOTE = 4
+ PT_SHLIB = 5
+ PT_PHDR = 6
+ PT_TLS = 7
+ PT_NUM = 8
+ PT_GNU_EH_FRAME = 0x6474e550
+ PT_GNU_STACK = 0x6474e551
+ PT_GNU_RELRO = 0x6474e552
+ PT_GNU_PROPERTY = 0x6474e553
+ PT_SUNWBSS = 0x6ffffffa
+ PT_SUNWSTACK = 0x6ffffffb
+
+class PtARM(enum.Enum):
+ """Supplemental PT_* constants for EM_ARM."""
+ PT_ARM_EXIDX = 0x70000001
+
+class PtIA_64(enum.Enum):
+ """Supplemental PT_* constants for EM_IA_64."""
+ PT_IA_64_HP_OPT_ANOT = 0x60000012
+ PT_IA_64_HP_HSL_ANOT = 0x60000013
+ PT_IA_64_HP_STACK = 0x60000014
+ PT_IA_64_ARCHEXT = 0x70000000
+ PT_IA_64_UNWIND = 0x70000001
+
+class PtMIPS(enum.Enum):
+ """Supplemental PT_* constants for EM_MIPS."""
+ PT_MIPS_REGINFO = 0x70000000
+ PT_MIPS_RTPROC = 0x70000001
+ PT_MIPS_OPTIONS = 0x70000002
+ PT_MIPS_ABIFLAGS = 0x70000003
+
+class PtPARISC(enum.Enum):
+ """Supplemental PT_* constants for EM_PARISC."""
+ PT_HP_TLS = 0x60000000
+ PT_HP_CORE_NONE = 0x60000001
+ PT_HP_CORE_VERSION = 0x60000002
+ PT_HP_CORE_KERNEL = 0x60000003
+ PT_HP_CORE_COMM = 0x60000004
+ PT_HP_CORE_PROC = 0x60000005
+ PT_HP_CORE_LOADABLE = 0x60000006
+ PT_HP_CORE_STACK = 0x60000007
+ PT_HP_CORE_SHM = 0x60000008
+ PT_HP_CORE_MMF = 0x60000009
+ PT_HP_PARALLEL = 0x60000010
+ PT_HP_FASTBIND = 0x60000011
+ PT_HP_OPT_ANNOT = 0x60000012
+ PT_HP_HSL_ANNOT = 0x60000013
+ PT_HP_STACK = 0x60000014
+ PT_PARISC_ARCHEXT = 0x70000000
+ PT_PARISC_UNWIND = 0x70000001
+
+class Dt(_OpenIntEnum):
+ """ELF dynamic segment tags. Type of Dyn.d_val."""
+ DT_NULL = 0
+ DT_NEEDED = 1
+ DT_PLTRELSZ = 2
+ DT_PLTGOT = 3
+ DT_HASH = 4
+ DT_STRTAB = 5
+ DT_SYMTAB = 6
+ DT_RELA = 7
+ DT_RELASZ = 8
+ DT_RELAENT = 9
+ DT_STRSZ = 10
+ DT_SYMENT = 11
+ DT_INIT = 12
+ DT_FINI = 13
+ DT_SONAME = 14
+ DT_RPATH = 15
+ DT_SYMBOLIC = 16
+ DT_REL = 17
+ DT_RELSZ = 18
+ DT_RELENT = 19
+ DT_PLTREL = 20
+ DT_DEBUG = 21
+ DT_TEXTREL = 22
+ DT_JMPREL = 23
+ DT_BIND_NOW = 24
+ DT_INIT_ARRAY = 25
+ DT_FINI_ARRAY = 26
+ DT_INIT_ARRAYSZ = 27
+ DT_FINI_ARRAYSZ = 28
+ DT_RUNPATH = 29
+ DT_FLAGS = 30
+ DT_PREINIT_ARRAY = 32
+ DT_PREINIT_ARRAYSZ = 33
+ DT_SYMTAB_SHNDX = 34
+ DT_GNU_PRELINKED = 0x6ffffdf5
+ DT_GNU_CONFLICTSZ = 0x6ffffdf6
+ DT_GNU_LIBLISTSZ = 0x6ffffdf7
+ DT_CHECKSUM = 0x6ffffdf8
+ DT_PLTPADSZ = 0x6ffffdf9
+ DT_MOVEENT = 0x6ffffdfa
+ DT_MOVESZ = 0x6ffffdfb
+ DT_FEATURE_1 = 0x6ffffdfc
+ DT_POSFLAG_1 = 0x6ffffdfd
+ DT_SYMINSZ = 0x6ffffdfe
+ DT_SYMINENT = 0x6ffffdff
+ DT_GNU_HASH = 0x6ffffef5
+ DT_TLSDESC_PLT = 0x6ffffef6
+ DT_TLSDESC_GOT = 0x6ffffef7
+ DT_GNU_CONFLICT = 0x6ffffef8
+ DT_GNU_LIBLIST = 0x6ffffef9
+ DT_CONFIG = 0x6ffffefa
+ DT_DEPAUDIT = 0x6ffffefb
+ DT_AUDIT = 0x6ffffefc
+ DT_PLTPAD = 0x6ffffefd
+ DT_MOVETAB = 0x6ffffefe
+ DT_SYMINFO = 0x6ffffeff
+ DT_VERSYM = 0x6ffffff0
+ DT_RELACOUNT = 0x6ffffff9
+ DT_RELCOUNT = 0x6ffffffa
+ DT_FLAGS_1 = 0x6ffffffb
+ DT_VERDEF = 0x6ffffffc
+ DT_VERDEFNUM = 0x6ffffffd
+ DT_VERNEED = 0x6ffffffe
+ DT_VERNEEDNUM = 0x6fffffff
+ DT_AUXILIARY = 0x7ffffffd
+ DT_FILTER = 0x7fffffff
+
+class DtAARCH64(enum.Enum):
+ """Supplemental DT_* constants for EM_AARCH64."""
+ DT_AARCH64_BTI_PLT = 0x70000001
+ DT_AARCH64_PAC_PLT = 0x70000003
+ DT_AARCH64_VARIANT_PCS = 0x70000005
+
+class DtALPHA(enum.Enum):
+ """Supplemental DT_* constants for EM_ALPHA."""
+ DT_ALPHA_PLTRO = 0x70000000
+
+class DtALTERA_NIOS2(enum.Enum):
+ """Supplemental DT_* constants for EM_ALTERA_NIOS2."""
+ DT_NIOS2_GP = 0x70000002
+
+class DtIA_64(enum.Enum):
+ """Supplemental DT_* constants for EM_IA_64."""
+ DT_IA_64_PLT_RESERVE = 0x70000000
+
+class DtMIPS(enum.Enum):
+ """Supplemental DT_* constants for EM_MIPS."""
+ DT_MIPS_RLD_VERSION = 0x70000001
+ DT_MIPS_TIME_STAMP = 0x70000002
+ DT_MIPS_ICHECKSUM = 0x70000003
+ DT_MIPS_IVERSION = 0x70000004
+ DT_MIPS_FLAGS = 0x70000005
+ DT_MIPS_BASE_ADDRESS = 0x70000006
+ DT_MIPS_MSYM = 0x70000007
+ DT_MIPS_CONFLICT = 0x70000008
+ DT_MIPS_LIBLIST = 0x70000009
+ DT_MIPS_LOCAL_GOTNO = 0x7000000a
+ DT_MIPS_CONFLICTNO = 0x7000000b
+ DT_MIPS_LIBLISTNO = 0x70000010
+ DT_MIPS_SYMTABNO = 0x70000011
+ DT_MIPS_UNREFEXTNO = 0x70000012
+ DT_MIPS_GOTSYM = 0x70000013
+ DT_MIPS_HIPAGENO = 0x70000014
+ DT_MIPS_RLD_MAP = 0x70000016
+ DT_MIPS_DELTA_CLASS = 0x70000017
+ DT_MIPS_DELTA_CLASS_NO = 0x70000018
+ DT_MIPS_DELTA_INSTANCE = 0x70000019
+ DT_MIPS_DELTA_INSTANCE_NO = 0x7000001a
+ DT_MIPS_DELTA_RELOC = 0x7000001b
+ DT_MIPS_DELTA_RELOC_NO = 0x7000001c
+ DT_MIPS_DELTA_SYM = 0x7000001d
+ DT_MIPS_DELTA_SYM_NO = 0x7000001e
+ DT_MIPS_DELTA_CLASSSYM = 0x70000020
+ DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021
+ DT_MIPS_CXX_FLAGS = 0x70000022
+ DT_MIPS_PIXIE_INIT = 0x70000023
+ DT_MIPS_SYMBOL_LIB = 0x70000024
+ DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025
+ DT_MIPS_LOCAL_GOTIDX = 0x70000026
+ DT_MIPS_HIDDEN_GOTIDX = 0x70000027
+ DT_MIPS_PROTECTED_GOTIDX = 0x70000028
+ DT_MIPS_OPTIONS = 0x70000029
+ DT_MIPS_INTERFACE = 0x7000002a
+ DT_MIPS_DYNSTR_ALIGN = 0x7000002b
+ DT_MIPS_INTERFACE_SIZE = 0x7000002c
+ DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002d
+ DT_MIPS_PERF_SUFFIX = 0x7000002e
+ DT_MIPS_COMPACT_SIZE = 0x7000002f
+ DT_MIPS_GP_VALUE = 0x70000030
+ DT_MIPS_AUX_DYNAMIC = 0x70000031
+ DT_MIPS_PLTGOT = 0x70000032
+ DT_MIPS_RWPLT = 0x70000034
+ DT_MIPS_RLD_MAP_REL = 0x70000035
+ DT_MIPS_XHASH = 0x70000036
+
+class DtPPC(enum.Enum):
+ """Supplemental DT_* constants for EM_PPC."""
+ DT_PPC_GOT = 0x70000000
+ DT_PPC_OPT = 0x70000001
+
+class DtPPC64(enum.Enum):
+ """Supplemental DT_* constants for EM_PPC64."""
+ DT_PPC64_GLINK = 0x70000000
+ DT_PPC64_OPD = 0x70000001
+ DT_PPC64_OPDSZ = 0x70000002
+ DT_PPC64_OPT = 0x70000003
+
+class DtSPARC(enum.Enum):
+ """Supplemental DT_* constants for EM_SPARC."""
+ DT_SPARC_REGISTER = 0x70000001
+
+class StInfo:
+ """ELF symbol binding and type. Type of the Sym.st_info field."""
+ def __init__(self, arg0, arg1=None):
+ if isinstance(arg0, int) and arg1 is None:
+ self.bind = Stb(arg0 >> 4)
+ self.type = Stt(arg0 & 15)
+ else:
+ self.bind = Stb(arg0)
+ self.type = Stt(arg1)
+
+ def value(self):
+ """Returns the raw value for the bind/type combination."""
+ return (self.bind.value << 4) | (self.type.value)
+
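+# Illustrative example (not part of the original module): StInfo(0x12)
+# decodes to bind == Stb.STB_GLOBAL and type == Stt.STT_FUNC, and
+# StInfo(Stb.STB_GLOBAL, Stt.STT_FUNC).value() yields 0x12 again.
+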
+# Type in an ELF file. Used for deserialization.
+_Layout = collections.namedtuple('_Layout', 'unpack size')
+
+def _define_layouts(baseclass: type, layout32: str, layout64: str,
+ types=None, fields32=None):
+ """Assign variants dict to baseclass.
+
+ The variants dict is indexed by (ElfClass, ElfData) pairs, and its
+ values are _Layout instances.
+
+ """
+ struct32 = struct.Struct(layout32)
+ struct64 = struct.Struct(layout64)
+
+ # Check that the struct formats yield the right number of components.
+ for s in (struct32, struct64):
+ example = s.unpack(b' ' * s.size)
+ if len(example) != len(baseclass._fields):
+ raise ValueError('{!r} yields wrong field count: {} != {}'.format(
+ s.format, len(example), len(baseclass._fields)))
+
+ # Check that field names in types are correct.
+ if types is None:
+ types = ()
+ for n in types:
+ if n not in baseclass._fields:
+ raise ValueError('{} does not have field {!r}'.format(
+ baseclass.__name__, n))
+
+ if fields32 is not None \
+ and set(fields32) != set(baseclass._fields):
+ raise ValueError('{!r} is not a permutation of the fields {!r}'.format(
+ fields32, baseclass._fields))
+
+ def unique_name(name, used_names = (set((baseclass.__name__,))
+ | set(baseclass._fields)
+ | {n.__name__
+ for n in (types or {}).values()})):
+ """Find a name that is not used for a class or field name."""
+ candidate = name
+ n = 0
+ while candidate in used_names:
+ n += 1
+ candidate = '{}{}'.format(name, n)
+ used_names.add(candidate)
+ return candidate
+
+ blob_name = unique_name('blob')
+ struct_unpack_name = unique_name('struct_unpack')
+ comps_name = unique_name('comps')
+
+ layouts = {}
+ for (bits, elfclass, layout, fields) in (
+ (32, ElfClass.ELFCLASS32, layout32, fields32),
+ (64, ElfClass.ELFCLASS64, layout64, None),
+ ):
+ for (elfdata, structprefix, funcsuffix) in (
+ (ElfData.ELFDATA2LSB, '<', 'LE'),
+ (ElfData.ELFDATA2MSB, '>', 'BE'),
+ ):
+ env = {
+ baseclass.__name__: baseclass,
+ struct_unpack_name: struct.unpack,
+ }
+
+ # Add the type converters.
+ if types:
+ for cls in types.values():
+ env[cls.__name__] = cls
+
+ funcname = ''.join(
+ ('unpack_', baseclass.__name__, str(bits), funcsuffix))
+
+ code = '''
+def {funcname}({blob_name}):
+'''.format(funcname=funcname, blob_name=blob_name)
+
+ indent = ' ' * 4
+ unpack_call = '{}({!r}, {})'.format(
+ struct_unpack_name, structprefix + layout, blob_name)
+ field_names = ', '.join(baseclass._fields)
+ if types is None and fields is None:
+ code += '{}return {}({})\n'.format(
+ indent, baseclass.__name__, unpack_call)
+ else:
+ # Destructuring tuple assignment.
+ if fields is None:
+ code += '{}{} = {}\n'.format(
+ indent, field_names, unpack_call)
+ else:
+ # Use custom field order.
+ code += '{}{} = {}\n'.format(
+ indent, ', '.join(fields), unpack_call)
+
+ # Perform the type conversions.
+ for n in baseclass._fields:
+ if n in types:
+ code += '{}{} = {}({})\n'.format(
+ indent, n, types[n].__name__, n)
+ # Create the named tuple.
+ code += '{}return {}({})\n'.format(
+ indent, baseclass.__name__, field_names)
+
+ exec(code, env)
+ layouts[(elfclass, elfdata)] = _Layout(
+ env[funcname], struct.calcsize(layout))
+ baseclass.layouts = layouts
+
+
+# Corresponds to EI_* indices into Elf*_Ehdr.e_ident.
+class Ident(collections.namedtuple('Ident',
+ 'ei_mag ei_class ei_data ei_version ei_osabi ei_abiversion ei_pad')):
+
+ def __new__(cls, *args):
+ """Construct an object from a blob or its constituent fields."""
+ if len(args) == 1:
+ return cls.unpack(args[0])
+ return cls.__base__.__new__(cls, *args)
+
+ @staticmethod
+ def unpack(blob: memoryview) -> 'Ident':
+ """Parse raws data into a tuple."""
+ ei_mag, ei_class, ei_data, ei_version, ei_osabi, ei_abiversion, \
+ ei_pad = struct.unpack('4s5B7s', blob)
+ return Ident(ei_mag, ElfClass(ei_class), ElfData(ei_data),
+ ei_version, ei_osabi, ei_abiversion, ei_pad)
+ size = 16
+
+# Corresponds to Elf32_Ehdr and Elf64_Ehdr.
+Ehdr = collections.namedtuple('Ehdr',
+ 'e_ident e_type e_machine e_version e_entry e_phoff e_shoff e_flags'
+ + ' e_ehsize e_phentsize e_phnum e_shentsize e_shnum e_shstrndx')
+_define_layouts(Ehdr,
+ layout32='16s2H5I6H',
+ layout64='16s2HI3QI6H',
+ types=dict(e_ident=Ident,
+ e_machine=Machine,
+ e_type=Et,
+ e_shstrndx=Shn))
+
+# Corresponds to Elf32_Phdr and Elf64_Phdr. Order follows the latter.
+Phdr = collections.namedtuple('Phdr',
+ 'p_type p_flags p_offset p_vaddr p_paddr p_filesz p_memsz p_align')
+_define_layouts(Phdr,
+ layout32='8I',
+ fields32=('p_type', 'p_offset', 'p_vaddr', 'p_paddr',
+ 'p_filesz', 'p_memsz', 'p_flags', 'p_align'),
+ layout64='2I6Q',
+ types=dict(p_type=Pt, p_flags=Pf))
+
+
+# Corresponds to Elf32_Shdr and Elf64_Shdr.
+class Shdr(collections.namedtuple('Shdr',
+ 'sh_name sh_type sh_flags sh_addr sh_offset sh_size sh_link sh_info'
+ + ' sh_addralign sh_entsize')):
+ def resolve(self, strtab: 'StringTable') -> 'Shdr':
+ """Resolve sh_name using a string table."""
+ return self.__class__(strtab.get(self[0]), *self[1:])
+_define_layouts(Shdr,
+ layout32='10I',
+ layout64='2I4Q2I2Q',
+ types=dict(sh_type=Sht,
+ sh_flags=Shf,
+ sh_link=Shn))
+
+# Corresponds to Elf32_Dyn and Elf64_Dyn. The nesting through the
+# d_un union is skipped, and d_ptr is missing (its representation in
+# Python would be identical to d_val).
+Dyn = collections.namedtuple('Dyn', 'd_tag d_val')
+_define_layouts(Dyn,
+ layout32='2i',
+ layout64='2q',
+ types=dict(d_tag=Dt))
+
+# Corresponds to Elf32_Sym and Elf64_Sym.
+class Sym(collections.namedtuple('Sym',
+ 'st_name st_info st_other st_shndx st_value st_size')):
+ def resolve(self, strtab: 'StringTable') -> 'Sym':
+ """Resolve st_name using a string table."""
+ return self.__class__(strtab.get(self[0]), *self[1:])
+_define_layouts(Sym,
+ layout32='3I2BH',
+ layout64='I2BH2Q',
+ fields32=('st_name', 'st_value', 'st_size', 'st_info',
+ 'st_other', 'st_shndx'),
+ types=dict(st_shndx=Shn,
+ st_info=StInfo))
+
+# Corresponds to Elf32_Rel and Elf64_Rel.
+Rel = collections.namedtuple('Rel', 'r_offset r_info')
+_define_layouts(Rel,
+ layout32='2I',
+ layout64='2Q')
+
+# Corresponds to Elf32_Rela and Elf64_Rela.
+Rela = collections.namedtuple('Rela', 'r_offset r_info r_addend')
+_define_layouts(Rela,
+ layout32='3I',
+ layout64='3Q')
+
+class StringTable:
+ """ELF string table."""
+ def __init__(self, blob):
+ """Create a new string table backed by the data in the blob.
+
+ blob: a memoryview-like object
+
+ """
+ self.blob = blob
+
+ def get(self, index) -> bytes:
+ """Returns the null-terminated byte string at the index."""
+ blob = self.blob
+ endindex = index
+ while True:
+ if blob[endindex] == 0:
+ return bytes(blob[index:endindex])
+ endindex += 1
+
+class Image:
+ """ELF image parser."""
+ def __init__(self, image):
+ """Create an ELF image from binary image data.
+
+ image: a memoryview-like object that supports efficient range
+ subscripting.
+
+ """
+ self.image = image
+ ident = self.read(Ident, 0)
+ classdata = (ident.ei_class, ident.ei_data)
+ # Set self.Ehdr etc. to the subtypes with the right parsers.
+ for typ in (Ehdr, Phdr, Shdr, Dyn, Sym, Rel, Rela):
+ setattr(self, typ.__name__, typ.layouts.get(classdata, None))
+
+ if self.Ehdr is not None:
+ self.ehdr = self.read(self.Ehdr, 0)
+ self._shdr_num = self._compute_shdr_num()
+ else:
+ self.ehdr = None
+ self._shdr_num = 0
+
+ self._section = {}
+ self._stringtab = {}
+
+ if self._shdr_num > 0:
+ self._shdr_strtab = self._find_shdr_strtab()
+ else:
+ self._shdr_strtab = None
+
+ @staticmethod
+ def readfile(path: str) -> 'Image':
+ """Reads the ELF file at the specified path."""
+ with open(path, 'rb') as inp:
+ return Image(memoryview(inp.read()))
+
+ def _compute_shdr_num(self) -> int:
+ """Computes the actual number of section headers."""
+ shnum = self.ehdr.e_shnum
+ if shnum == 0:
+ if self.ehdr.e_shoff == 0 or self.ehdr.e_shentsize == 0:
+ # No section headers.
+ return 0
+ # Otherwise the extension mechanism is used (which may be
+ # needed because e_shnum is just 16 bits).
+ return self.read(self.Shdr, self.ehdr.e_shoff).sh_size
+ return shnum
+
+ def _find_shdr_strtab(self) -> StringTable:
+ """Finds the section header string table (maybe via extensions)."""
+ shstrndx = self.ehdr.e_shstrndx
+ if shstrndx == Shn.SHN_XINDEX:
+ shstrndx = self.read(self.Shdr, self.ehdr.e_shoff).sh_link
+ return self._find_stringtab(shstrndx)
+
+ def read(self, typ: type, offset: int):
+ """Reads an object at a specific offset.
+
+ The type must have been enhanced using _define_layouts.
+
+ """
+ return typ.unpack(self.image[offset: offset + typ.size])
+
+ def phdrs(self) -> Phdr:
+ """Generator iterating over the program headers."""
+ if self.ehdr is None:
+ return
+ size = self.ehdr.e_phentsize
+ if size != self.Phdr.size:
+ raise ValueError('Unexpected Phdr size in ELF header: {} != {}'
+ .format(size, self.Phdr.size))
+
+ offset = self.ehdr.e_phoff
+ for _ in range(self.ehdr.e_phnum):
+ yield self.read(self.Phdr, offset)
+ offset += size
+
+ def shdrs(self, resolve: bool=True) -> Shdr:
+ """Generator iterating over the section headers.
+
+ If resolve, section names are automatically translated
+ using the section header string table.
+
+ """
+ if self._shdr_num == 0:
+ return
+
+ size = self.ehdr.e_shentsize
+ if size != self.Shdr.size:
+ raise ValueError('Unexpected Shdr size in ELF header: {} != {}'
+ .format(size, self.Shdr.size))
+
+ offset = self.ehdr.e_shoff
+ for _ in range(self._shdr_num):
+ shdr = self.read(self.Shdr, offset)
+ if resolve:
+ shdr = shdr.resolve(self._shdr_strtab)
+ yield shdr
+ offset += size
+
+ def dynamic(self) -> Dyn:
+ """Generator iterating over the dynamic segment."""
+ for phdr in self.phdrs():
+ if phdr.p_type == Pt.PT_DYNAMIC:
+ # Pick the first dynamic segment, like the loader.
+ if phdr.p_filesz == 0:
+ # Probably separated debuginfo.
+ return
+ offset = phdr.p_offset
+ end = offset + phdr.p_memsz
+ size = self.Dyn.size
+ while True:
+ next_offset = offset + size
+ if next_offset > end:
+ raise ValueError(
+ 'Dynamic segment size {} is not a multiple of Dyn size {}'.format(
+ phdr.p_memsz, size))
+ yield self.read(self.Dyn, offset)
+ if next_offset == end:
+ return
+ offset = next_offset
+
+ def syms(self, shdr: Shdr, resolve: bool=True) -> Sym:
+ """A generator iterating over a symbol table.
+
+ If resolve, symbol names are automatically translated using
+ the string table for the symbol table.
+
+ """
+ assert shdr.sh_type == Sht.SHT_SYMTAB
+ size = shdr.sh_entsize
+ if size != self.Sym.size:
+ raise ValueError('Invalid symbol table entry size {}'.format(size))
+ offset = shdr.sh_offset
+ end = shdr.sh_offset + shdr.sh_size
+ if resolve:
+ strtab = self._find_stringtab(shdr.sh_link)
+ while offset < end:
+ sym = self.read(self.Sym, offset)
+ if resolve:
+ sym = sym.resolve(strtab)
+ yield sym
+ offset += size
+ if offset != end:
+ raise ValueError('Symbol table is not a multiple of entry size')
+
+ def lookup_string(self, strtab_index: int, strtab_offset: int) -> bytes:
+ """Looks up a string in a string table identified by its link index."""
+ try:
+ strtab = self._stringtab[strtab_index]
+ except KeyError:
+ strtab = self._find_stringtab(strtab_index)
+ return strtab.get(strtab_offset)
+
+ def find_section(self, shndx: Shn) -> Shdr:
+ """Returns the section header for the indexed section.
+
+ The section name is not resolved.
+ """
+ try:
+ return self._section[shndx]
+ except KeyError:
+ pass
+ if shndx in Shn:
+ raise ValueError('Reserved section index {}'.format(shndx))
+ idx = shndx.value
+ if idx < 0 or idx >= self._shdr_num:
+ raise ValueError('Section index {} out of range [0, {})'.format(
+ idx, self._shdr_num))
+ shdr = self.read(
+ self.Shdr, self.ehdr.e_shoff + idx * self.Shdr.size)
+ self._section[shndx] = shdr
+ return shdr
+
+ def _find_stringtab(self, sh_link: int) -> StringTable:
+ if sh_link in self._stringtab:
+ return self._stringtab[sh_link]
+ if sh_link < 0 or sh_link >= self._shdr_num:
+ raise ValueError('Section index {} out of range [0, {})'.format(
+ sh_link, self._shdr_num))
+ shdr = self.read(
+ self.Shdr, self.ehdr.e_shoff + sh_link * self.Shdr.size)
+ if shdr.sh_type != Sht.SHT_STRTAB:
+ raise ValueError(
+ 'Section {} is not a string table: {}'.format(
+ sh_link, shdr.sh_type))
+ strtab = StringTable(
+ self.image[shdr.sh_offset:shdr.sh_offset + shdr.sh_size])
+ # This could retain essentially arbitrary amounts of data,
+ # but caching string tables seems important for performance.
+ self._stringtab[sh_link] = strtab
+ return strtab
+
+
+__all__ = [name for name in dir() if name[0].isupper()]
ELFDATA2LSB=b'\x01'
ELFDATA2MSB=b'\x02'
+ET_EXEC=2
ET_DYN=3
PT_LOAD=1
+PT_TLS=7
def elf_types_fmts(e_ident):
endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>'
else:
phdr.p_align = int(align)
+def elf_edit_maximize_tls_size(phdr, elfclass):
+ if elfclass == ELFCLASS32:
+ # It is possible that the kernel can allocate half of the
+ # address space, so use something larger.
+ phdr.p_memsz = 0xfff00000
+ else:
+ phdr.p_memsz = 1 << 63
-def elf_edit(f, align):
+def elf_edit(f, opts):
ei_nident_fmt = 'c' * EI_NIDENT
ei_nident_len = struct.calcsize(ei_nident_fmt)
ehdr = Elf_Ehdr(e_ident)
ehdr.read(f)
- if ehdr.e_type != ET_DYN:
- error('{}: not a shared library'.format(f.name))
+ if ehdr.e_type not in (ET_EXEC, ET_DYN):
+ error('{}: not an executable or shared library'.format(f.name))
phdr = Elf_Phdr(e_ident)
+ maximize_tls_size_done = False
for i in range(0, ehdr.e_phnum):
f.seek(ehdr.e_phoff + i * phdr.len)
phdr.read(f)
- if phdr.p_type == PT_LOAD:
- elf_edit_align(phdr, align)
+ if phdr.p_type == PT_LOAD and opts.align is not None:
+ elf_edit_align(phdr, opts.align)
+ f.seek(ehdr.e_phoff + i * phdr.len)
+ phdr.write(f)
+ break
+ if phdr.p_type == PT_TLS and opts.maximize_tls_size:
+ elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS])
f.seek(ehdr.e_phoff + i * phdr.len)
phdr.write(f)
+ maximize_tls_size_done = True
break
+ if opts.maximize_tls_size and not maximize_tls_size_done:
+ error('{}: TLS maximum size was not updated'.format(f.name))
def get_parser():
parser = argparse.ArgumentParser(description=__doc__)
- parser.add_argument('-a', dest='align', required=True,
+ parser.add_argument('-a', dest='align',
help='How to set the LOAD alignment')
+ parser.add_argument('--maximize-tls-size', action='store_true',
+ help='Set maximum PT_TLS size')
parser.add_argument('output',
help='ELF file to edit')
return parser
parser = get_parser()
opts = parser.parse_args(argv)
with open(opts.output, 'r+b') as fout:
- elf_edit(fout, opts.align)
+ elf_edit(fout, opts)
if __name__ == '__main__':
--- /dev/null
+#!/usr/bin/python3
+# Dump the output of LD_TRACE_LOADED_OBJECTS in architecture neutral format.
+# Copyright (C) 2022 Free Software Foundation, Inc.
+# Copyright The GNU Toolchain Authors.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+import argparse
+import os
+import subprocess
+import sys
+
+try:
+ subprocess.run
+except:
+ class _CompletedProcess:
+ def __init__(self, args, returncode, stdout=None, stderr=None):
+ self.args = args
+ self.returncode = returncode
+ self.stdout = stdout
+ self.stderr = stderr
+
+ def _run(*popenargs, input=None, timeout=None, check=False, **kwargs):
+ assert(timeout is None)
+ with subprocess.Popen(*popenargs, **kwargs) as process:
+ try:
+ stdout, stderr = process.communicate(input)
+ except:
+ process.kill()
+ process.wait()
+ raise
+ returncode = process.poll()
+ if check and returncode:
+ raise subprocess.CalledProcessError(returncode, popenargs)
+ return _CompletedProcess(popenargs, returncode, stdout, stderr)
+
+ subprocess.run = _run
+
+def is_vdso(lib):
+ return lib.startswith('linux-gate') or lib.startswith('linux-vdso')
+
+
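+# As an illustration (hypothetical paths), a trace line such as
+#     libc.so.6 => /lib64/libc.so.6 (0x00007f0000000000)
+# is reduced by parse_trace below to the neutral form "libc 1", while a
+# missing dependency such as
+#     libfoo.so => not found
+# becomes "libfoo.so 0" before comparison against the reference file.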
+def parse_trace(cmd, fref):
+ new_env = os.environ.copy()
+ new_env['LD_TRACE_LOADED_OBJECTS'] = '1'
+ trace_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=True,
+ universal_newlines=True, env=new_env).stdout
+ trace = []
+ for line in trace_out.splitlines():
+ line = line.strip()
+ if is_vdso(line):
+ continue
+ fields = line.split('=>' if '=>' in line else ' ')
+ lib = os.path.basename(fields[0].strip())
+ if lib.startswith('ld'):
+ lib = 'ld'
+ elif lib.startswith('libc'):
+ lib = 'libc'
+ found = 1 if fields[1].strip() != 'not found' else 0
+ trace += ['{} {}'.format(lib, found)]
+ trace = sorted(trace)
+
+ reference = sorted(line.replace('\n','') for line in fref.readlines())
+
+ ret = 0 if trace == reference else 1
+ if ret != 0:
+ for i in reference:
+ if i not in trace:
+ print("Only in {}: {}".format(fref.name, i))
+ for i in trace:
+ if i not in reference:
+ print("Only in trace: {}".format(i))
+
+ sys.exit(ret)
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('command',
+ help='command to run')
+ parser.add_argument('reference',
+ help='reference file to compare')
+ return parser
+
+
+def main(argv):
+ parser = get_parser()
+ opts = parser.parse_args(argv)
+ with open(opts.reference, 'r') as fref:
+ # Remove the initial 'env' command.
+ parse_trace(opts.command.split()[1:], fref)
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
libnss_hesiod=2
libnss_db=2
-# Tests for NSS. They must have the same NSS_SHLIB_REVISION number as
-# the rest.
-libnss_test1=2
-libnss_test2=2
-
# Version for libnsl with YP and NIS+ functions.
libnsl=1
tests := \
tst-accept4 \
tst-sockopt \
+ tst-cmsghdr \
# tests
tests-internal := \
# else
extern ssize_t __sendmsg64 (int __fd, const struct msghdr *__message,
int __flags);
-# defien sendmsg __sendmsg64
+# define sendmsg __sendmsg64
# endif
#endif
--- /dev/null
+/* Test ancillary data header creation.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* We use the preprocessor to generate the function/macro tests instead of
+ using indirection because having all the macro expansions alongside
+ each other lets the compiler warn us about suspicious pointer
+ arithmetic across subsequent CMSG_{FIRST,NXT}HDR expansions. */
+
+#include <stdint.h>
+
+#define RUN_TEST_CONCAT(suffix) run_test_##suffix
+#define RUN_TEST_FUNCNAME(suffix) RUN_TEST_CONCAT (suffix)
+
+static void
+RUN_TEST_FUNCNAME (CMSG_NXTHDR_IMPL) (void)
+{
+ struct msghdr m = {0};
+ struct cmsghdr *cmsg;
+ char cmsgbuf[3 * CMSG_SPACE (sizeof (PAYLOAD))] = {0};
+
+ m.msg_control = cmsgbuf;
+ m.msg_controllen = sizeof (cmsgbuf);
+
+ /* First header should point to the start of the buffer. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+
+ /* If the first header length consumes the entire buffer, there is no
+ space remaining for additional headers. */
+ cmsg->cmsg_len = sizeof (cmsgbuf);
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg == NULL);
+
+ /* The first header length is so big, using it would cause an overflow. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+ cmsg->cmsg_len = SIZE_MAX;
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg == NULL);
+
+ /* The first header leaves just enough space to hold another header. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+ cmsg->cmsg_len = sizeof (cmsgbuf) - sizeof (struct cmsghdr);
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg != NULL);
+
+ /* The first header leaves space but not enough for another header. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+ cmsg->cmsg_len ++;
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg == NULL);
+
+ /* The second header leaves just enough space to hold another header. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+ cmsg->cmsg_len = CMSG_LEN (sizeof (PAYLOAD));
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg != NULL);
+ cmsg->cmsg_len = sizeof (cmsgbuf)
+ - CMSG_SPACE (sizeof (PAYLOAD)) /* First header. */
+ - sizeof (struct cmsghdr);
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg != NULL);
+
+ /* The second header leaves space but not enough for another header. */
+ cmsg = CMSG_FIRSTHDR (&m);
+ TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf);
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg != NULL);
+ cmsg->cmsg_len ++;
+ cmsg = CMSG_NXTHDR_IMPL (&m, cmsg);
+ TEST_VERIFY_EXIT (cmsg == NULL);
+
+ return;
+}
--- /dev/null
+/* Test ancillary data header creation.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sys/socket.h>
+#include <gnu/lib-names.h>
+#include <support/xdlfcn.h>
+#include <support/check.h>
+
+#define PAYLOAD "Hello, World!"
+
+/* CMSG_NXTHDR is a macro that calls an inline function defined in
+ bits/socket.h. In case the function cannot be inlined, libc.so carries
+ a copy. Both versions need to be tested. */
+
+#define CMSG_NXTHDR_IMPL CMSG_NXTHDR
+#include "tst-cmsghdr-skeleton.c"
+#undef CMSG_NXTHDR_IMPL
+
+static struct cmsghdr * (* cmsg_nxthdr) (struct msghdr *, struct cmsghdr *);
+
+#define CMSG_NXTHDR_IMPL cmsg_nxthdr
+#include "tst-cmsghdr-skeleton.c"
+#undef CMSG_NXTHDR_IMPL
+
+static int
+do_test (void)
+{
+ static void *handle;
+
+ run_test_CMSG_NXTHDR ();
+
+ handle = xdlopen (LIBC_SO, RTLD_LAZY);
+ cmsg_nxthdr = (struct cmsghdr * (*) (struct msghdr *, struct cmsghdr *))
+ xdlsym (handle, "__cmsg_nxthdr");
+
+ run_test_cmsg_nxthdr ();
+
+ return 0;
+}
+
+#include <support/test-driver.c>
CFLAGS-tst-makecontext.c += -funwind-tables
CFLAGS-tst-makecontext2.c += $(stack-align-test-flags)
+CFLAGS-testmb.c += -D_FORTIFY_SOURCE=2 -Wall -Werror
+
+
# Run a test on the header files we use.
tests-special += $(objpfx)isomac.out
const char *__restrict __src,
size_t __len, size_t __dstlen) __THROW
__attr_access ((__write_only__, 1, 3)) __attr_access ((__read_only__, 2));
+extern size_t __REDIRECT_NTH (__mbstowcs_nulldst,
+ (wchar_t *__restrict __dst,
+ const char *__restrict __src,
+ size_t __len), mbstowcs)
+ __attr_access ((__read_only__, 2));
extern size_t __REDIRECT_NTH (__mbstowcs_alias,
(wchar_t *__restrict __dst,
const char *__restrict __src,
__NTH (mbstowcs (wchar_t *__restrict __dst, const char *__restrict __src,
size_t __len))
{
- return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t),
- __glibc_objsize (__dst),
- __dst, __src, __len);
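+ /* A null destination only computes the number of wide characters
+ required, so there is no destination object whose size could be
+ checked. */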
+ if (__builtin_constant_p (__dst == NULL) && __dst == NULL)
+ return __mbstowcs_nulldst (__dst, __src, __len);
+ else
+ return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t),
+ __glibc_objsize (__dst), __dst, __src, __len);
}
-
extern size_t __wcstombs_chk (char *__restrict __dst,
const wchar_t *__restrict __src,
size_t __len, size_t __dstlen) __THROW
lose = 1;
}
+ i = mbstowcs (NULL, "bar", 4);
+ if (!(i == 3 && w[1] == 'a'))
+ {
+ puts ("mbstowcs FAILED2!");
+ lose = 1;
+ }
+
mbstowcs (w, "blah", 5);
i = wcstombs (c, w, 10);
if (i != 4)
# else
extern char *__stpncpy_chk (char *__dest, const char *__src, size_t __n,
size_t __destlen) __THROW
- __fortified_attr_access ((__write_only__, 1, 3))
+ __fortified_attr_access (__write_only__, 1, 3)
__attr_access ((__read_only__, 2));
extern char *__REDIRECT_NTH (__stpncpy_alias, (char *__dest, const char *__src,
size_t __n), stpncpy);
<https://www.gnu.org/licenses/>. */
#include <assert.h>
+#include <support/xunistd.h>
#define TEST_MAIN
#define TEST_NAME "rawmemchr"
}
}
+static void
+do_test_bz29234 (void)
+{
+ size_t i, j;
+ char *ptr_start;
+ char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1);
+
+ memset (buf, -1, 8192);
+
+ ptr_start = buf + 4096 - 8;
+
+ /* Out of range matches before the start of a page. */
+ memset (ptr_start - 8, 0x1, 8);
+
+ for (j = 0; j < 8; ++j)
+ {
+ for (i = 0; i < 128; ++i)
+ {
+ ptr_start[i + j] = 0x1;
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (char *) (ptr_start + j), 0x1,
+ ptr_start + i + j);
+
+ ptr_start[i + j] = 0xff;
+ }
+ }
+
+ xmunmap (buf, 8192);
+}
+
static void
do_test (size_t align, size_t pos, size_t len, int seek_char)
{
size_t i;
char *result;
- align &= 7;
+ align &= getpagesize () - 1;
if (align + len >= page_size)
return;
}
}
+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
+
assert (pos < len);
size_t r = random ();
if ((r & 31) == 0)
result, p);
ret = 1;
}
+
+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
}
}
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (getpagesize () - i, 64, 256, 23);
+ do_test (getpagesize () - i, 64, 256, 0);
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+
+ do_test (getpagesize () - 7, i, i + 1, 23);
+ do_test (getpagesize () - i / 2, i, i + 1, 23);
+ do_test (getpagesize () - i, i, i + 1, 23);
}
do_random_tests ();
+ do_test_bz29234 ();
return ret;
}
}
}
+static void
+check4 (void)
+{
+ /* To trigger bug 28895 we need 1) both s1 and s2 to be within 32 bytes of
+ the end of the page, 2) no mismatch/null byte before the first page cross,
+ 3) length (`n`) large enough for one string to cross the page, and 4) either
+ a mismatch or null bytes before the start of the strings. */
+
+ size_t size = 10;
+ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
+ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
+ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
+ int exp_result;
+
+ STRCPY (s1, L ("tst-tlsmod%"));
+ STRCPY (s2, L ("tst-tls-manydynamic73mod"));
+ exp_result = SIMPLE_STRNCMP (s1, s2, size);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1, s2, size, exp_result);
+}
+
int
test_main (void)
{
check1 ();
check2 ();
check3 ();
+ check4 ();
printf ("%23s", "");
FOR_EACH_IMPL (impl, 0)
initializer functions have completed. */
extern void _dl_fini (void) attribute_hidden;
-/* Sort array MAPS according to dependencies of the contained objects. */
+/* Sort array MAPS according to dependencies of the contained objects.
+ If FORCE_FIRST, MAPS[0] keeps its place even if the dependencies
+ say otherwise. */
extern void _dl_sort_maps (struct link_map **maps, unsigned int nmaps,
- unsigned int skip, bool for_fini) attribute_hidden;
+ bool force_first, bool for_fini) attribute_hidden;
/* The dynamic linker calls this function before and after changing
any shared object mappings. The `r_state' member of `struct r_debug'
# endif
#endif
+/* Perform early memory allocation, avoiding a TCB dependency.
+ Terminate the process if allocation fails. May attempt to use
+ brk. */
+void *_dl_early_allocate (size_t size) attribute_hidden;
+
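+/* A minimal usage sketch (illustrative only, not part of this change):
+   during early static startup, before the TCB exists, a caller could
+   obtain memory for the static TLS block like this:
+
+     void *tls_block = _dl_early_allocate (tls_block_size);
+
+   where tls_block_size is a hypothetical size computed by the caller.  */
+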
/* Initialize the DSO sort algorithm to use. */
#if !HAVE_TUNABLES
static inline void
--- /dev/null
+/* Private libc-internal arch-specific definitions. Generic version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If
+ not, see <https://www.gnu.org/licenses/>. */
+
+#ifndef _LIBC_LOCK_ARCH_H
+#define _LIBC_LOCK_ARCH_H
+
+/* The default definition uses the natural alignment from the lock type. */
+#define __LIBC_LOCK_ALIGNMENT
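+
+/* A port that needs stricter lock alignment can provide its own
+   libc-lock-arch.h overriding this macro, for example (hypothetical,
+   not taken from any existing port):
+
+     #define __LIBC_LOCK_ALIGNMENT __attribute__ ((aligned (16)))
+
+   The macro expands after the member declarator, as in
+   "int lock __LIBC_LOCK_ALIGNMENT;".  */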
+
+#endif
/* Use macro instead of inline function to avoid including <stdio.h>. */
#define _startup_fatal(message) __libc_fatal ((message))
-
-static inline uid_t
-startup_getuid (void)
-{
- return __getuid ();
-}
-
-static inline uid_t
-startup_geteuid (void)
-{
- return __geteuid ();
-}
-
-static inline gid_t
-startup_getgid (void)
-{
- return __getgid ();
-}
-
-static inline gid_t
-startup_getegid (void)
-{
- return __getegid ();
-}
#include <ldsodefs.h>
#include <elf/dynamic-link.h>
#include <dl-fptr.h>
+#include <dl-runtime.h>
#include <dl-unmap-segments.h>
#include <atomic.h>
#include <libc-pointer-arith.h>
{
ElfW(Addr) addr = (ElfW(Addr)) address;
ElfW(Word) reloc_arg;
- volatile unsigned int *desc;
- unsigned int *gptr;
+ unsigned int *desc, *gptr;
/* Return ADDR if the least-significant two bits of ADDR are not consistent
with ADDR being a linker defined function pointer. The normal value for
a code address in a backtrace is 3. */
- if (((unsigned int) addr & 3) != 2)
+ if (((uintptr_t) addr & 3) != 2)
return addr;
/* Handle special case where ADDR points to page 0. */
- if ((unsigned int) addr < 4096)
+ if ((uintptr_t) addr < 4096)
return addr;
/* Clear least-significant two bits from descriptor address. */
- desc = (unsigned int *) ((unsigned int) addr & ~3);
+ desc = (unsigned int *) ((uintptr_t) addr & ~3);
if (!_dl_read_access_allowed (desc))
return addr;
/* Then load first word of candidate descriptor. It should be a pointer
with word alignment and point to memory that can be read. */
gptr = (unsigned int *) desc[0];
- if (((unsigned int) gptr & 3) != 0
+ if (((uintptr_t) gptr & 3) != 0
|| !_dl_read_access_allowed (gptr))
return addr;
/* If gp has been resolved, we need to hunt for relocation offset. */
if (!(reloc_arg & PA_GP_RELOC))
- reloc_arg = _dl_fix_reloc_arg (addr, l);
+ reloc_arg = _dl_fix_reloc_arg ((struct fdesc *) addr, l);
_dl_fixup (l, reloc_arg);
}
return (ElfW(Addr)) desc[0];
}
+rtld_hidden_def (_dl_lookup_address)
#define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref)
Elf32_Addr _dl_lookup_address (const void *address);
+rtld_hidden_proto (_dl_lookup_address)
#define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address ((const void *) addr)
/* Extract the code address from a fixup value */
#define DL_FIXUP_VALUE_CODE_ADDR(value) ((value).ip)
#define DL_FIXUP_VALUE_ADDR(value) ((uintptr_t) &(value))
-#define DL_FIXUP_ADDR_VALUE(addr) (*(struct fdesc *) (addr))
+/* Clear the plabel bit to get the actual address of the descriptor. */
+#define DL_FIXUP_ADDR_VALUE(addr) \
+ (*(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (addr) & ~2))
#define DL_FIXUP_BINDNOW_ADDR_VALUE(addr) (addr)
-#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value) \
- (*value) = *(struct fdesc *) (st_value)
+#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value) \
+ *(value) = *(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (new_value) & ~2)
Elf32_Addr i[2];
} sig = {{0x00,0xc0,0xff,0xee, 0xde,0xad,0xbe,0xef}};
+ /* Initialize dp register for main executable. */
+ if (l->l_main_map)
+ {
+ register Elf32_Addr dp asm ("%r27");
+
+ dp = D_PTR (l, l_info[DT_PLTGOT]);
+ asm volatile ("" : : "r" (dp));
+ }
+
/* If we don't have a PLT we can just skip all this... */
if (__builtin_expect (l->l_info[DT_JMPREL] == NULL,0))
return lazy;
its return value is the user program's entry point. */
#define RTLD_START \
-/* Set up dp for any non-PIC lib constructors that may be called. */ \
-static struct link_map * __attribute__((used)) \
-set_dp (struct link_map *map) \
-{ \
- register Elf32_Addr dp asm ("%r27"); \
- dp = D_PTR (map, l_info[DT_PLTGOT]); \
- asm volatile ("" : : "r" (dp)); \
- return map; \
-} \
- \
asm ( \
" .text\n" \
" .globl _start\n" \
"_start:\n" \
/* The kernel does not give us an initial stack frame. */ \
" ldo 64(%sp),%sp\n" \
- /* Save the relevant arguments (yes, those are the correct \
- registers, the kernel is weird) in their stack slots. */ \
-" stw %r25,-40(%sp)\n" /* argc */ \
-" stw %r24,-44(%sp)\n" /* argv */ \
\
/* We need the LTP, and we need it now. \
$PIC_pcrel$0 points 8 bytes past the current instruction, \
So, obviously, we can't just pass %sp to _dl_start. That's \
okay, argv-4 will do just fine. \
\
- The pleasant part of this is that if we need to skip \
- arguments we can just decrement argc and move argv, because \
- the stack pointer is utterly unrelated to the location of \
- the environment and argument vectors. */ \
- \
- /* This is always within range so we'll be okay. */ \
+ This is always within range so we'll be okay. */ \
" bl _dl_start,%rp\n" \
" ldo -4(%r24),%r26\n" \
\
/* Save the entry point in %r3. */ \
" copy %ret0,%r3\n" \
\
- /* See if we were called as a command with the executable file \
- name as an extra leading argument. */ \
-" addil LT'_dl_skip_args,%r19\n" \
-" ldw RT'_dl_skip_args(%r1),%r20\n" \
-" ldw 0(%r20),%r20\n" \
- \
-" ldw -40(%sp),%r25\n" /* argc */ \
-" comib,= 0,%r20,.Lnofix\n" /* FIXME: Mispredicted branch */\
-" ldw -44(%sp),%r24\n" /* argv (delay slot) */ \
+ /* The loader adjusts argc, argv, env, and the aux vectors \
+ directly on the stack to remove any arguments used for \
+ direct loader invocation. Thus, argc and argv must be \
+ reloaded from _dl_argc and _dl_argv. */ \
\
-" sub %r25,%r20,%r25\n" \
+ /* Load argc from _dl_argc. */ \
+" addil LT'_dl_argc,%r19\n" \
+" ldw RT'_dl_argc(%r1),%r20\n" \
+" ldw 0(%r20),%r25\n" \
" stw %r25,-40(%sp)\n" \
-" sh2add %r20,%r24,%r24\n" \
+ \
+ /* Same for argv with _dl_argv. */ \
+" addil LT'_dl_argv,%r19\n" \
+" ldw RT'_dl_argv(%r1),%r20\n" \
+" ldw 0(%r20),%r24\n" \
" stw %r24,-44(%sp)\n" \
\
-".Lnofix:\n" \
+ /* Call _dl_init(main_map, argc, argv, envp). */ \
" addil LT'_rtld_local,%r19\n" \
" ldw RT'_rtld_local(%r1),%r26\n" \
-" bl set_dp, %r2\n" \
" ldw 0(%r26),%r26\n" \
\
- /* Call _dl_init(_dl_loaded, argc, argv, envp). */ \
-" copy %r28,%r26\n" \
- \
/* envp = argv + argc + 1 */ \
" sh2add %r25,%r24,%r23\n" \
" bl _dl_init,%r2\n" \
return that to the caller. The caller will continue on to call
_dl_fixup with the relocation offset. */
-ElfW(Word)
-attribute_hidden __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE
+ElfW(Word) __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE
_dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l)
{
Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type;
ABORT_INSTRUCTION;
return 0;
}
+rtld_hidden_def (_dl_fix_reloc_arg)
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
+ElfW(Word) _dl_fix_reloc_arg (struct fdesc *, struct link_map *);
+rtld_hidden_proto (_dl_fix_reloc_arg)
+
/* Clear PA_GP_RELOC bit in relocation offset. */
static inline uintptr_t
reloc_offset (uintptr_t plt0, uintptr_t pltn)
Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2
Function: Imaginary part of "clog10":
double: 2
-float: 1
+float: 2
float128: 2
ldouble: 2
switch (r_type)
{
+ case R_68K_GLOB_DAT:
+ case R_68K_JMP_SLOT:
+ *reloc_addr = value;
+ break;
+#ifndef RTLD_BOOTSTRAP
case R_68K_COPY:
if (sym == NULL)
/* This can happen in trace mode if an object could not be
memcpy (reloc_addr_arg, (void *) value,
MIN (sym->st_size, refsym->st_size));
break;
- case R_68K_GLOB_DAT:
- case R_68K_JMP_SLOT:
- *reloc_addr = value;
- break;
case R_68K_8:
*(char *) reloc_addr = value + reloc->r_addend;
break;
case R_68K_PC32:
*reloc_addr = value + reloc->r_addend - (Elf32_Addr) reloc_addr;
break;
-#ifndef RTLD_BOOTSTRAP
case R_68K_TLS_DTPMOD32:
/* Get the information from the link map returned by the
resolv function. */
*reloc_addr = TLS_TPREL_VALUE (sym_map, sym, reloc);
}
break;
-#endif /* !RTLD_BOOTSTRAP */
case R_68K_NONE: /* Alright, Wilbur. */
break;
+#endif /* !RTLD_BOOTSTRAP */
default:
_dl_reloc_bad_type (map, r_type, 0);
break;
+ CMSG_ALIGN (sizeof (struct cmsghdr)))
#define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
+/* Given a length, return the additional padding necessary such that
+ len + __CMSG_PADDING(len) == CMSG_ALIGN (len). */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+ - ((len) & (sizeof (size_t) - 1))) \
+ & (sizeof (size_t) - 1))
+
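+/* For example, with an 8-byte size_t, __CMSG_PADDING (13) is
+   (8 - (13 & 7)) & 7 == 3, and 13 + 3 == 16 == CMSG_ALIGN (13);
+   for an already aligned length, __CMSG_PADDING (16) is 0.  */
+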
extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
struct cmsghdr *__cmsg) __THROW;
#ifdef __USE_EXTERN_INLINES
_EXTERN_INLINE struct cmsghdr *
__NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
{
+ /* We may safely assume that __cmsg lies between __mhdr->msg_control and
+ __mhdr->msg_controllen because the user is required to obtain the first
+ cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+ via CMSG_NXTHDR, setting lengths along the way. However, we don't yet
+ trust the value of __cmsg->cmsg_len and therefore do not use it in any
+ pointer arithmetic until we check its value. */
+
+ unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+ unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+ size_t __size_needed = sizeof (struct cmsghdr)
+ + __CMSG_PADDING (__cmsg->cmsg_len);
+
+ /* The current header is malformed, too small to be a full header. */
if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
- /* The kernel header does this so there may be a reason. */
return (struct cmsghdr *) 0;
+ /* There isn't enough space between __cmsg and the end of the buffer to
+ hold the current cmsg *and* the next one. */
+ if (((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+ < __size_needed)
+ || ((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+ - __size_needed)
+ < __cmsg->cmsg_len))
+
+ return (struct cmsghdr *) 0;
+
+ /* Now, we trust cmsg_len and can use it to find the next header. */
__cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
+ CMSG_ALIGN (__cmsg->cmsg_len));
- if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
- + __mhdr->msg_controllen)
- || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
- > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
- /* No more entries. */
- return (struct cmsghdr *) 0;
return __cmsg;
}
#endif /* Use `extern inline'. */
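A minimal usage sketch (generic illustration, not part of this change) of how these macros are meant to be used; the checks in __cmsg_nxthdr above ensure that a malformed cmsg_len makes CMSG_NXTHDR return NULL instead of a pointer past the end of msg_control. The helper name walk_cmsgs is hypothetical.

#include <stddef.h>
#include <sys/socket.h>

static void
walk_cmsgs (struct msghdr *mh)
{
  /* Start at the first header and stop as soon as the macros report
     that no further well-formed header fits in msg_control.  */
  for (struct cmsghdr *c = CMSG_FIRSTHDR (mh); c != NULL;
       c = CMSG_NXTHDR (mh, c))
    {
      /* Inspect c->cmsg_level, c->cmsg_type and CMSG_DATA (c) here.  */
    }
}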
{
void go (intptr_t *argdata)
{
+ char *orig_argv0;
char **p;
/* Cache the information in various global variables. */
_environ = &_dl_argv[_dl_argc + 1];
for (p = _environ; *p++;); /* Skip environ pointers and terminator. */
+ orig_argv0 = _dl_argv[0];
+
if ((void *) p == _dl_argv[0])
{
static struct hurd_startup_data nodata;
/* The call above might screw a few things up.
- First of all, if _dl_skip_args is nonzero, we are ignoring
- the first few arguments. However, if we have no Hurd startup
- data, it is the magical convention that ARGV[0] == P. The
+ P is the location after the terminating NULL of the list of
+ environment variables. It has to point to the Hurd startup
+ data or if that's missing then P == ARGV[0] must hold. The
startup code in init-first.c will get confused if this is not
the case, so we must rearrange things to make it so. We'll
- overwrite the origional ARGV[0] at P with ARGV[_dl_skip_args].
+ recompute P and move the Hurd data or the new ARGV[0] there.
- Secondly, if we need to be secure, it removes some dangerous
- environment variables. If we have no Hurd startup date this
- changes P (since that's the location after the terminating
- NULL in the list of environment variables). We do the same
- thing as in the first case but make sure we recalculate P.
- If we do have Hurd startup data, we have to move the data
- such that it starts just after the terminating NULL in the
- environment list.
+ Note: directly invoked ld.so can move arguments and env vars.
We use memmove, since the locations might overlap. */
- if (__libc_enable_secure || _dl_skip_args)
- {
- char **newp;
- for (newp = _environ; *newp++;);
+ char **newp;
+ for (newp = _environ; *newp++;);
- if (_dl_argv[-_dl_skip_args] == (char *) p)
+ if (newp != p || _dl_argv[0] != orig_argv0)
+ {
+ if (orig_argv0 == (char *) p)
{
if ((char *) newp != _dl_argv[0])
{
+++ /dev/null
-/* Define and initialize the `__libc_enable_secure' flag. Hurd version.
- Copyright (C) 1998-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-/* There is no need for this file in the Hurd; it is just a placeholder
- to prevent inclusion of the sysdeps/generic version.
- In the shared library, the `__libc_enable_secure' variable is defined
- by the dynamic linker in dl-sysdep.c and set there.
- In the static library, it is defined in init-first.c and set there. */
-
-#include <libc-internal.h>
-
-void
-__libc_init_secure (void)
-{
-}
unsigned long int __hurd_threadvar_stack_offset;
unsigned long int __hurd_threadvar_stack_mask;
-#ifndef SHARED
-int __libc_enable_secure;
-#endif
-
extern int __libc_argc attribute_hidden;
extern char **__libc_argv attribute_hidden;
extern char **_dl_argv;
ldw r8, %call(_dl_nios2_get_gp_value)(r22)\n\
callr r8\n\
mov gp, r2\n\
-\n\
- /* Find the number of arguments to skip. */\n\
- ldw r8, %got(_dl_skip_args)(r22)\n\
- ldw r8, 0(r8)\n\
\n\
/* Find the main_map from the GOT. */\n\
ldw r4, %got(_rtld_local)(r22)\n\
ldw r4, 0(r4)\n\
\n\
- /* Find argc. */\n\
- ldw r5, 0(sp)\n\
- sub r5, r5, r8\n\
- stw r5, 0(sp)\n\
-\n\
- /* Find the first unskipped argument. */\n\
- slli r8, r8, 2\n\
- addi r6, sp, 4\n\
- add r9, r6, r8\n\
- mov r10, r6\n\
-\n\
- /* Shuffle argv down. */\n\
-3: ldw r11, 0(r9)\n\
- stw r11, 0(r10)\n\
- addi r9, r9, 4\n\
- addi r10, r10, 4\n\
- bne r11, zero, 3b\n\
+ /* Load adjusted argc. */\n\
+ ldw r2, %got(_dl_argc)(r22)\n\
+ ldw r5, 0(r2)\n\
\n\
- /* Shuffle envp down. */\n\
- mov r7, r10\n\
-4: ldw r11, 0(r9)\n\
- stw r11, 0(r10)\n\
- addi r9, r9, 4\n\
- addi r10, r10, 4\n\
- bne r11, zero, 4b\n\
-\n\
- /* Shuffle auxv down. */\n\
-5: ldw r11, 4(r9)\n\
- stw r11, 4(r10)\n\
- ldw r11, 0(r9)\n\
- stw r11, 0(r10)\n\
- addi r9, r9, 8\n\
- addi r10, r10, 8\n\
- bne r11, zero, 5b\n\
-\n\
- /* Update _dl_argv. */\n\
+ /* Load adjusted argv. */\n\
ldw r2, %got(_dl_argv)(r22)\n\
- stw r6, 0(r2)\n\
+ ldw r6, 0(r2)\n\
+\n\
+ /* envp = argv + argc + 1 */\n\
+ addi r7, r5, 1\n\
+ slli r7, r7, 2\n\
+ add r7, r7, r6\n\
\n\
/* Call _dl_init through the PLT. */\n\
ldw r8, %call(_dl_init)(r22)\n\
It will be bigger than it actually is, but for unwind.c/pt-longjmp.c
purposes this is good enough. */
THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
-
- THREAD_SETMEM (pd, cancelstate, PTHREAD_CANCEL_ENABLE);
- THREAD_SETMEM (pd, canceltype, PTHREAD_CANCEL_DEFERRED);
}
#include <pthread.h>
#define __need_NULL
#include <stddef.h>
+#include <libc-lock-arch.h>
/* Mutex type. */
# if (!IS_IN (libc) && !IS_IN (libpthread)) || !defined _LIBC
typedef struct { pthread_mutex_t mutex; } __libc_lock_recursive_t;
# else
-typedef struct { int lock; int cnt; void *owner; } __libc_lock_recursive_t;
+typedef struct
+{
+ int lock __LIBC_LOCK_ALIGNMENT;
+ int cnt;
+ void *owner;
+} __libc_lock_recursive_t;
# endif
#else
typedef struct __libc_lock_recursive_opaque__ __libc_lock_recursive_t;
ld.so might be used on old kernels with a different libc.so. */
#include <lowlevellock.h>
#include <tls.h>
+#include <libc-lock-arch.h>
/* Mutex type. */
-typedef int __libc_lock_t;
+typedef int __libc_lock_t __LIBC_LOCK_ALIGNMENT;
typedef struct { pthread_mutex_t mutex; } __rtld_lock_recursive_t;
typedef pthread_rwlock_t __libc_rwlock_t;
struct pthread *self = THREAD_SELF;
/* Make sure we get no more cancellations. */
- THREAD_ATOMIC_BIT_SET (self, cancelhandling, EXITING_BIT);
+ atomic_bit_set (&self->cancelhandling, EXITING_BIT);
__pthread_unwind ((__pthread_unwind_buf_t *)
THREAD_GETMEM (self, cleanup_jmp_buf));
#ifdef _POSIX_ASYNC_IO
{
/* AIO is only allowed on regular files and block devices. */
- struct stat64 st;
+ struct __stat64_t64 st;
- if (__fstat64 (fd, &st) < 0
+ if (__fstat64_time64 (fd, &st) < 0
|| (! S_ISREG (st.st_mode) && ! S_ISBLK (st.st_mode)))
return -1;
else
int
isfdtype (int fildes, int fdtype)
{
- struct stat64 st;
+ struct __stat64_t64 st;
int result;
{
int save_error = errno;
- result = __fstat64 (fildes, &st);
+ result = __fstat64_time64 (fildes, &st);
__set_errno (save_error);
}
int
posix_fallocate (int fd, __off_t offset, __off_t len)
{
- struct stat64 st;
+ struct __stat64_t64 st;
if (offset < 0 || len < 0)
return EINVAL;
}
/* We have to make sure that this is really a regular file. */
- if (__fstat64 (fd, &st) != 0)
+ if (__fstat64_time64 (fd, &st) != 0)
return EBADF;
if (S_ISFIFO (st.st_mode))
return ESPIPE;
int
__posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len)
{
- struct stat64 st;
+ struct __stat64_t64 st;
if (offset < 0 || len < 0)
return EINVAL;
}
/* We have to make sure that this is really a regular file. */
- if (__fstat64 (fd, &st) != 0)
+ if (__fstat64_time64 (fd, &st) != 0)
return EBADF;
if (S_ISFIFO (st.st_mode))
return ESPIPE;
cmpldi cr6,r5,16 /* Check if length was reached. */
ble cr6,L(zero_padding_end)
- stxv v18,0(r11)
+ stxv 32+v18,0(r11)
addi r11,r11,16
addi r5,r5,-16
L(zero_padding_end):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
- stxvl v18,r11,r10 /* Partial store */
+ stxvl 32+v18,r11,r10 /* Partial store */
blr
.align 4
tst-cancel12 tst-cancel13 tst-cancel14 tst-cancel15 tst-cancel16 \
tst-cancel18 tst-cancel19 tst-cancel20 tst-cancel21 \
tst-cancel22 tst-cancel23 tst-cancel26 tst-cancel27 tst-cancel28 \
+ tst-cancel29 \
tst-cleanup0 tst-cleanup1 tst-cleanup2 tst-cleanup3 \
tst-clock1 \
tst-cond-except \
tst-pthread-raise-blocked-self \
tst-pthread_kill-exited \
tst-pthread_kill-exiting \
+ tst-cancel30 \
# tests
tests-time64 := \
tst-cleanupx0 tst-cleanupx1 tst-cleanupx2 tst-cleanupx3
ifeq ($(build-shared),yes)
-tests += tst-atfork2 tst-pt-tls4 tst-_res1 tst-fini1 tst-create1
+tests += \
+ tst-atfork2 \
+ tst-pt-tls4 \
+ tst-_res1 \
+ tst-fini1 \
+ tst-create1 \
+ tst-atfork3 \
+ tst-atfork4 \
+# tests
+
tests-nolibpthread += tst-fini1
endif
-modules-names += tst-atfork2mod tst-tls4moda tst-tls4modb \
- tst-_res1mod1 tst-_res1mod2 tst-fini1mod \
- tst-create1mod
+modules-names += \
+ tst-atfork2mod \
+ tst-tls4moda \
+ tst-tls4modb \
+ tst-_res1mod1 \
+ tst-_res1mod2 \
+ tst-fini1mod \
+ tst-create1mod \
+ tst-atfork3mod \
+ tst-atfork4mod \
+# modules-names
+
test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
tst-atfork2mod.so-no-z-defs = yes
+tst-atfork3mod.so-no-z-defs = yes
+tst-atfork4mod.so-no-z-defs = yes
tst-create1mod.so-no-z-defs = yes
ifeq ($(build-shared),yes)
LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so
$(objpfx)tst-atfork2mod.so: $(shared-thread-library)
+$(objpfx)tst-atfork3: $(shared-thread-library)
+LDFLAGS-tst-atfork3 = -rdynamic
+$(objpfx)tst-atfork3mod.so: $(shared-thread-library)
+
+$(objpfx)tst-atfork4: $(shared-thread-library)
+LDFLAGS-tst-atfork4 = -rdynamic
+$(objpfx)tst-atfork4mod.so: $(shared-thread-library)
+
ifeq ($(build-shared),yes)
$(objpfx)tst-atfork2.out: $(objpfx)tst-atfork2mod.so
+$(objpfx)tst-atfork3.out: $(objpfx)tst-atfork3mod.so
+$(objpfx)tst-atfork4.out: $(objpfx)tst-atfork4mod.so
endif
ifeq ($(build-shared),yes)
--- /dev/null
+/* Check if pthread_atfork handler can call dlclose (BZ#24595).
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdio.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include <support/check.h>
+#include <support/xthread.h>
+#include <support/capture_subprocess.h>
+#include <support/xdlfcn.h>
+
+/* Check that pthread_atfork handlers do not deadlock when calling a function
+ that might alter the internal fork handler list, such as dlclose.
+
+ The test registers a callback set with pthread_atfork(), dlopens a shared
+ library (nptl/tst-atfork3mod.c), calls an exported symbol from the library
+ (which in turn also registers atfork handlers), and calls fork to trigger
+ the callbacks. */
+
+static void *handler;
+static bool run_dlclose_prepare;
+static bool run_dlclose_parent;
+static bool run_dlclose_child;
+
+static void
+prepare (void)
+{
+ if (run_dlclose_prepare)
+ xdlclose (handler);
+}
+
+static void
+parent (void)
+{
+ if (run_dlclose_parent)
+ xdlclose (handler);
+}
+
+static void
+child (void)
+{
+ if (run_dlclose_child)
+ xdlclose (handler);
+}
+
+static void
+proc_func (void *closure)
+{
+}
+
+static void
+do_test_generic (bool dlclose_prepare, bool dlclose_parent, bool dlclose_child)
+{
+ run_dlclose_prepare = dlclose_prepare;
+ run_dlclose_parent = dlclose_parent;
+ run_dlclose_child = dlclose_child;
+
+ handler = xdlopen ("tst-atfork3mod.so", RTLD_NOW);
+
+ int (*atfork3mod_func)(void);
+ atfork3mod_func = xdlsym (handler, "atfork3mod_func");
+
+ atfork3mod_func ();
+
+ struct support_capture_subprocess proc
+ = support_capture_subprocess (proc_func, NULL);
+ support_capture_subprocess_check (&proc, "tst-atfork3", 0, sc_allow_none);
+
+ handler = atfork3mod_func = NULL;
+
+ support_capture_subprocess_free (&proc);
+}
+
+static void *
+thread_func (void *closure)
+{
+ return NULL;
+}
+
+static int
+do_test (void)
+{
+ {
+ /* Make the process act as multithreaded. */
+ pthread_attr_t attr;
+ xpthread_attr_init (&attr);
+ xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+ xpthread_create (&attr, thread_func, NULL);
+ }
+
+ TEST_COMPARE (pthread_atfork (prepare, parent, child), 0);
+
+ do_test_generic (true /* prepare */, false /* parent */, false /* child */);
+ do_test_generic (false /* prepare */, true /* parent */, false /* child */);
+ do_test_generic (false /* prepare */, false /* parent */, true /* child */);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include <support/check.h>
+
+static void
+mod_prepare (void)
+{
+}
+
+static void
+mod_parent (void)
+{
+}
+
+static void
+mod_child (void)
+{
+}
+
+int atfork3mod_func (void)
+{
+ TEST_COMPARE (pthread_atfork (mod_prepare, mod_parent, mod_child), 0);
+
+ return 0;
+}
--- /dev/null
+/* pthread_atfork supports handlers that call pthread_atfork or dlclose.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+#include <support/xthread.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <support/xunistd.h>
+#include <support/check.h>
+#include <stdlib.h>
+
+static void *
+thread_func (void *x)
+{
+ return NULL;
+}
+
+static unsigned int second_atfork_handler_runcount = 0;
+
+static void
+second_atfork_handler (void)
+{
+ second_atfork_handler_runcount++;
+}
+
+static void *h = NULL;
+
+static unsigned int atfork_handler_runcount = 0;
+
+static void
+prepare (void)
+{
+ /* These atfork handlers are registered while atfork handlers are being
+ executed and thus will not be executed during the corresponding
+ fork. */
+ TEST_VERIFY_EXIT (pthread_atfork (second_atfork_handler,
+ second_atfork_handler,
+ second_atfork_handler) == 0);
+
+ /* This will de-register the atfork handlers registered by the dlopen'd
+ library and so they will not be executed. */
+ if (h != NULL)
+ {
+ xdlclose (h);
+ h = NULL;
+ }
+
+ atfork_handler_runcount++;
+}
+
+static void
+after (void)
+{
+ atfork_handler_runcount++;
+}
+
+static int
+do_test (void)
+{
+ /* Make sure __libc_single_threaded is 0. */
+ pthread_attr_t attr;
+ xpthread_attr_init (&attr);
+ xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
+ xpthread_create (&attr, thread_func, NULL);
+
+ void (*reg_atfork_handlers) (void);
+
+ h = xdlopen ("tst-atfork4mod.so", RTLD_LAZY);
+
+ reg_atfork_handlers = xdlsym (h, "reg_atfork_handlers");
+
+ reg_atfork_handlers ();
+
+ /* We register our atfork handlers *after* loading the module so that our
+ prepare handler is called first at fork, where we then dlclose the
+ module before its prepare handler has a chance to be called. */
+ TEST_VERIFY_EXIT (pthread_atfork (prepare, after, after) == 0);
+
+ pid_t pid = xfork ();
+
+ /* Both the parent and the child processes should observe this. */
+ TEST_VERIFY_EXIT (atfork_handler_runcount == 2);
+ TEST_VERIFY_EXIT (second_atfork_handler_runcount == 0);
+
+ if (pid > 0)
+ {
+ int childstat;
+
+ xwaitpid (-1, &childstat, 0);
+ TEST_VERIFY_EXIT (WIFEXITED (childstat)
+ && WEXITSTATUS (childstat) == 0);
+
+ /* This time, the second set of atfork handlers should also be called
+ since the handlers are already in place before fork is called. */
+
+ pid = xfork ();
+
+ TEST_VERIFY_EXIT (atfork_handler_runcount == 4);
+ TEST_VERIFY_EXIT (second_atfork_handler_runcount == 2);
+
+ if (pid > 0)
+ {
+ xwaitpid (-1, &childstat, 0);
+ TEST_VERIFY_EXIT (WIFEXITED (childstat)
+ && WEXITSTATUS (childstat) == 0);
+ }
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* pthread_atfork supports handlers that call pthread_atfork or dlclose.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <pthread.h>
+#include <stdlib.h>
+
+/* This dynamically loaded library simply registers its atfork handlers when
+ asked to. The atfork handlers should never be executed because the
+ library is unloaded before fork is called by the test program. */
+
+static void
+prepare (void)
+{
+ abort ();
+}
+
+static void
+parent (void)
+{
+ abort ();
+}
+
+static void
+child (void)
+{
+ abort ();
+}
+
+void
+reg_atfork_handlers (void)
+{
+ pthread_atfork (prepare, parent, child);
+}
--- /dev/null
+/* Check that a thread that disables cancellation and calls functions
+ that might be interrupted by a signal does not see the internal SIGCANCEL.
+
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <poll.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <support/xthread.h>
+#include <sys/socket.h>
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+/* On Linux some interfaces are never restarted after being interrupted by
+ a signal handler, regardless of the use of SA_RESTART. This means that
+ if asynchronous cancellation is not enabled, pthread_cancel cannot send
+ the internal SIGCANCEL; otherwise the interface might see a spurious
+ EINTR failure. */
+
+static pthread_barrier_t b;
+
+/* Cleanup handling test. */
+static int cl_called;
+static void
+cl (void *arg)
+{
+ ++cl_called;
+}
+
+static void *
+tf_sigtimedwait (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ sigset_t mask;
+ sigemptyset (&mask);
+ r = sigtimedwait (&mask, NULL, &(struct timespec) { 0, 250000000 });
+ if (r != -1)
+ return (void*) -1;
+ if (errno != EAGAIN)
+ return (void*) -2;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+static void *
+tf_poll (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ r = poll (NULL, 0, 250);
+ if (r != 0)
+ return (void*) -1;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+static void *
+tf_ppoll (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ r = ppoll (NULL, 0, &(struct timespec) { 0, 250000000 }, NULL);
+ if (r != 0)
+ return (void*) -1;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+static void *
+tf_select (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ r = select (0, NULL, NULL, NULL, &(struct timeval) { 0, 250000 });
+ if (r != 0)
+ return (void*) -1;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+static void *
+tf_pselect (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ r = pselect (0, NULL, NULL, NULL, &(struct timespec) { 0, 250000000 }, NULL);
+ if (r != 0)
+ return (void*) -1;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+static void *
+tf_clock_nanosleep (void *arg)
+{
+ pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL);
+ xpthread_barrier_wait (&b);
+
+ int r;
+ pthread_cleanup_push (cl, NULL);
+
+ r = clock_nanosleep (CLOCK_REALTIME, 0, &(struct timespec) { 0, 250000000 },
+ NULL);
+ if (r != 0)
+ return (void*) -1;
+
+ pthread_cleanup_pop (0);
+ return NULL;
+}
+
+struct cancel_test_t
+{
+ const char *name;
+ void * (*cf) (void *);
+} tests[] =
+{
+ { "sigtimedwait", tf_sigtimedwait, },
+ { "poll", tf_poll, },
+ { "ppoll", tf_ppoll, },
+ { "select", tf_select, },
+ { "pselect", tf_pselect , },
+ { "clock_nanosleep", tf_clock_nanosleep, },
+};
+
+static int
+do_test (void)
+{
+ for (int i = 0; i < array_length (tests); i++)
+ {
+ xpthread_barrier_init (&b, NULL, 2);
+
+ cl_called = 0;
+
+ pthread_t th = xpthread_create (NULL, tests[i].cf, NULL);
+
+ xpthread_barrier_wait (&b);
+
+ struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 };
+ while (nanosleep (&ts, &ts) != 0)
+ continue;
+
+ xpthread_cancel (th);
+
+ void *status = xpthread_join (th);
+ if (status != NULL)
+ printf ("test '%s' failed: %" PRIdPTR "\n", tests[i].name,
+ (intptr_t) status);
+ TEST_VERIFY (status == NULL);
+
+ xpthread_barrier_destroy (&b);
+
+ TEST_COMPARE (cl_called, 0);
+
+ printf ("in-time cancel test of '%s' successful\n", tests[i].name);
+ }
+
+ return 0;
+}
+
+#include <support/test-driver.c>
--- /dev/null
+/* Check that printf-like functions do not disable asynchronous cancellation
+ mode (BZ#29214).
+
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+#include <support/xstdio.h>
+#include <support/xthread.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+static pthread_barrier_t b;
+
+static void *
+tf (void *arg)
+{
+ int old;
+
+ TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL), 0);
+
+ TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0);
+ TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS);
+
+ /* Check if internal lock cleanup routines restore the cancellation type
+ correctly. */
+ printf ("...\n");
+ TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0);
+ TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS);
+
+ xpthread_barrier_wait (&b);
+
+ /* Wait indefinitely for cancellation, which only works if asynchronous
+ cancellation is enabled. */
+#ifdef SYS_pause
+ syscall (SYS_pause);
+#elif defined SYS_ppoll || defined SYS_ppoll_time64
+# ifndef SYS_ppoll_time64
+# define SYS_ppoll_time64 SYS_ppoll
+# endif
+ syscall (SYS_ppoll_time64, NULL, 0, NULL, NULL);
+#else
+ for (;;);
+#endif
+
+ return 0;
+}
+
+static int
+do_test (void)
+{
+ xpthread_barrier_init (&b, NULL, 2);
+
+ pthread_t th = xpthread_create (NULL, tf, NULL);
+
+ xpthread_barrier_wait (&b);
+
+ xpthread_cancel (th);
+
+ void *status = xpthread_join (th);
+ TEST_VERIFY (status == PTHREAD_CANCELED);
+
+ return 0;
+}
+
+/* There is no need to wait the full TIMEOUT if asynchronous cancellation is not working. */
+#define TIMEOUT 3
+#include <support/test-driver.c>
Function: "j0_upward":
double: 9
-float: 8
+float: 9
ldouble: 7
Function: "j1":
#if !defined PROCINFO_DECL && defined SHARED
._dl_s390_platforms
#else
-PROCINFO_CLASS const char _dl_s390_platforms[10][7]
+PROCINFO_CLASS const char _dl_s390_platforms[11][7]
#endif
#ifndef PROCINFO_DECL
= {
- "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15"
+ "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15",
+ "z16"
}
#endif
#if !defined SHARED || defined PROCINFO_DECL
#define _DL_HWCAP_COUNT 23
-#define _DL_PLATFORMS_COUNT 10
+#define _DL_PLATFORMS_COUNT 11
/* The kernel provides up to 32 capability bits with elf_hwcap. */
#define _DL_FIRST_PLATFORM 32
CFLAGS-dl-load.c += -Wno-unused
CFLAGS-dl-reloc.c += -Wno-unused
-$(objpfx)tst-glibc-hwcaps: $(objpfx)libmarkermod2-1.so \
- $(objpfx)libmarkermod3-1.so $(objpfx)libmarkermod4-1.so
+$(objpfx)tst-glibc-hwcaps: \
+ $(objpfx)libmarkermod2-1.so \
+ $(objpfx)libmarkermod3-1.so \
+ $(objpfx)libmarkermod4-1.so \
+ $(objpfx)libmarkermod5-1.so
$(objpfx)tst-glibc-hwcaps.out: \
$(objpfx)libmarkermod2.so \
$(objpfx)glibc-hwcaps/z13/libmarkermod2.so \
$(objpfx)glibc-hwcaps/z13/libmarkermod4.so \
$(objpfx)glibc-hwcaps/z14/libmarkermod4.so \
$(objpfx)glibc-hwcaps/z15/libmarkermod4.so \
+ $(objpfx)libmarkermod5.so \
+ $(objpfx)glibc-hwcaps/z13/libmarkermod5.so \
+ $(objpfx)glibc-hwcaps/z14/libmarkermod5.so \
+ $(objpfx)glibc-hwcaps/z15/libmarkermod5.so \
+ $(objpfx)glibc-hwcaps/z16/libmarkermod5.so
$(objpfx)glibc-hwcaps/z13/libmarkermod2.so: $(objpfx)libmarkermod2-2.so
$(make-target-directory)
$(objpfx)glibc-hwcaps/z15/libmarkermod4.so: $(objpfx)libmarkermod4-4.so
$(make-target-directory)
cp $< $@
+$(objpfx)glibc-hwcaps/z13/libmarkermod5.so: $(objpfx)libmarkermod5-2.so
+ $(make-target-directory)
+ cp $< $@
+$(objpfx)glibc-hwcaps/z14/libmarkermod5.so: $(objpfx)libmarkermod5-3.so
+ $(make-target-directory)
+ cp $< $@
+$(objpfx)glibc-hwcaps/z15/libmarkermod5.so: $(objpfx)libmarkermod5-4.so
+ $(make-target-directory)
+ cp $< $@
+$(objpfx)glibc-hwcaps/z16/libmarkermod5.so: $(objpfx)libmarkermod5-5.so
+ $(make-target-directory)
+ cp $< $@
+
ifeq (no,$(build-hardcoded-path-in-tests))
# This is an ld.so.cache test, and RPATH/RUNPATH in the executable
--- /dev/null
+# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/s390/s390-64.
+
+# Minimal checking for static PIE support in ld.
+# Compare to ld testcase/bugzilla:
+# <binutils-source>/ld/testsuite/ld-elf/pr22263-1.rd
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for s390-specific static PIE requirements" >&5
+$as_echo_n "checking for s390-specific static PIE requirements... " >&6; }
+if { as_var=\
+libc_cv_s390x_staticpie_req; eval \${$as_var+:} false; }; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat > conftest1.c <<EOF
+__thread int * foo;
+
+void
+bar (void)
+{
+ *foo = 1;
+}
+EOF
+ cat > conftest2.c <<EOF
+extern __thread int *foo;
+extern void bar (void);
+static int x;
+
+int
+main ()
+{
+ foo = &x;
+ return 0;
+}
+EOF
+ libc_cv_s390x_staticpie_req=no
+ if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest1.c -o conftest1.o'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; } \
+ && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; } \
+ && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; } \
+ && { ac_try='! readelf -Wr conftest | grep R_390_TLS_TPOFF'
+ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }
+ then
+ libc_cv_s390x_staticpie_req=yes
+ fi
+ rm -rf conftest.*
+fi
+eval ac_res=\$\
+libc_cv_s390x_staticpie_req
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+if test $libc_cv_s390x_staticpie_req = yes; then
+ # Static PIE is supported only on 64bit.
+ # Ensure you also have those patches for:
+ # - binutils (ld)
+ # - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE"
+ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9
+ # (Tested by configure check above)
+ # Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to
+ # be processed in _dl_relocate_static_pie() as static TLS map is not setup.
+ # - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie"
+ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e
+ # (We can't test it in configure as we are not able to link a static PIE
+ # executable if the system glibc lacks static PIE support)
+ # Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries
+ # and the IFUNC symbols are not processed, which leads to crashes.
+ #
+ # - kernel (the mentioned links to the commits belong to 5.19 merge window):
+ # - "s390/mmap: increase stack/mmap gap to 128MB"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2
+ # - "s390/vdso: move vdso mapping to its own function"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8
+ # - "s390/vdso: map vdso above stack"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033
+ # - "s390/vdso: add vdso randomization"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25
+ # (We can't test the kernel of the target system)
+ # Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0),
+ # static PIE executables like ldconfig will crash. While startup sbrk is
+ # used to enlarge the HEAP. Unfortunately the underlying brk syscall fails
+ # as there is not enough space after the HEAP. Then the address of the TLS
+ # image is invalid and the following memcpy in __libc_setup_tls() leads
+ # to a segfault.
+ # If /proc/sys/kernel/randomize_va_space is activated (default: 2), there
+ # is enough space after HEAP.
+ #
+ # - glibc
+ # - "Linux: Define MMAP_CALL_INTERNAL"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb
+ # - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402
+ # - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff
+ # - "ia64: Always define IA64_USE_NEW_STUB as a flag macro"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64
+ # - "Linux: Implement a useful version of _startup_fatal"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31
+ # - "Linux: Introduce __brk_call for invoking the brk system call"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa
+ # - "csu: Implement and use _dl_early_allocate during static startup"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7
+ #     The patch series by Florian Weimer listed above avoids this failing
+ #     sbrk syscall by falling back to mmap.
+ $as_echo "#define SUPPORT_STATIC_PIE 1" >>confdefs.h
+
+fi
--- /dev/null
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/s390/s390-64.
+
+# Minimal checking for static PIE support in ld.
+# Compare to ld testcase/bugzilla:
+# <binutils-source>/ld/testsuite/ld-elf/pr22263-1.rd
+AC_CACHE_CHECK([for s390-specific static PIE requirements], \
+[libc_cv_s390x_staticpie_req], [dnl
+ cat > conftest1.c <<EOF
+__thread int * foo;
+
+void
+bar (void)
+{
+ *foo = 1;
+}
+EOF
+ cat > conftest2.c <<EOF
+extern __thread int *foo;
+extern void bar (void);
+static int x;
+
+int
+main ()
+{
+ foo = &x;
+ return 0;
+}
+EOF
+ libc_cv_s390x_staticpie_req=no
+ if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest1.c -o conftest1.o]) \
+ && AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o]) \
+ && AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o]) \
+ && AC_TRY_COMMAND([! readelf -Wr conftest | grep R_390_TLS_TPOFF])
+ then
+ libc_cv_s390x_staticpie_req=yes
+ fi
+ rm -rf conftest.*])
+if test $libc_cv_s390x_staticpie_req = yes; then
+ # Static PIE is supported only on 64bit.
+ # Ensure you also have those patches for:
+ # - binutils (ld)
+ # - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE"
+ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9
+ # (Tested by configure check above)
+ #     Otherwise there will be an R_390_TLS_TPOFF relocation, which fails to be
+ #     processed in _dl_relocate_static_pie() as the static TLS map is not set up.
+ # - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie"
+ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e
+ # (We can't test it in configure as we are not able to link a static PIE
+ # executable if the system glibc lacks static PIE support)
+ # Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries
+ # and the IFUNC symbols are not processed, which leads to crashes.
+ #
+ #   - kernel (the commit links below belong to the 5.19 merge window):
+ # - "s390/mmap: increase stack/mmap gap to 128MB"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2
+ # - "s390/vdso: move vdso mapping to its own function"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8
+ # - "s390/vdso: map vdso above stack"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033
+ # - "s390/vdso: add vdso randomization"
+ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25
+ # (We can't test the kernel of the target system)
+ #     Otherwise, if /proc/sys/kernel/randomize_va_space is turned off (0),
+ #     static PIE executables like ldconfig will crash.  During startup, sbrk
+ #     is used to enlarge the heap.  Unfortunately the underlying brk syscall
+ #     fails as there is not enough space after the heap.  Then the address of
+ #     the TLS image is invalid and the following memcpy in __libc_setup_tls()
+ #     leads to a segfault.
+ #     If /proc/sys/kernel/randomize_va_space is enabled (default: 2), there
+ #     is enough space after the heap.
+ #
+ # - glibc
+ # - "Linux: Define MMAP_CALL_INTERNAL"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb
+ # - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402
+ # - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff
+ # - "ia64: Always define IA64_USE_NEW_STUB as a flag macro"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64
+ # - "Linux: Implement a useful version of _startup_fatal"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31
+ # - "Linux: Introduce __brk_call for invoking the brk system call"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa
+ # - "csu: Implement and use _dl_early_allocate during static startup"
+ # https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7
+ #     The patch series by Florian Weimer listed above avoids this failing
+ #     sbrk syscall by falling back to mmap.
+ AC_DEFINE(SUPPORT_STATIC_PIE)
+fi
dl_hwcap_check (void)
{
#if defined __ARCH__
-# if GCCMACRO__ARCH__ >= 13
+# if GCCMACRO__ARCH__ >= 14
+ if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_PDE2))
+ _dl_fatal_printf ("\
+Fatal glibc error: CPU lacks VXRS_PDE2 support (z16 or later required)\n");
+# elif GCCMACRO__ARCH__ >= 13
if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_EXT2))
_dl_fatal_printf ("\
Fatal glibc error: CPU lacks VXRS_EXT2 support (z15 or later required)\n");
#include <dl-hwcaps.h>
#include <ldsodefs.h>
-const char _dl_hwcaps_subdirs[] = "z15:z14:z13";
-enum { subdirs_count = 3 }; /* Number of components in _dl_hwcaps_subdirs. */
+const char _dl_hwcaps_subdirs[] = "z16:z15:z14:z13";
+enum { subdirs_count = 4 }; /* Number of components in _dl_hwcaps_subdirs. */
uint32_t
_dl_hwcaps_subdirs_active (void)
return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
++active;
+ /* z16.
+ Note: We do not list HWCAP_S390_NNPA here as, according to the Principles of
+ Operation, those instructions may be replaced or removed in future. */
+ if (!(GLRO (dl_hwcap) & HWCAP_S390_VXRS_PDE2))
+ return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
+ ++active;
+
return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active);
}
/* Ok, now branch to the libc main routine. */
#ifdef PIC
+# ifdef SHARED
+ /* Used for a dynamically linked position independent executable.
+    => Scrt1.o */
larl %r2,main@GOTENT # load pointer to main
lg %r2,0(%r2)
+# else
+ /* Used for a dynamically linked position dependent executable.
+    => crt1.o (glibc configured without --disable-default-pie:
+    PIC is defined)
+    Or for a statically linked position independent executable.
+    => rcrt1.o (only available if glibc is configured without
+    --disable-default-pie: PIC is defined) */
+ larl %r2,__wrap_main
+# endif
brasl %r14,__libc_start_main@plt
#else
+ /* Used for a dynamically or statically linked position dependent
+    executable.
+    => crt1.o (glibc configured with --disable-default-pie:
+    PIC and SHARED are not defined) */
larl %r2,main # load pointer to main
brasl %r14,__libc_start_main
#endif
cfi_endproc
+#if defined PIC && !defined SHARED
+ /* When main is not defined in the executable but in a shared library,
+    a wrapper is needed in crt1.o of the static-PIE enabled libc, because
+    crt1.o and rcrt1.o share code and the latter must avoid the use of
+    GOT relocations before __libc_start_main is called. */
+__wrap_main:
+ cfi_startproc
+ larl %r1,main@GOTENT # load pointer to main
+ lg %r1,0(%r1)
+ br %r1
+ cfi_endproc
+#endif
+
/* Define a symbol for the first piece of initialized data. */
.data
.globl __data_start
extern int marker2 (void);
extern int marker3 (void);
extern int marker4 (void);
+extern int marker5 (void);
/* Return the arch level, 10 for the baseline libmarkermod*.so's. */
static int
return 12;
if (strcmp (platform, "z15") == 0)
return 13;
+ if (strcmp (platform, "z16") == 0)
+ return 14;
printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform);
- /* Assume that the new platform supports z15. */
- return 13;
+ /* Assume that the new platform supports z16. */
+ return 14;
}
static int
TEST_COMPARE (marker2 (), MIN (level - 9, 2));
TEST_COMPARE (marker3 (), MIN (level - 9, 3));
TEST_COMPARE (marker4 (), MIN (level - 9, 4));
+ TEST_COMPARE (marker5 (), MIN (level - 9, 5));
return 0;
}
tst-prctl \
tst-scm_rights \
tst-epoll \
+ tst-getauxval \
# tests
# Test for the symbol version of fcntl that was replaced in glibc 2.28.
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
#define HWCAP2_BTI (1 << 17)
#define HWCAP2_MTE (1 << 18)
#define HWCAP2_ECV (1 << 19)
+#define HWCAP2_AFP (1 << 20)
+#define HWCAP2_RPRES (1 << 21)
#define __NR_sendmsg 114
#define __NR_sendto 133
#define __NR_set_mempolicy 431
+#define __NR_set_mempolicy_home_node 560
#define __NR_set_robust_list 466
#define __NR_set_tid_address 411
#define __NR_setdomainname 166
+++ /dev/null
-/* Change data segment size. Linux/Alpha.
- Copyright (C) 2020-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <errno.h>
-#include <unistd.h>
-#include <sysdep.h>
-
-void *__curbrk = 0;
-
-int
-__brk (void *addr)
-{
- /* Alpha brk returns -ENOMEM in case of failure. */
- __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr);
- if ((unsigned long) __curbrk == -ENOMEM)
- {
- __set_errno (ENOMEM);
- return -1;
- }
-
- return 0;
-}
-weak_alias (__brk, brk)
--- /dev/null
+/* Invoke the brk system call. Alpha version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
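+/* The generic callers detect a failed brk through an unchanged break value,
+   whereas the Alpha syscall returns -ENOMEM, so on failure the current break
+   is queried again and returned unchanged.  */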
+static inline void *
+__brk_call (void *addr)
+{
+ unsigned long int result = INTERNAL_SYSCALL_CALL (brk, addr);
+ if (result == -ENOMEM)
+ /* Mimic the generic error reporting behavior. */
+ result = INTERNAL_SYSCALL_CALL (brk, 0);
+ return (void *) result;
+}
extern long __libc_alpha_cache_shape[4];
-#define DL_PLATFORM_AUXV \
- case AT_L1I_CACHESHAPE: \
- __libc_alpha_cache_shape[0] = av->a_un.a_val; \
- break; \
- case AT_L1D_CACHESHAPE: \
- __libc_alpha_cache_shape[1] = av->a_un.a_val; \
- break; \
- case AT_L2_CACHESHAPE: \
- __libc_alpha_cache_shape[2] = av->a_un.a_val; \
- break; \
- case AT_L3_CACHESHAPE: \
- __libc_alpha_cache_shape[3] = av->a_un.a_val; \
- break;
+#define DL_PLATFORM_AUXV \
+ __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \
+ __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \
+ __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE]; \
+ __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE];
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
#define __NR_sendmsg 296
#define __NR_sendto 290
#define __NR_set_mempolicy 321
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 338
#define __NR_set_tid_address 256
#define __NR_set_tls 983045
#define SOL_KCM 281
#define SOL_TLS 282
#define SOL_XDP 283
+#define SOL_MPTCP 284
+#define SOL_MCTP 285
/* Maximum queue length specifiable by listen. */
#define SOMAXCONN 4096
+ CMSG_ALIGN (sizeof (struct cmsghdr)))
#define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len))
+/* Given a length, return the additional padding necessary such that
+ len + __CMSG_PADDING(len) == CMSG_ALIGN (len). */
+#define __CMSG_PADDING(len) ((sizeof (size_t) \
+ - ((len) & (sizeof (size_t) - 1))) \
+ & (sizeof (size_t) - 1))
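+/* For example, with an 8-byte size_t, __CMSG_PADDING (13) is 3, so that
+   13 + 3 == CMSG_ALIGN (13) == 16; a length that is already a multiple of
+   sizeof (size_t) needs no padding.  */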
+
extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr,
struct cmsghdr *__cmsg) __THROW;
#ifdef __USE_EXTERN_INLINES
_EXTERN_INLINE struct cmsghdr *
__NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg))
{
+  /* We may safely assume that __cmsg lies within the buffer delimited by
+     __mhdr->msg_control and __mhdr->msg_controllen because the user is
+     required to obtain the first
+ cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+ via CMSG_NXTHDR, setting lengths along the way. However, we don't yet
+ trust the value of __cmsg->cmsg_len and therefore do not use it in any
+ pointer arithmetic until we check its value. */
+
+ unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control;
+ unsigned char * __cmsg_ptr = (unsigned char *) __cmsg;
+
+ size_t __size_needed = sizeof (struct cmsghdr)
+ + __CMSG_PADDING (__cmsg->cmsg_len);
+
+ /* The current header is malformed, too small to be a full header. */
if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr))
- /* The kernel header does this so there may be a reason. */
return (struct cmsghdr *) 0;
+ /* There isn't enough space between __cmsg and the end of the buffer to
+ hold the current cmsg *and* the next one. */
+ if (((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr)
+ < __size_needed)
+ || ((size_t)
+ (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr
+ - __size_needed)
+ < __cmsg->cmsg_len))
+ return (struct cmsghdr *) 0;
+
+ /* Now, we trust cmsg_len and can use it to find the next header. */
__cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg
+ CMSG_ALIGN (__cmsg->cmsg_len));
- if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control
- + __mhdr->msg_controllen)
- || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)
- > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen)))
- /* No more entries. */
- return (struct cmsghdr *) 0;
return __cmsg;
}
#endif /* Use `extern inline'. */
#include <errno.h>
#include <unistd.h>
#include <sysdep.h>
+#include <brk_call.h>
/* This must be initialized data because commons can't have aliases. */
void *__curbrk = 0;
int
__brk (void *addr)
{
- __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr);
+ __curbrk = __brk_call (addr);
if (__curbrk < addr)
{
__set_errno (ENOMEM);
--- /dev/null
+/* Invoke the brk system call. Generic Linux version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+static inline void *
+__brk_call (void *addr)
+{
+ /* The default implementation reports errors through an unchanged
+ break. */
+ return (void *) INTERNAL_SYSCALL_CALL (brk, addr);
+}
struct cmsghdr *
__cmsg_nxthdr (struct msghdr *mhdr, struct cmsghdr *cmsg)
{
+  /* We may safely assume that cmsg lies within the buffer delimited by
+     mhdr->msg_control and mhdr->msg_controllen because the user is
+     required to obtain the first
+ cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs
+ via CMSG_NXTHDR, setting lengths along the way. However, we don't yet
+ trust the value of cmsg->cmsg_len and therefore do not use it in any
+ pointer arithmetic until we check its value. */
+
+ unsigned char * msg_control_ptr = (unsigned char *) mhdr->msg_control;
+ unsigned char * cmsg_ptr = (unsigned char *) cmsg;
+
+ size_t size_needed = sizeof (struct cmsghdr)
+ + __CMSG_PADDING (cmsg->cmsg_len);
+
+ /* The current header is malformed, too small to be a full header. */
if ((size_t) cmsg->cmsg_len < sizeof (struct cmsghdr))
- /* The kernel header does this so there may be a reason. */
- return NULL;
+ return (struct cmsghdr *) 0;
+
+ /* There isn't enough space between cmsg and the end of the buffer to
+ hold the current cmsg *and* the next one. */
+ if (((size_t)
+ (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr)
+ < size_needed)
+ || ((size_t)
+ (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr
+ - size_needed)
+ < cmsg->cmsg_len))
+ return (struct cmsghdr *) 0;
+ /* Now, we trust cmsg_len and can use it to find the next header. */
cmsg = (struct cmsghdr *) ((unsigned char *) cmsg
+ CMSG_ALIGN (cmsg->cmsg_len));
- if ((unsigned char *) (cmsg + 1) > ((unsigned char *) mhdr->msg_control
- + mhdr->msg_controllen)
- || ((unsigned char *) cmsg + CMSG_ALIGN (cmsg->cmsg_len)
- > ((unsigned char *) mhdr->msg_control + mhdr->msg_controllen)))
- /* No more entries. */
- return NULL;
return cmsg;
}
libc_hidden_def (__cmsg_nxthdr)
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <kernel-features.h>
+#include <bits/timesize.h>
-#ifndef __ASSUME_TIME64_SYSCALLS
+#if __TIMESIZE != 64
# include <stdint.h>
# include <string.h>
# include <sys/socket.h>
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_thread_area 244
#define __NR_set_tid_address 96
--- /dev/null
+/* Early memory allocation for the dynamic loader. Generic version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Mark symbols hidden in static PIE for early self relocation to work. */
+#if BUILD_PIE_DEFAULT
+# pragma GCC visibility push(hidden)
+#endif
+#include <startup.h>
+
+#include <ldsodefs.h>
+#include <stddef.h>
+#include <string.h>
+#include <sysdep.h>
+#include <unistd.h>
+
+#include <brk_call.h>
+#include <mmap_call.h>
+
+/* Defined in brk.c. */
+extern void *__curbrk;
+
+void *
+_dl_early_allocate (size_t size)
+{
+ void *result;
+
+ if (__curbrk != NULL)
+ /* If the break has been initialized, brk must have run before,
+ so just call it once more. */
+ {
+ result = __sbrk (size);
+ if (result == (void *) -1)
+ result = NULL;
+ }
+ else
+ {
+ /* If brk has not been invoked, there is no need to update
+ __curbrk. The first call to brk will take care of that. */
+ void *previous = __brk_call (0);
+ result = __brk_call (previous + size);
+ if (result == previous)
+ result = NULL;
+ else
+ result = previous;
+ }
+
+ /* If brk fails, fall back to mmap. This can happen due to
+ unfortunate ASLR layout decisions and kernel bugs, particularly
+ for static PIE. */
+ if (result == NULL)
+ {
+ long int ret;
+ int prot = PROT_READ | PROT_WRITE;
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#ifdef __NR_mmap2
+ ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0);
+#else
+ ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0);
+#endif
+ if (INTERNAL_SYSCALL_ERROR_P (ret))
+ result = NULL;
+ else
+ result = (void *) ret;
+ }
+
+ return result;
+}
--- /dev/null
+/* Parse the Linux auxiliary vector.
+ Copyright (C) 1995-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <elf.h>
+#include <entry.h>
+#include <fpu_control.h>
+#include <ldsodefs.h>
+#include <link.h>
+
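+/* Array of auxiliary vector values, indexed by the AT_* tag.  AT_MINSIGSTKSZ
+   is the largest tag stored by _dl_parse_auxv, hence the array size.  */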
+typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1];
+
+/* Copy the auxiliary vector into AUXV_VALUES and set up GLRO
+ variables. */
+static inline
+void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values)
+{
+ auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT;
+ auxv_values[AT_PAGESZ] = EXEC_PAGESIZE;
+ auxv_values[AT_FPUCW] = _FPU_DEFAULT;
+
+ /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */
+ _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ),
+ "CONSTANT_MINSIGSTKSZ is constant");
+ auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ;
+
+ for (; av->a_type != AT_NULL; av++)
+ if (av->a_type <= AT_MINSIGSTKSZ)
+ auxv_values[av->a_type] = av->a_un.a_val;
+
+ GLRO(dl_pagesize) = auxv_values[AT_PAGESZ];
+ __libc_enable_secure = auxv_values[AT_SECURE];
+ GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM];
+ GLRO(dl_hwcap) = auxv_values[AT_HWCAP];
+ GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2];
+ GLRO(dl_clktck) = auxv_values[AT_CLKTCK];
+ GLRO(dl_fpu_control) = auxv_values[AT_FPUCW];
+ _dl_random = (void *) auxv_values[AT_RANDOM];
+ GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ];
+ GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR];
+#ifdef NEED_DL_SYSINFO
+ if (GLRO(dl_sysinfo_dso) != NULL)
+ GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO];
+#endif
+
+ DL_PLATFORM_AUXV
+}
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-/* Linux needs some special initialization, but otherwise uses
- the generic dynamic linker system interface code. */
-
-#include <string.h>
+#include <_itoa.h>
+#include <assert.h>
+#include <dl-auxv.h>
+#include <dl-osinfo.h>
+#include <dl-parse_auxv.h>
+#include <dl-procinfo.h>
+#include <dl-tunables.h>
+#include <elf.h>
+#include <errno.h>
#include <fcntl.h>
-#include <unistd.h>
-#include <sys/param.h>
-#include <sys/utsname.h>
#include <ldsodefs.h>
+#include <libc-internal.h>
+#include <libintl.h>
#include <not-cancel.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <tls.h>
+#include <unistd.h>
+
+#include <dl-machine.h>
+#include <dl-hwcap-check.h>
#ifdef SHARED
-# define DL_SYSDEP_INIT frob_brk ()
+extern char **_environ attribute_hidden;
+extern char _end[] attribute_hidden;
+
+/* Protect SUID program against misuse of file descriptors. */
+extern void __libc_check_standard_fds (void);
-static inline void
-frob_brk (void)
+int __libc_enable_secure attribute_relro = 0;
+rtld_hidden_data_def (__libc_enable_secure)
+/* This variable contains the lowest stack address ever used. */
+void *__libc_stack_end attribute_relro = NULL;
+rtld_hidden_data_def(__libc_stack_end)
+void *_dl_random attribute_relro = NULL;
+
+#ifndef DL_STACK_END
+# define DL_STACK_END(cookie) ((void *) (cookie))
+#endif
+
+/* Arguments passed to dl_main. */
+struct dl_main_arguments
{
- __brk (0); /* Initialize the break. */
+ const ElfW(Phdr) *phdr;
+ ElfW(Word) phnum;
+ ElfW(Addr) user_entry;
+};
+
+/* Separate function, so that dl_main can be called without the large
+ array on the stack. */
+static void
+_dl_sysdep_parse_arguments (void **start_argptr,
+ struct dl_main_arguments *args)
+{
+ _dl_argc = (intptr_t) *start_argptr;
+ _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */
+ _environ = _dl_argv + _dl_argc + 1;
+ for (char **tmp = _environ; ; ++tmp)
+ if (*tmp == NULL)
+ {
+ /* Another necessary aliasing violation. */
+ GLRO(dl_auxv) = (ElfW(auxv_t) *) (tmp + 1);
+ break;
+ }
+
+ dl_parse_auxv_t auxv_values = { 0, };
+ _dl_parse_auxv (GLRO(dl_auxv), auxv_values);
+
+ args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR];
+ args->phnum = auxv_values[AT_PHNUM];
+ args->user_entry = auxv_values[AT_ENTRY];
}
-# include <elf/dl-sysdep.c>
+ElfW(Addr)
+_dl_sysdep_start (void **start_argptr,
+ void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum,
+ ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv))
+{
+ __libc_stack_end = DL_STACK_END (start_argptr);
+
+ struct dl_main_arguments dl_main_args;
+ _dl_sysdep_parse_arguments (start_argptr, &dl_main_args);
+
+ dl_hwcap_check ();
+
+ __tunables_init (_environ);
+
+ /* Initialize DSO sorting algorithm after tunables. */
+ _dl_sort_maps_init ();
+
+ __brk (0); /* Initialize the break. */
+
+#ifdef DL_PLATFORM_INIT
+ DL_PLATFORM_INIT;
#endif
+ /* Determine the length of the platform name. */
+ if (GLRO(dl_platform) != NULL)
+ GLRO(dl_platformlen) = strlen (GLRO(dl_platform));
+
+ if (__sbrk (0) == _end)
+ /* The dynamic linker was run as a program, and so the initial break
+ starts just after our bss, at &_end. The malloc in dl-minimal.c
+ will consume the rest of this page, so tell the kernel to move the
+ break up that far. When the user program examines its break, it
+ will see this new value and not clobber our data. */
+ __sbrk (GLRO(dl_pagesize)
+ - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1)));
+
+  /* If this is a SUID program we make sure that FDs 0, 1, and 2 are
+     allocated.  If necessary we do it ourselves.  If that is not possible
+     we stop the program.  */
+ if (__builtin_expect (__libc_enable_secure, 0))
+ __libc_check_standard_fds ();
+
+ (*dl_main) (dl_main_args.phdr, dl_main_args.phnum,
+ &dl_main_args.user_entry, GLRO(dl_auxv));
+ return dl_main_args.user_entry;
+}
+
+void
+_dl_sysdep_start_cleanup (void)
+{
+}
+
+void
+_dl_show_auxv (void)
+{
+ char buf[64];
+ ElfW(auxv_t) *av;
+
+ /* Terminate string. */
+ buf[63] = '\0';
+
+ /* The following code assumes that the AT_* values are encoded
+ starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values
+ close by (otherwise the array will be too large). In case we have
+ to support a platform where these requirements are not fulfilled
+ some alternative implementation has to be used. */
+ for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av)
+ {
+ static const struct
+ {
+ const char label[22];
+ enum { unknown = 0, dec, hex, str, ignore } form : 8;
+ } auxvars[] =
+ {
+ [AT_EXECFD - 2] = { "EXECFD: ", dec },
+ [AT_EXECFN - 2] = { "EXECFN: ", str },
+ [AT_PHDR - 2] = { "PHDR: 0x", hex },
+ [AT_PHENT - 2] = { "PHENT: ", dec },
+ [AT_PHNUM - 2] = { "PHNUM: ", dec },
+ [AT_PAGESZ - 2] = { "PAGESZ: ", dec },
+ [AT_BASE - 2] = { "BASE: 0x", hex },
+ [AT_FLAGS - 2] = { "FLAGS: 0x", hex },
+ [AT_ENTRY - 2] = { "ENTRY: 0x", hex },
+ [AT_NOTELF - 2] = { "NOTELF: ", hex },
+ [AT_UID - 2] = { "UID: ", dec },
+ [AT_EUID - 2] = { "EUID: ", dec },
+ [AT_GID - 2] = { "GID: ", dec },
+ [AT_EGID - 2] = { "EGID: ", dec },
+ [AT_PLATFORM - 2] = { "PLATFORM: ", str },
+ [AT_HWCAP - 2] = { "HWCAP: ", hex },
+ [AT_CLKTCK - 2] = { "CLKTCK: ", dec },
+ [AT_FPUCW - 2] = { "FPUCW: ", hex },
+ [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex },
+ [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex },
+ [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex },
+ [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore },
+ [AT_SECURE - 2] = { "SECURE: ", dec },
+ [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str },
+ [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex },
+ [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex },
+ [AT_RANDOM - 2] = { "RANDOM: 0x", hex },
+ [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex },
+ [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec },
+ [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec },
+ [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex },
+ [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec },
+ [AT_L1D_CACHEGEOMETRY - 2] = { "L1D_CACHEGEOMETRY: 0x", hex },
+ [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec },
+ [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex },
+ [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec },
+ [AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex },
+ };
+ unsigned int idx = (unsigned int) (av->a_type - 2);
+
+ if ((unsigned int) av->a_type < 2u
+ || (idx < sizeof (auxvars) / sizeof (auxvars[0])
+ && auxvars[idx].form == ignore))
+ continue;
+
+ assert (AT_NULL == 0);
+ assert (AT_IGNORE == 1);
+
+ /* Some entries are handled in a special way per platform. */
+ if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0)
+ continue;
+
+ if (idx < sizeof (auxvars) / sizeof (auxvars[0])
+ && auxvars[idx].form != unknown)
+ {
+ const char *val = (char *) av->a_un.a_val;
+
+ if (__builtin_expect (auxvars[idx].form, dec) == dec)
+ val = _itoa ((unsigned long int) av->a_un.a_val,
+ buf + sizeof buf - 1, 10, 0);
+ else if (__builtin_expect (auxvars[idx].form, hex) == hex)
+ val = _itoa ((unsigned long int) av->a_un.a_val,
+ buf + sizeof buf - 1, 16, 0);
+
+ _dl_printf ("AT_%s%s\n", auxvars[idx].label, val);
+
+ continue;
+ }
+
+ /* Unknown value: print a generic line. */
+ char buf2[17];
+ buf2[sizeof (buf2) - 1] = '\0';
+ const char *val2 = _itoa ((unsigned long int) av->a_un.a_val,
+ buf2 + sizeof buf2 - 1, 16, 0);
+ const char *val = _itoa ((unsigned long int) av->a_type,
+ buf + sizeof buf - 1, 16, 0);
+ _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2);
+ }
+}
+
+#endif /* SHARED */
+
int
attribute_hidden
_dl_discover_osversion (void)
{
-#if defined NEED_DL_SYSINFO_DSO && defined SHARED
+#ifdef SHARED
if (GLRO(dl_sysinfo_map) != NULL)
{
/* If the kernel-supplied DSO contains a note indicating the kernel's
}
}
}
-#endif
+#endif /* SHARED */
char bufmem[64];
char *buf = bufmem;
if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure)))
return INLINE_SYSCALL (faccessat, 3, fd, file, mode);
- struct stat64 stats;
- if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
+ struct __stat64_t64 stats;
+ if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW))
return -1;
mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits. */
/* Use fstatat because fstat does not work on O_PATH descriptors
before Linux 3.6. */
- struct stat64 st;
- if (__fstatat64 (pathfd, "", &st, AT_EMPTY_PATH) != 0)
+ struct __stat64_t64 st;
+ if (__fstatat64_time64 (pathfd, "", &st, AT_EMPTY_PATH) != 0)
{
__close_nocancel (pathfd);
return -1;
int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, cpu_bits_size,
cpu_bits);
if (r > 0)
- return CPU_COUNT_S (cpu_bits_size, (cpu_set_t*) cpu_bits);
+ return CPU_COUNT_S (r, (cpu_set_t*) cpu_bits);
else if (r == -EINVAL)
    /* The input buffer is still not large enough to store the number of
       cpus.  This is an arbitrary value, assuming such systems should be
       rare and have no offline cpus.  */
return max_num_cpus;
- /* Some other error. 2 is conservative (not a uniprocessor system, so
- atomics are needed). */
- return 2;
+ /* Some other error. */
+ return 0;
}
static char *
}
static int
-get_nproc_stat (char *buffer, size_t buffer_size)
+get_nproc_stat (void)
{
+ enum { buffer_size = 1024 };
+ char buffer[buffer_size];
char *buffer_end = buffer + buffer_size;
char *cp = buffer_end;
char *re = buffer_end;
-
- /* Default to an SMP system in case we cannot obtain an accurate
- number. */
- int result = 2;
+ int result = 0;
const int flags = O_RDONLY | O_CLOEXEC;
int fd = __open_nocancel ("/proc/stat", flags);
if (fd != -1)
{
- result = 0;
-
char *l;
while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
/* The current format of /proc/stat has all the cpu* entries
return result;
}
-int
-__get_nprocs (void)
+static int
+get_nprocs_cpu_online (void)
{
enum { buffer_size = 1024 };
char buffer[buffer_size];
}
}
- result += m - n + 1;
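+      /* Only count the range when its end is not below its start.  */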
+ if (m >= n)
+ result += m - n + 1;
l = endp;
if (l < re && *l == ',')
while (l < re && *l != '\n');
__close_nocancel_nostatus (fd);
-
- if (result > 0)
- return result;
}
- return get_nproc_stat (buffer, buffer_size);
+ return result;
}
-libc_hidden_def (__get_nprocs)
-weak_alias (__get_nprocs, get_nprocs)
-
-/* On some architectures it is possible to distinguish between configured
- and active cpus. */
-int
-__get_nprocs_conf (void)
+static int
+get_nprocs_cpu (void)
{
- /* Try to use the sysfs filesystem. It has actual information about
- online processors. */
+ int count = 0;
DIR *dir = __opendir ("/sys/devices/system/cpu");
if (dir != NULL)
{
- int count = 0;
struct dirent64 *d;
while ((d = __readdir64 (dir)) != NULL)
__closedir (dir);
- return count;
}
+ return count;
+}
- enum { buffer_size = 1024 };
- char buffer[buffer_size];
- return get_nproc_stat (buffer, buffer_size);
+static int
+get_nprocs_fallback (void)
+{
+ int result;
+
+ /* Try /proc/stat first. */
+ result = get_nproc_stat ();
+ if (result != 0)
+ return result;
+
+ /* Try sched_getaffinity. */
+ result = __get_nprocs_sched ();
+ if (result != 0)
+ return result;
+
+  /* We failed to obtain an accurate count.  Be conservative: return the
+     smallest value that still indicates this is not a uniprocessor system,
+     so atomics are needed.  */
+ return 2;
+}
+
+int
+__get_nprocs (void)
+{
+ /* Try /sys/devices/system/cpu/online first. */
+ int result = get_nprocs_cpu_online ();
+ if (result != 0)
+ return result;
+
+ /* Fall back to /proc/stat and sched_getaffinity. */
+ return get_nprocs_fallback ();
+}
+libc_hidden_def (__get_nprocs)
+weak_alias (__get_nprocs, get_nprocs)
+
+/* On some architectures it is possible to distinguish between configured
+ and active cpus. */
+int
+__get_nprocs_conf (void)
+{
+ /* Try /sys/devices/system/cpu/ first. */
+ int result = get_nprocs_cpu ();
+ if (result != 0)
+ return result;
+
+ /* Fall back to /proc/stat and sched_getaffinity. */
+ return get_nprocs_fallback ();
}
libc_hidden_def (__get_nprocs_conf)
weak_alias (__get_nprocs_conf, get_nprocs_conf)
# define GLOB_LSTAT gl_lstat
# define GLOB_STAT64 __stat64_time64
# define GLOB_LSTAT64 __lstat64_time64
+# define GLOB_FSTATAT64 __fstatat64_time64
# define COMPILE_GLOB64 1
#define __NR_sendmsg 183
#define __NR_sendto 82
#define __NR_set_mempolicy 262
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 289
#define __NR_set_tid_address 237
#define __NR_setdomainname 121
#include "ucontext_i.h"
- /* Trampoline function. Non-standard calling ABI. */
+ /* Trampoline function. Non-standard calling ABI. */
/* Can not use ENTRY(__getcontext_ret) here. */
.type __getcontext_ret, @function
.hidden __getcontext_ret
__getcontext_ret:
.proc
.callinfo FRAME=0,NO_CALLS
- /* r26-r23 contain original r3-r6, but because setcontext
- does not reload r3-r6 (it's using them as temporaries)
- we must save them elsewhere and swap them back in. */
- copy %r23, %r3
- copy %r24, %r4
- copy %r25, %r5
- copy %r26, %r6
- /* r20 contains original return pointer. */
- bv 0(%r20)
+ /* Because setcontext does not reload r3-r6 (it uses them
+    as temporaries), we must load them ourselves. */
+ ldw oR3(%r26), %r3
+ ldw oR4(%r26), %r4
+ ldw oR5(%r26), %r5
+ ldw oR6(%r26), %r6
+
+ /* Also reload registers clobbered by $$dyncall. */
+ ldw oR21(%r26), %r21
+ ldw oR22(%r26), %r22
+ ldw oR31(%r26), %r31
+
+ /* oR0 contains original return pointer. */
+ ldw oR0(%r26), %rp
+ bv 0(%rp)
copy %r0, %ret0
.procend
.size __getcontext_ret, .-__getcontext_ret
stw %r17, oR17(%r26)
stw %r18, oR18(%r26)
stw %r19, oR19(%r26)
- /* stw %r20, oR20(%r26) - used for trampoline. */
+ stw %r20, oR20(%r26)
stw %r21, oR21(%r26)
stw %r22, oR22(%r26)
- /* stw %r23, oR23(%r26) - used for trampoline. */
- /* stw %r24, oR24(%r26) - used for trampoline. */
- /* stw %r25, oR25(%r26) - used for trampoline. */
- /* stw %r26, oR26(%r26) - used for trampoline. */
+ stw %r23, oR23(%r26)
+ stw %r24, oR24(%r26)
+ stw %r25, oR25(%r26)
+ stw %r26, oR26(%r26)
stw %r27, oR27(%r26)
stw %r28, oR28(%r26)
stw %r29, oR29(%r26)
stw %r0, oIASQ1(%r26)
stw %r0, oIAOQ0(%r26)
stw %r0, oIAOQ1(%r26)
- stw %r0, oSAR(%r26) /* used as flag in swapcontext(). */
+
+ /* Save SAR register. */
+ mfctl %sar, %r1
+ stw %r1, oSAR(%r26) /* MSB used as flag in swapcontext(). */
/* Store floating-point regs. */
stw %r19, -32(%sp)
.cfi_offset 19, 32
#endif
+ stw %ret1, -60(%sp)
+ .cfi_offset 29, 4
/* Set up the trampoline registers.
- r20, r23, r24, r25, r26 and r2 are clobbered
- by call to getcontext() anyway. Reuse them. */
- stw %r2, oR20(%r26)
- stw %r3, oR23(%r26)
- stw %r4, oR24(%r26)
- stw %r5, oR25(%r26)
- stw %r6, oR26(%r26)
+ Use oR0 context slot to save return value. */
+ stw %r2, oR0(%r26)
#ifdef PIC
addil LT%__getcontext_ret, %r19
ldw RT%__getcontext_ret(%r1), %r1
#ifdef PIC
ldw -32(%sp), %r19
#endif
+ ldw -60(%sp), %ret1
bv %r0(%r2)
ldwm -64(%sp), %r4
END(__getcontext)
stw %r19, -32(%sp)
.cfi_offset 19, 32
#endif
+ stw %ret1, -60(%sp)
+ .cfi_offset 29, 4
/* Save ucp. */
copy %r26, %r3
ldw oR18(%r3), %r18
ldw oR19(%r3), %r19
ldw oR20(%r3), %r20
- ldw oR21(%r3), %r21
+ ldw oR21(%r3), %r21 /* maybe clobbered by dyncall */
/* ldw oR22(%r3), %r22 - dyncall arg. */
ldw oR23(%r3), %r23
ldw oR24(%r3), %r24
ldw oR30(%r3), %sp
/* ldw oR31(%r3), %r31 - dyncall scratch register */
+ /* Restore SAR register. */
+ ldw oSAR(%r3), %r22
+ mtsar %r22
+
/* Restore floating-point registers. */
ldo oFPREGS31(%r3), %r22
fldds 0(%r22), %fr31
#ifdef PIC
ldw -32(%r30), %r19
#endif
+ ldw -60(%r30), %ret1
bv %r0(%r2)
ldwm -64(%r30), %r3
L(pseudo_end):
--- /dev/null
+/* Swap to new context.
+ Copyright (C) 2008-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "ucontext_i.h"
+
+ .text
+ENTRY(__swapcontext)
+
+ /* Copy rp to ret0 (r28). */
+ copy %rp,%ret0
+
+ /* Create a frame. */
+ ldo 64(%sp),%sp
+ .cfi_def_cfa_offset -64
+
+ /* Save the current machine context to oucp. */
+ bl __getcontext,%rp
+
+ /* Copy oucp to register ret1 (r29). __getcontext saves and
+ restores it on a normal return. It is restored from oR29
+ on reactivation. */
+ copy %r26,%ret1
+
+ /* Pop frame. */
+ ldo -64(%sp),%sp
+ .cfi_def_cfa_offset 0
+
+ /* Load return pointer from oR28. */
+ ldw oR28(%ret1),%rp
+
+ /* Return if error. */
+ or,= %r0,%ret0,%r0
+ bv,n %r0(%rp)
+
+ /* Load sc_sar flag. */
+ ldb oSAR(%ret1),%r20
+
+ /* Return if oucp context has been reactivated. */
+ or,= %r0,%r20,%r0
+ bv,n %r0(%rp)
+
+ /* Mark sc_sar flag. */
+ ldi 1,%r20
+ stb %r20,oSAR(%ret1)
+
+ /* Activate the machine context in ucp. */
+ bl __setcontext,%rp
+ ldw oR25(%ret1),%r26
+
+ /* Load return pointer. */
+ ldw oR28(%ret1),%rp
+ bv,n %r0(%rp)
+
+END(__swapcontext)
+
+weak_alias (__swapcontext, swapcontext)
+++ /dev/null
-/* Swap to new context.
- Copyright (C) 2008-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <ucontext.h>
-
-extern int __getcontext (ucontext_t *ucp);
-extern int __setcontext (const ucontext_t *ucp);
-
-int
-__swapcontext (ucontext_t *oucp, const ucontext_t *ucp)
-{
- /* Save the current machine context to oucp. */
- __getcontext (oucp);
-
- /* mark sc_sar flag to skip the setcontext call on reactivation. */
- if (oucp->uc_mcontext.sc_sar == 0) {
- oucp->uc_mcontext.sc_sar++;
-
- /* Restore the machine context in ucp. */
- __setcontext (ucp);
- }
-
- return 0;
-}
-
-weak_alias (__swapcontext, swapcontext)
endif
ifeq ($(subdir),io)
-sysdep_routines += libc-do-syscall
+sysdep_routines += libc-do-syscall libc-do-syscall-int80
endif
ifeq ($(subdir),stdlib)
#define __NR_sendmsg 370
#define __NR_sendto 369
#define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 311
#define __NR_set_thread_area 243
#define __NR_set_tid_address 258
--- /dev/null
+/* Out-of-line syscall stub for six-argument syscalls from C. For static PIE.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef SHARED
+# define I386_USE_SYSENTER 0
+# include <sysdep.h>
+
+# define __libc_do_syscall __libc_do_syscall_int80
+# include "libc-do-syscall.S"
+#endif
#include <sysdep.h>
-#ifndef OPTIMIZE_FOR_GCC_5
-
/* %eax, %ecx, %edx and %esi contain the values expected by the kernel.
%edi points to a structure with the values of %ebx, %edi and %ebp. */
cfi_restore (ebx)
ret
END (__libc_do_syscall)
-#endif
/* Linux/i386 definitions of functions used by static libc main startup.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if BUILD_PIE_DEFAULT
-/* Can't use "call *%gs:SYSINFO_OFFSET" during statup in static PIE. */
-# define I386_USE_SYSENTER 0
+/* Can't use "call *%gs:SYSINFO_OFFSET" during startup. */
+#define I386_USE_SYSENTER 0
-# include <sysdep.h>
-# include <abort-instr.h>
-
-__attribute__ ((__noreturn__))
-static inline void
-_startup_fatal (const char *message __attribute__ ((unused)))
-{
- /* This is only called very early during startup in static PIE.
- FIXME: How can it be improved? */
- ABORT_INSTRUCTION;
- __builtin_unreachable ();
-}
-
-static inline uid_t
-startup_getuid (void)
-{
- return (uid_t) INTERNAL_SYSCALL_CALL (getuid32);
-}
-
-static inline uid_t
-startup_geteuid (void)
-{
- return (uid_t) INTERNAL_SYSCALL_CALL (geteuid32);
-}
-
-static inline gid_t
-startup_getgid (void)
-{
- return (gid_t) INTERNAL_SYSCALL_CALL (getgid32);
-}
-
-static inline gid_t
-startup_getegid (void)
-{
- return (gid_t) INTERNAL_SYSCALL_CALL (getegid32);
-}
-#else
-# include_next <startup.h>
-#endif
+#include_next <startup.h>
# endif
#endif
+#if !I386_USE_SYSENTER && IS_IN (libc) && !defined SHARED
+/* Inside static libc, we have two versions. For compilation units
+ with !I386_USE_SYSENTER, the vDSO entry mechanism cannot be
+ used. */
+# define I386_DO_SYSCALL_STRING "__libc_do_syscall_int80"
+#else
+# define I386_DO_SYSCALL_STRING "__libc_do_syscall"
+#endif
+
#ifdef __ASSEMBLER__
/* Linux uses a negative return value to indicate syscall errors,
}; \
asm volatile ( \
"movl %1, %%eax\n\t" \
- "call __libc_do_syscall" \
+ "call " I386_DO_SYSCALL_STRING \
: "=a" (resultvar) \
: "i" (__NR_##name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \
: "memory", "cc")
}; \
asm volatile ( \
"movl %1, %%eax\n\t" \
- "call __libc_do_syscall" \
+ "call " I386_DO_SYSCALL_STRING \
: "=a" (resultvar) \
: "a" (name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \
: "memory", "cc")
+ifeq ($(subdir),elf)
+# ia64 does not support PT_GNU_RELRO.
+test-xfail-tst-relro-ldso = yes
+test-xfail-tst-relro-libc = yes
+endif
+
ifeq ($(subdir),misc)
sysdep_headers += sys/rse.h
endif
#define __NR_sendmsg 1205
#define __NR_sendto 1199
#define __NR_set_mempolicy 1261
+#define __NR_set_mempolicy_home_node 1474
#define __NR_set_robust_list 1298
#define __NR_set_tid_address 1233
#define __NR_setdomainname 1129
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <dl-sysdep.h>
-/* brk is used by statup before TCB is properly set. */
-#undef USE_DL_SYSINFO
+/* brk is used by startup before TCB is properly set up. */
+#define IA64_USE_NEW_STUB 0
#include <sysdeps/unix/sysv/linux/brk.c>
--- /dev/null
+/* Linux/ia64 definitions of functions used by static libc main startup.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* This code is used before the TCB is set up. */
+#define IA64_USE_NEW_STUB 0
+
+#include_next <startup.h>
#undef SYS_ify
#define SYS_ify(syscall_name) __NR_##syscall_name
-#if defined USE_DL_SYSINFO \
- && (IS_IN (libc) \
- || IS_IN (libpthread) || IS_IN (librt))
-# define IA64_USE_NEW_STUB
-#else
-# undef IA64_USE_NEW_STUB
+#ifndef IA64_USE_NEW_STUB
+# if defined USE_DL_SYSINFO && IS_IN (libc)
+# define IA64_USE_NEW_STUB 1
+# else
+# define IA64_USE_NEW_STUB 0
+# endif
+#endif
+#if IA64_USE_NEW_STUB && !USE_DL_SYSINFO
+# error IA64_USE_NEW_STUB needs USE_DL_SYSINFO
#endif
#ifdef __ASSEMBLER__
mov r15=num; \
break __IA64_BREAK_SYSCALL
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
# ifdef SHARED
# define DO_CALL(num) \
.prologue; \
(non-negative) errno on error or the return value on success.
*/
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
# define INTERNAL_SYSCALL_NCS(name, nr, args...) \
({ \
#define ASM_OUTARGS_5 ASM_OUTARGS_4, "=r" (_out4)
#define ASM_OUTARGS_6 ASM_OUTARGS_5, "=r" (_out5)
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
#define ASM_ARGS_0
#define ASM_ARGS_1 ASM_ARGS_0, "4" (_out0)
#define ASM_ARGS_2 ASM_ARGS_1, "5" (_out1)
/* Branch registers. */ \
"b6"
-#ifdef IA64_USE_NEW_STUB
+#if IA64_USE_NEW_STUB
# define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON
#else
# define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON , "b7"
/* Get the real definitions. */
#include_next <ldsodefs.h>
-/* We can assume that the kernel always provides the AT_UID, AT_EUID,
- AT_GID, and AT_EGID values in the auxiliary vector from 2.4.0 or so on. */
-#define HAVE_AUX_XID
-
-/* We can assume that the kernel always provides the AT_SECURE value
- in the auxiliary vector from 2.5.74 or so on. */
-#define HAVE_AUX_SECURE
-
-/* Starting with one of the 2.4.0 pre-releases the Linux kernel passes
- up the page size information. */
-#define HAVE_AUX_PAGESIZE
-
#endif /* ldsodefs.h */
#define __NR_sendmsg 367
#define __NR_sendto 366
#define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 304
#define __NR_set_thread_area 334
#define __NR_set_tid_address 253
--- /dev/null
+/* Private libc-internal arch-specific definitions. m68k version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If
+ not, see <https://www.gnu.org/licenses/>. */
+
+#ifndef _LIBC_LOCK_ARCH_H
+#define _LIBC_LOCK_ARCH_H
+
+/* Linux enforces 4-byte alignment on futex inputs. */
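+/* The m68k ABI only guarantees 2-byte alignment for int, so the alignment of
+   lock fields used as futexes has to be raised explicitly.  */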
+#define __LIBC_LOCK_ALIGNMENT __attribute__ ((__aligned__ (4)))
+
+#endif
#define PTR_MANGLE(var) (void) (var)
#define PTR_DEMANGLE(var) (void) (var)
-#if defined NEED_DL_SYSINFO || defined NEED_DL_SYSINFO_DSO
/* M68K needs system-supplied DSO to access TLS helpers
even when statically linked. */
-# define NEED_STATIC_SYSINFO_DSO 1
-#endif
+#define NEED_STATIC_SYSINFO_DSO 1
#define __NR_sendmsg 360
#define __NR_sendto 353
#define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 311
#define __NR_set_thread_area 243
#define __NR_set_tid_address 258
struct stat
{
+# ifdef __USE_TIME_BITS64
+# include <bits/struct_stat_time64_helper.h>
+# else
__dev_t st_dev;
int st_pad1[3]; /* Reserved for st_dev expansion */
-# ifndef __USE_FILE_OFFSET64
+# ifndef __USE_FILE_OFFSET64
__ino_t st_ino;
-# else
+# else
__ino64_t st_ino;
-# endif
+# endif
__mode_t st_mode;
__nlink_t st_nlink;
__uid_t st_uid;
__gid_t st_gid;
__dev_t st_rdev;
-# if !defined __USE_FILE_OFFSET64
+# if !defined __USE_FILE_OFFSET64
unsigned int st_pad2[2]; /* Reserved for st_rdev expansion */
__off_t st_size;
int st_pad3;
-# else
+# else
unsigned int st_pad2[3]; /* Reserved for st_rdev expansion */
__off64_t st_size;
-# endif
-# ifdef __USE_XOPEN2K8
+# endif
+# ifdef __USE_XOPEN2K8
/* Nanosecond resolution timestamps are stored in a format
equivalent to 'struct timespec'. This is the type used
whenever possible but the Unix namespace rules do not allow the
struct timespec st_atim; /* Time of last access. */
struct timespec st_mtim; /* Time of last modification. */
struct timespec st_ctim; /* Time of last status change. */
-# define st_atime st_atim.tv_sec /* Backward compatibility. */
-# define st_mtime st_mtim.tv_sec
-# define st_ctime st_ctim.tv_sec
-# else
+# define st_atime st_atim.tv_sec /* Backward compatibility. */
+# define st_mtime st_mtim.tv_sec
+# define st_ctime st_ctim.tv_sec
+# else
__time_t st_atime; /* Time of last access. */
    unsigned long int st_atimensec; /* Nsecs of last access. */
__time_t st_mtime; /* Time of last modification. */
unsigned long int st_mtimensec; /* Nsecs of last modification. */
__time_t st_ctime; /* Time of last status change. */
unsigned long int st_ctimensec; /* Nsecs of last status change. */
-# endif
+# endif
__blksize_t st_blksize;
unsigned int st_pad4;
-# ifndef __USE_FILE_OFFSET64
+# ifndef __USE_FILE_OFFSET64
__blkcnt_t st_blocks;
-# else
+# else
__blkcnt64_t st_blocks;
-# endif
+# endif
int st_pad5[14];
+# endif
};
#ifdef __USE_LARGEFILE64
struct stat64
{
+# ifdef __USE_TIME_BITS64
+# include <bits/struct_stat_time64_helper.h>
+# else
__dev_t st_dev;
unsigned int st_pad1[3]; /* Reserved for st_dev expansion */
__ino64_t st_ino;
unsigned int st_pad3;
__blkcnt64_t st_blocks;
int st_pad4[14];
+# endif /* __USE_TIME_BITS64 */
};
#endif
#define __NR_sendmsg 4179
#define __NR_sendto 4180
#define __NR_set_mempolicy 4270
+#define __NR_set_mempolicy_home_node 4450
#define __NR_set_robust_list 4309
#define __NR_set_thread_area 4283
#define __NR_set_tid_address 4252
#define __NR_sendmsg 6045
#define __NR_sendto 6043
#define __NR_set_mempolicy 6233
+#define __NR_set_mempolicy_home_node 6450
#define __NR_set_robust_list 6272
#define __NR_set_thread_area 6246
#define __NR_set_tid_address 6213
#define __NR_sendmsg 5045
#define __NR_sendto 5043
#define __NR_set_mempolicy 5229
+#define __NR_set_mempolicy_home_node 5450
#define __NR_set_robust_list 5268
#define __NR_set_thread_area 5242
#define __NR_set_tid_address 5212
--- /dev/null
+/* Generic definition of MMAP_CALL and MMAP_CALL_INTERNAL.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
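+/* MMAP_CALL reports failures through errno (INLINE_SYSCALL_CALL), while
+   MMAP_CALL_INTERNAL returns the raw kernel result and leaves errno untouched
+   (INTERNAL_SYSCALL_CALL), so it can be used during early startup.  */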
+#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+ INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
+#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+ INTERNAL_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
/* Do not accept offset not multiple of page size. */
#define MMAP_OFF_LOW_MASK (MMAP2_PAGE_UNIT - 1)
-/* An architecture may override this. */
-#ifndef MMAP_CALL
-# define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
- INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset)
-#endif
+#include <mmap_call.h>
#endif /* MMAP_INTERNAL_LINUX_H */
{
int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len,
msg_prio, abs_timeout);
- if (r == 0 || errno != ENOSYS)
+ if (r >= 0 || errno != ENOSYS)
return r;
__set_errno (EOVERFLOW);
return -1;
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
&& strcmp (mntbuf.mnt_type, "ext4") != 0)
continue;
- struct stat64 fsst;
- if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0
+ struct __stat64_t64 fsst;
+ if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0
&& st.st_dev == fsst.st_dev)
{
if (strcmp (mntbuf.mnt_type, "ext4") == 0)
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <ldsodefs.h>
-
-#if IS_IN (libc) && !defined SHARED
-int GLRO(dl_cache_line_size);
-#endif
-
-/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it
- to dl_cache_line_size. */
-#define DL_PLATFORM_AUXV \
- case AT_DCACHEBSIZE: \
- GLRO(dl_cache_line_size) = av->a_un.a_val; \
- break;
+#define DL_PLATFORM_AUXV \
+ GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE];
--- /dev/null
+#include <elf/dl-support.c>
+
+/* Populated from the auxiliary vector. */
+int _dl_cache_line_size;
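
The DL_PLATFORM_AUXV rewrite above reflects the switch from per-entry
switch/case parsing to a pre-filled array of auxiliary vector values.  In
rough sketch form (a hedged outline of the new scheme; the array and the
parsing loop live in the generic dl-parse_auxv/dl-support code, not in this
file):

/* Every auxv type up to AT_MINSIGSTKSZ gets a slot; unknown entries are
   simply ignored and missing entries stay zero.  */
unsigned long int auxv_values[AT_MINSIGSTKSZ + 1] = { 0 };

for (ElfW(auxv_t) *av = auxv; av->a_type != AT_NULL; av++)
  if (av->a_type <= AT_MINSIGSTKSZ)
    auxv_values[av->a_type] = av->a_un.a_val;

/* Architecture hooks such as DL_PLATFORM_AUXV then just index the array:  */
GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE];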
#define __NR_sendmsg 341
#define __NR_sendto 335
#define __NR_set_mempolicy 261
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 300
#define __NR_set_tid_address 232
#define __NR_setdomainname 121
#define __NR_sendmsg 341
#define __NR_sendto 335
#define __NR_set_mempolicy 261
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 300
#define __NR_set_tid_address 232
#define __NR_setdomainname 121
#define __NR_mbind 235
#define __NR_membarrier 283
#define __NR_memfd_create 279
+#define __NR_memfd_secret 447
#define __NR_migrate_pages 238
#define __NR_mincore 232
#define __NR_mkdirat 34
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
#define __NR_mbind 235
#define __NR_membarrier 283
#define __NR_memfd_create 279
+#define __NR_memfd_secret 447
#define __NR_migrate_pages 238
#define __NR_mincore 232
#define __NR_mkdirat 34
#define __NR_sendmsg 211
#define __NR_sendto 206
#define __NR_set_mempolicy 237
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 99
#define __NR_set_tid_address 96
#define __NR_setdomainname 162
--- /dev/null
+/* mmap - map files or devices into memory. Linux/s390 version.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+ ({ \
+ long int __args[6] = { (long int) (__addr), (long int) (__len), \
+ (long int) (__prot), (long int) (__flags), \
+ (long int) (__fd), (long int) (__offset) }; \
+ INLINE_SYSCALL_CALL (__nr, __args); \
+ })
+#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
+ ({ \
+ long int __args[6] = { (long int) (__addr), (long int) (__len), \
+ (long int) (__prot), (long int) (__flags), \
+ (long int) (__fd), (long int) (__offset) }; \
+ INTERNAL_SYSCALL_CALL (__nr, __args); \
+ })
+++ /dev/null
-/* mmap - map files or devices into memory. Linux/s390 version.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#ifndef MMAP_S390_INTERNAL_H
-# define MMAP_S390_INTERNAL_H
-
-#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \
- ({ \
- long int __args[6] = { (long int) (__addr), (long int) (__len), \
- (long int) (__prot), (long int) (__flags), \
- (long int) (__fd), (long int) (__offset) }; \
- INLINE_SYSCALL_CALL (__nr, __args); \
- })
-
-#include_next <mmap_internal.h>
-
-#endif
#define __NR_sendmsg 370
#define __NR_sendto 369
#define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 304
#define __NR_set_tid_address 252
#define __NR_setdomainname 121
#define __NR_sendmsg 370
#define __NR_sendto 369
#define __NR_set_mempolicy 270
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 304
#define __NR_set_tid_address 252
#define __NR_setdomainname 121
#define __NR_sendmsg 355
#define __NR_sendto 349
#define __NR_set_mempolicy 276
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 311
#define __NR_set_tid_address 258
#define __NR_setdomainname 121
+++ /dev/null
-/* Change data segment. Linux SPARC version.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <errno.h>
-#include <unistd.h>
-#include <sysdep.h>
-
-/* This must be initialized data because commons can't have aliases. */
-void *__curbrk = 0;
-
-#if HAVE_INTERNAL_BRK_ADDR_SYMBOL
-/* Old braindamage in GCC's crtstuff.c requires this symbol in an attempt
- to work around different old braindamage in the old Linux ELF dynamic
- linker. */
-weak_alias (__curbrk, ___brk_addr)
-#endif
-
-#ifdef __arch64__
-# define SYSCALL_NUM "0x6d"
-#else
-# define SYSCALL_NUM "0x10"
-#endif
-
-int
-__brk (void *addr)
-{
- register long int g1 asm ("g1") = __NR_brk;
- register long int o0 asm ("o0") = (long int) addr;
- asm volatile ("ta " SYSCALL_NUM
- : "=r"(o0)
- : "r"(g1), "0"(o0)
- : "cc");
- __curbrk = (void *) o0;
-
- if (__curbrk < addr)
- {
- __set_errno (ENOMEM);
- return -1;
- }
-
- return 0;
-}
-weak_alias (__brk, brk)
--- /dev/null
+/* Invoke the brk system call. Sparc version.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifdef __arch64__
+# define SYSCALL_NUM "0x6d"
+#else
+# define SYSCALL_NUM "0x10"
+#endif
+
+static inline void *
+__brk_call (void *addr)
+{
+ register long int g1 asm ("g1") = __NR_brk;
+ register long int o0 asm ("o0") = (long int) addr;
+ asm volatile ("ta " SYSCALL_NUM
+ : "=r"(o0)
+ : "r"(g1), "0"(o0)
+ : "cc");
+ return (void *) o0;
+}
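
The SPARC-specific __brk is gone; only the raw system-call helper remains and
the architecture-independent wrapper does the bookkeeping.  A sketch of how
__brk_call is consumed (modelled on the generic Linux brk.c, shown here only
for illustration):

int
__brk (void *addr)
{
  /* __brk_call returns the new break; the kernel leaves it unchanged on
     failure, so a result below the requested address means ENOMEM.  */
  __curbrk = __brk_call (addr);
  if (__curbrk < addr)
    {
      __set_errno (ENOMEM);
      return -1;
    }
  return 0;
}
weak_alias (__brk, brk)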
#define __NR_sendmsg 114
#define __NR_sendto 133
#define __NR_set_mempolicy 305
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 300
#define __NR_set_tid_address 166
#define __NR_setdomainname 163
#define __NR_sendmsg 114
#define __NR_sendto 133
#define __NR_set_mempolicy 305
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 300
#define __NR_set_tid_address 166
#define __NR_setdomainname 163
__waitpid (new_pid, NULL, 0);
}
else
- ec = -new_pid;
+ ec = errno;
__munmap (stack, stack_size);
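
This is the fix for bug 29109: the clone helper used by posix_spawn now
follows the usual -1/errno convention instead of returning a negated error
code, so negating the return value produced a bogus EPERM.  The pattern, in a
hedged sketch (spawn_via_clone is a hypothetical stand-in for the internal
helper):

int ec = 0;
pid_t new_pid = spawn_via_clone (&args);  /* Returns -1 and sets errno.  */
if (new_pid > 0)
  __waitpid (new_pid, NULL, 0);
else
  ec = errno;	/* Previously `ec = -new_pid', which always yielded 1 (EPERM).  */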
--- /dev/null
+/* Linux definitions of functions used by static libc main startup.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifdef SHARED
+# include_next <startup.h>
+#else
+# include <sysdep.h>
+
+/* Avoid a run-time invocation of strlen. */
+#define _startup_fatal(message) \
+ do \
+ { \
+ size_t __message_length = __builtin_strlen (message); \
+ if (! __builtin_constant_p (__message_length)) \
+ { \
+ extern void _startup_fatal_not_constant (void); \
+ _startup_fatal_not_constant (); \
+ } \
+ INTERNAL_SYSCALL_CALL (write, STDERR_FILENO, (message), \
+ __message_length); \
+ INTERNAL_SYSCALL_CALL (exit_group, 127); \
+ } \
+ while (0)
+#endif /* !SHARED */
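
The length check makes _startup_fatal usable only with messages whose length
the compiler can fold, so that no strlen call (and no relocation) is needed
this early in static startup.  A hedged usage example (the message text is
illustrative):

/* Works: string literal, __builtin_strlen folds to a constant.  */
_startup_fatal ("FATAL: early startup failure\n");

/* Would not link: a non-constant length trips the
   _startup_fatal_not_constant reference.  */
/* _startup_fatal (some_runtime_string); */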
# This file can list all potential system calls. The names are only
# used if the installed kernel headers also provide them.
-# The list of system calls is current as of Linux 5.16.
-kernel 5.16
+# The list of system calls is current as of Linux 5.19.
+kernel 5.19
FAST_atomic_update
FAST_cmpxchg
sendmsg
sendto
set_mempolicy
+set_mempolicy_home_node
set_robust_list
set_thread_area
set_tid_address
--- /dev/null
+/* Basic test for getauxval.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <sys/auxv.h>
+
+static int missing;
+static int mismatch;
+
+static void
+check_nonzero (unsigned long t, const char *s)
+{
+ unsigned long v = getauxval (t);
+ printf ("%s: %lu (0x%lx)\n", s, v, v);
+ if (v == 0)
+ missing++;
+}
+
+static void
+check_eq (unsigned long t, const char *s, unsigned long want)
+{
+ unsigned long v = getauxval (t);
+ printf ("%s: %lu want: %lu\n", s, v, want);
+ if (v != want)
+ mismatch++;
+}
+
+#define NZ(x) check_nonzero (x, #x)
+#define EQ(x, want) check_eq (x, #x, want)
+
+static int
+do_test (void)
+{
+ /* These auxv entries should be non-zero on Linux. */
+ NZ (AT_PHDR);
+ NZ (AT_PHENT);
+ NZ (AT_PHNUM);
+ NZ (AT_PAGESZ);
+ NZ (AT_ENTRY);
+ NZ (AT_CLKTCK);
+ NZ (AT_RANDOM);
+ NZ (AT_EXECFN);
+ if (missing)
+ FAIL_EXIT1 ("Found %d missing auxv entries.\n", missing);
+
+ /* Check against syscalls. */
+ EQ (AT_UID, getuid ());
+ EQ (AT_EUID, geteuid ());
+ EQ (AT_GID, getgid ());
+ EQ (AT_EGID, getegid ());
+ if (mismatch)
+ FAIL_EXIT1 ("Found %d mismatching auxv entries.\n", mismatch);
+
+ return 0;
+}
+
+#include <support/test-driver.c>
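
Outside this test, callers can distinguish an absent auxv entry from a
genuine zero value via errno; getauxval sets ENOENT when the type is not
present.  A small hedged example (assumes AT_MINSIGSTKSZ is defined by the
installed <elf.h>; older kernels simply do not provide the entry):

#include <errno.h>
#include <sys/auxv.h>

unsigned long
example_minsigstksz (void)
{
  errno = 0;
  unsigned long v = getauxval (AT_MINSIGSTKSZ);
  if (v == 0 && errno == ENOENT)
    return 0;	/* Kernel predates AT_MINSIGSTKSZ.  */
  return v;
}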
help='C compiler (including options) to use')
args = parser.parse_args()
linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc)
- linux_version_glibc = (5, 15)
+ linux_version_glibc = (5, 17)
sys.exit(glibcextract.compare_macro_consts(
'#define _GNU_SOURCE 1\n'
'#include <sys/mman.h>\n',
#include <support/xsocket.h>
#include <support/xunistd.h>
#include <stdbool.h>
+#include <socket-constants-time64.h>
/* AF_INET socket and address used to receive data. */
static int srv;
/* Enable 32 bit timeval precision and check if no 64 bit timeval stamp
is created. */
{
- int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1},
+ int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1},
sizeof (int));
TEST_VERIFY_EXIT (r != -1);
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW)
+ if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW)
found_timestamp = true;
else
- TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW);
+ TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW);
}
TEST_COMPARE (found_timestamp, sizeof (time_t) > 4);
/* Same as before, but for timespec. */
{
- int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1},
+ int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1},
sizeof (int));
TEST_VERIFY_EXIT (r != -1);
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
+ if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW)
found_timestamp = true;
else
- TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW);
+ TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW);
}
TEST_COMPARE (found_timestamp, sizeof (time_t) > 4);
/* Enable 32 bit timeval precision and check if no 64 bit timeval stamp
is created. */
{
- int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1},
+ int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1},
sizeof (int));
TEST_VERIFY_EXIT (r != -1);
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW)
+ if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW)
found_timestamp = true;
else
- TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW);
+ TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW);
}
if (sizeof (time_t) > 4)
/* Same as before, but for timespec. */
{
- int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1},
+ int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1},
sizeof (int));
TEST_VERIFY_EXIT (r != -1);
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW)
+ if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW)
found_timestamp = true;
else
- TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW);
+ TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW);
}
if (sizeof (time_t) > 4)
#define __NR_sendmsg 46
#define __NR_sendto 44
#define __NR_set_mempolicy 238
+#define __NR_set_mempolicy_home_node 450
#define __NR_set_robust_list 273
#define __NR_set_thread_area 205
#define __NR_set_tid_address 218
#define __NR_sendmsg 1073742342
#define __NR_sendto 1073741868
#define __NR_set_mempolicy 1073742062
+#define __NR_set_mempolicy_home_node 1073742274
#define __NR_set_robust_list 1073742354
#define __NR_set_thread_area 1073742029
#define __NR_set_tid_address 1073742042
tst-strcpy-rtm \
tst-strlen-rtm \
tst-strncmp-rtm \
- tst-strrchr-rtm
+ tst-strrchr-rtm \
+ tst-wcsncmp-rtm \
+# tests
CFLAGS-tst-memchr-rtm.c += -mrtm
CFLAGS-tst-memcmp-rtm.c += -mrtm
CFLAGS-tst-strchr-rtm.c += -mrtm
CFLAGS-tst-strcpy-rtm.c += -mrtm
CFLAGS-tst-strlen-rtm.c += -mrtm
-CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
endif
ifneq ($(enable-cet),no)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
- unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
- /* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
-
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;
TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
+  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
+     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
+     if that operation cannot overflow.  Minimum of 0x4040 (16448) because the
+     L(large_memset_4x) loops need 64 bytes to cache align and enough space for
+     at least one iteration of the 4x PAGE_SIZE unrolled loop.  Both values are
+     reflected in the manual.  */
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
- 0, SIZE_MAX);
+ 0x4040, SIZE_MAX >> 4);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
SIZE_MAX);
#endif
+ unsigned long int rep_movsb_stop_threshold;
+  /* The ERMS feature is implemented starting with the AMD Zen3 architecture
+     and performs poorly for data above the L2 cache size.  Hence, add an
+     upper bound threshold parameter to limit the use of Enhanced REP MOVSB
+     operations and set its value to the L2 cache size.  */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_stop_threshold = core;
+ /* Setting the upper bound of ERMS to the computed value of
+ non-temporal threshold for architectures other than AMD. */
+ else
+ rep_movsb_stop_threshold = non_temporal_threshold;
+
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
# endif
# if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
- && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE
+ && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
+ && defined __BMI__ && defined __BMI2__
/* NB: ISAs in x86-64 ISA level v3 are used. */
# define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3
# else
/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
-# define L(name) .L##name
+# define LOCAL_LABEL(name) .L##name
+# define L(name) LOCAL_LABEL(name)
#endif
#define atom_text_section .section ".text.atom", "ax"
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include <stdint.h>
#include <tst-string-rtm.h>
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME "wcsncmp"
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME "strncmp"
+#endif /* !WIDE */
+
+
+
#define LOOP 3000
#define STRING_SIZE 1024
-char string1[STRING_SIZE];
-char string2[STRING_SIZE];
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
__attribute__ ((noinline, noclone))
static int
prepare (void)
{
- memset (string1, 'a', STRING_SIZE - 1);
- memset (string2, 'a', STRING_SIZE - 1);
- if (strncmp (string1, string2, STRING_SIZE) == 0)
+ MEMSET (string1, 'a', STRING_SIZE - 1);
+ MEMSET (string2, 'a', STRING_SIZE - 1);
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
static int
function (void)
{
- if (strncmp (string1, string2, STRING_SIZE) == 0)
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow (void)
+{
+ if (STRNCMP (string1, string2, SIZE_MAX) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
return 0;
else
return 1;
static int
do_test (void)
{
- return do_test_1 ("strncmp", LOOP, prepare, function);
+ int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
+ if (status != EXIT_SUCCESS)
+ return status;
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+ if (status != EXIT_SUCCESS)
+ return status;
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+ if (status != EXIT_SUCCESS)
+ return status;
+ return status;
}
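
The two overflow variants exercise the BZ #28896 fix: with a huge length the
AVX2-RTM strncmp/wcsncmp must take the fallback path without aborting the
transaction.  do_test_1 (from tst-string-rtm.h) runs the callback inside a
hardware transaction, roughly along these lines (a hedged sketch using the
standard _xbegin/_xend intrinsics, compiled with -mrtm; not the literal
harness code):

#include <immintrin.h>

static int
run_in_rtm (int (*fn) (void))
{
  unsigned int status = _xbegin ();
  if (status == _XBEGIN_STARTED)
    {
      int r = fn ();	/* Must not execute an RTM-aborting instruction.  */
      _xend ();
      return r;
    }
  return -1;		/* Transaction aborted.  */
}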
--- /dev/null
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
+++ /dev/null
-/* Implemented in memset.S. */
# endif
/* Set to symbol size plus addend. */
value = sym->st_size;
+ *reloc_addr = value + reloc->r_addend;
+ break;
# endif
- /* Fall through. */
+
case R_X86_64_GLOB_DAT:
case R_X86_64_JUMP_SLOT:
- *reloc_addr = value + reloc->r_addend;
+ *reloc_addr = value;
break;
# ifndef RESOLVE_CONFLICT_FIND_MAP
#include <sysdep.h>
+#ifdef USE_AS_WMEMCMP
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define SIZE_OFFSET (0)
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+#endif
+
+#ifdef USE_AS_MEMCMPEQ
+# define SIZE_OFFSET (0)
+# define CHECK_CMP(x, y) subl x, y
+#else
+# ifndef SIZE_OFFSET
+# define SIZE_OFFSET (CHAR_PER_VEC * 2)
+# endif
+# define CHECK_CMP(x, y) cmpl x, y
+#endif
+
+#define VEC_SIZE 16
+#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
.text
-ENTRY (memcmp)
-#ifdef __ILP32__
+ENTRY(MEMCMP)
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
+# endif
+#ifdef USE_AS_WMEMCMP
+ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+ in ecx for code size. This is preferable to using `incw` as
+ it avoids partial register stalls on older hardware (pre
+ SnB). */
+ movl $0xffff, %ecx
#endif
- test %RDX_LP, %RDX_LP
- jz L(finz)
- cmpq $1, %rdx
- jbe L(finr1b)
- subq %rdi, %rsi
- movq %rdx, %r10
- cmpq $32, %r10
- jae L(gt32)
- /* Handle small chunks and last block of less than 32 bytes. */
-L(small):
- testq $1, %r10
- jz L(s2b)
- movzbl (%rdi), %eax
- movzbl (%rdi, %rsi), %edx
- subq $1, %r10
- je L(finz1)
- addq $1, %rdi
- subl %edx, %eax
- jnz L(exit)
-L(s2b):
- testq $2, %r10
- jz L(s4b)
- movzwl (%rdi), %eax
- movzwl (%rdi, %rsi), %edx
- subq $2, %r10
-#ifdef USE_AS_MEMCMPEQ
- je L(finz1)
+ cmpq $CHAR_PER_VEC, %rdx
+ ja L(more_1x_vec)
+
+#ifdef USE_AS_WMEMCMP
+	/* Saves a byte of code by keeping the fall-through path n = [2, 4]
+	   in the initial cache line.  */
+ decl %edx
+ jle L(cmp_0_1)
+
+ movq (%rsi), %xmm0
+ movq (%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_0)
+
+ movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_end_0_adj)
#else
- je L(fin2_7)
+ cmpl $8, %edx
+ ja L(cmp_9_16)
+
+ cmpl $4, %edx
+ jb L(cmp_0_3)
+
+# ifdef USE_AS_MEMCMPEQ
+ movl (%rsi), %eax
+ subl (%rdi), %eax
+
+ movl -4(%rsi, %rdx), %esi
+ subl -4(%rdi, %rdx), %esi
+
+ orl %esi, %eax
+ ret
+# else
+	/* Combine the low and high 4-byte comparisons.  */
+ movl -4(%rsi, %rdx), %ecx
+ movl -4(%rdi, %rdx), %eax
+ shlq $32, %rcx
+ shlq $32, %rax
+ movl (%rsi), %esi
+ movl (%rdi), %edi
+ orq %rsi, %rcx
+ orq %rdi, %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4,, 10
+L(cmp_9_16):
+# ifdef USE_AS_MEMCMPEQ
+ movq (%rsi), %rax
+ subq (%rdi), %rax
+
+ movq -8(%rsi, %rdx), %rcx
+ subq -8(%rdi, %rdx), %rcx
+ orq %rcx, %rax
+ /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
+ return long). */
+ setnz %cl
+ movzbl %cl, %eax
+# else
+ movq (%rsi), %rcx
+ movq (%rdi), %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+
+ movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
+ movq -8(%rdi, %rdx, CHAR_SIZE), %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+ xorl %eax, %eax
+# endif
#endif
- addq $2, %rdi
- cmpl %edx, %eax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+ ret
+
+ .p2align 4,, 8
+L(cmp_0_1):
+ /* Flag set by earlier comparison against 1. */
+ jne L(cmp_0_0)
+#ifdef USE_AS_WMEMCMP
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(cmp_0_0)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
#else
- jnz L(fin2_7)
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
#endif
-L(s4b):
- testq $4, %r10
- jz L(s8b)
- movl (%rdi), %eax
- movl (%rdi, %rsi), %edx
- subq $4, %r10
-#ifdef USE_AS_MEMCMPEQ
- je L(finz1)
+ ret
+
+ /* Fits in aligning bytes. */
+L(cmp_0_0):
+ xorl %eax, %eax
+ ret
+
+#ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(ret_nonzero_vec_start_0):
+ bsfl %eax, %eax
+ movl (%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+ ret
#else
- je L(fin2_7)
+
+# ifndef USE_AS_MEMCMPEQ
+ .p2align 4,, 14
+L(ret_nonzero):
+ /* Need to bswap to get proper return without branch. */
+ bswapq %rcx
+ bswapq %rax
+ subq %rcx, %rax
+ sbbl %eax, %eax
+ orl $1, %eax
+ ret
+# endif
+
+ .p2align 4
+L(cmp_0_3):
+# ifdef USE_AS_MEMCMPEQ
+	/* No reason to add to the dependency chain on rdx.  Saving the
+	   bytes here doesn't change the number of fetch blocks.  */
+ cmpl $1, %edx
+ jbe L(cmp_0_1)
+# else
+	/* We need the smaller code size to avoid taking an extra fetch
+	   block.  */
+ decl %edx
+ jle L(cmp_0_1)
+# endif
+ movzwl (%rsi), %ecx
+ movzwl (%rdi), %eax
+
+# ifdef USE_AS_MEMCMPEQ
+ subl %ecx, %eax
+
+ movzbl -1(%rsi, %rdx), %esi
+ movzbl -1(%rdi, %rdx), %edi
+ subl %edi, %esi
+ orl %esi, %eax
+# else
+ bswapl %ecx
+ bswapl %eax
+
+ /* Implicit right shift by one. We just need to displace the
+ sign bits. */
+ shrl %ecx
+ shrl %eax
+
+	/* Eat a partial register stall here.  It saves code size by keeping
+	   L(cmp_0_3) from bleeding into the next fetch block and saves an
+	   ALU.  */
+ movb (%rsi, %rdx), %cl
+ movzbl (%rdi, %rdx), %edi
+ orl %edi, %eax
+ subl %ecx, %eax
+# endif
+ ret
#endif
- addq $4, %rdi
- cmpl %edx, %eax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+
+ .p2align 5
+L(more_1x_vec):
+#ifndef USE_AS_WMEMCMP
+ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+ in ecx for code size. This is preferable to using `incw` as
+ it avoids partial register stalls on older hardware (pre
+ SnB). */
+ movl $0xffff, %ecx
+#endif
+ movups (%rsi), %xmm0
+ movups (%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_0)
+#if SIZE_OFFSET == 0
+ cmpq $(CHAR_PER_VEC * 2), %rdx
#else
- jnz L(fin2_7)
+ /* Offset rdx. Saves just enough code size to keep the
+ L(last_2x_vec) case and the non-zero return in a single
+ cache line. */
+ subq $(CHAR_PER_VEC * 2), %rdx
#endif
-L(s8b):
- testq $8, %r10
- jz L(s16b)
- movq (%rdi), %rax
- movq (%rdi, %rsi), %rdx
- subq $8, %r10
-#ifdef USE_AS_MEMCMPEQ
- je L(sub_return8)
+ ja L(more_2x_vec)
+
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+#ifndef USE_AS_MEMCMPEQ
+	/* Don't use `incw ax` as machines this code runs on are liable
+	   to have a partial register stall.  */
+ jnz L(ret_nonzero_vec_end_0)
#else
- je L(fin2_7)
+ /* Various return targets for memcmpeq. Will always be hot in
+ Icache and get short encoding. */
+L(ret_nonzero_vec_start_1):
+L(ret_nonzero_vec_start_0):
+L(ret_nonzero_vec_end_0):
#endif
- addq $8, %rdi
- cmpq %rdx, %rax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+ ret
+
+#ifndef USE_AS_MEMCMPEQ
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(ret_nonzero_vec_end_0_adj):
+ addl $3, %edx
+# else
+ .p2align 4,, 8
+# endif
+L(ret_nonzero_vec_end_0):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leal (%rax, %rdx, CHAR_SIZE), %eax
+ movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addl %edx, %eax
+ movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+# ifndef USE_AS_WMEMCMP
+ .p2align 4,, 10
+L(ret_nonzero_vec_start_0):
+ bsfl %eax, %eax
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+ ret
+# endif
#else
- jnz L(fin2_7)
#endif
-L(s16b):
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
+
+ .p2align 5
+L(more_2x_vec):
+ movups (VEC_SIZE * 1)(%rsi), %xmm0
+ movups (VEC_SIZE * 1)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_1)
+
+ cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
+ jbe L(last_2x_vec)
+
+ cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
+ ja L(more_8x_vec)
+
+	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
+	   This can harm performance if there is a non-zero return in
+	   [65, 80] or [97, 112], but helps performance otherwise.
+	   Generally a zero return is hotter.  */
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 2)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 3)(%rsi), %xmm2
+ movups (VEC_SIZE * 3)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
+ jnz L(ret_nonzero_vec_start_2_3)
+
+ cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
+ jbe L(last_2x_vec)
+
+ movups (VEC_SIZE * 4)(%rsi), %xmm0
+ movups (VEC_SIZE * 4)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 5)(%rsi), %xmm2
+ movups (VEC_SIZE * 5)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ
- pmovmskb %xmm1, %eax
- subl $0xffff, %eax
+ jz L(last_2x_vec)
ret
#else
- pmovmskb %xmm1, %edx
- xorl %eax, %eax
- subl $0xffff, %edx
- jz L(finz)
- bsfl %edx, %ecx
- leaq (%rdi, %rcx), %rcx
- movzbl (%rcx), %eax
- movzbl (%rsi, %rcx), %edx
- jmp L(finz1)
+ jnz L(ret_nonzero_vec_start_4_5)
#endif
- .p2align 4,, 4
-L(finr1b):
- movzbl (%rdi), %eax
- movzbl (%rsi), %edx
-L(finz1):
- subl %edx, %eax
-L(exit):
- ret
+ .p2align 4
+L(last_2x_vec):
+ movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
- .p2align 4,, 4
-L(sub_return8):
- subq %rdx, %rax
- movl %eax, %edx
- shrq $32, %rax
- orl %edx, %eax
+ /* Various return targets for memcmpeq. Will always be hot in
+ Icache and get short encoding. */
+L(ret_nonzero_vec_start_2_3):
+L(ret_nonzero_vec_start_4_5):
ret
#else
- .p2align 4,, 4
-L(fin2_7):
- cmpq %rdx, %rax
- jz L(finz)
- movq %rax, %r11
- subq %rdx, %r11
- bsfq %r11, %rcx
- sarq $3, %rcx
- salq $3, %rcx
- sarq %cl, %rax
- movzbl %al, %eax
- sarq %cl, %rdx
- movzbl %dl, %edx
- subl %edx, %eax
+ jnz L(ret_nonzero_vec_end_1)
ret
-#endif
- .p2align 4,, 4
-L(finz):
- xorl %eax, %eax
+
+ .p2align 4,, 8
+L(ret_nonzero_vec_end_1):
+ pmovmskb %xmm1, %ecx
+	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
+	   so we can do `or + not` with just `xor`.  */
+ rorl $16, %eax
+ xorl %ecx, %eax
+ /* Partial register stall. */
+
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leal (%rax, %rdx, CHAR_SIZE), %eax
+ movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addl %edx, %eax
+ movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
-#ifdef USE_AS_MEMCMPEQ
- .p2align 4,, 4
-L(neq_early):
- movl $1, %eax
+
+ .p2align 4
+L(ret_nonzero_vec_start_4_5):
+ pmovmskb %xmm1, %edx
+ sall $16, %eax
+ leal 1(%rax, %rdx), %eax
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(ret_nonzero_vec_start_1):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
#endif
- /* For blocks bigger than 32 bytes
- 1. Advance one of the addr pointer to be 16B aligned.
- 2. Treat the case of both addr pointers aligned to 16B
- separately to avoid movdqu.
- 3. Handle any blocks of greater than 64 consecutive bytes with
- unrolling to reduce branches.
- 4. At least one addr pointer is 16B aligned, use memory version
- of pcmbeqb.
- */
- .p2align 4,, 4
-L(gt32):
- movq %rdx, %r11
- addq %rdi, %r11
- movq %rdi, %r8
-
- andq $15, %r8
- jz L(16am)
- /* Both pointers may be misaligned. */
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- subl $0xffff, %edx
- jnz L(neq)
- neg %r8
- leaq 16(%rdi, %r8), %rdi
-L(16am):
- /* Handle two 16B aligned pointers separately. */
- testq $15, %rsi
- jz L(ATR)
- testq $16, %rdi
- jz L(A32)
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-L(A32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
- /* Pre-unroll to be ready for unrolled 64B loop. */
- testq $32, %rdi
- jz L(A64)
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(A64):
- movq %r11, %r10
- andq $-64, %r10
- cmpq %r10, %rdi
- jae L(mt32)
-
-L(A64main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A64main)
-
-L(mt32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
-
-L(A32main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A32main)
-L(mt16):
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
-
- .p2align 4,, 4
-L(neq):
-#ifdef USE_AS_MEMCMPEQ
- movl $1, %eax
- ret
-#else
- bsfl %edx, %ecx
- movzbl (%rdi, %rcx), %eax
- addq %rdi, %rsi
- movzbl (%rsi,%rcx), %edx
- jmp L(finz1)
+
+ .p2align 4
+L(more_8x_vec):
+ subq %rdi, %rsi
+ leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
+ andq $(VEC_SIZE * -1), %rdi
+ addq %rdi, %rsi
+ .p2align 4
+L(loop_4x):
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 3)(%rsi), %xmm1
+
+ PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
+ PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
+
+ movups (VEC_SIZE * 4)(%rsi), %xmm2
+ movups (VEC_SIZE * 5)(%rsi), %xmm3
+
+ PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
+ PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
+
+ pand %xmm0, %xmm1
+ pand %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_loop)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+ cmpq %rdi, %rdx
+ ja L(loop_4x)
+ /* Get remaining length in edx. */
+ subl %edi, %edx
+ /* Restore offset so we can reuse L(last_2x_vec). */
+ addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
+#ifdef USE_AS_WMEMCMP
+ shrl $2, %edx
#endif
+ cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
+ jbe L(last_2x_vec)
+
- .p2align 4,, 4
-L(ATR):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
- testq $16, %rdi
- jz L(ATR32)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- je L(mt16)
-
-L(ATR32):
- movq %r11, %r10
- andq $-64, %r10
- testq $32, %rdi
- jz L(ATR64)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(ATR64):
- cmpq %rdi, %r10
- je L(mt32)
-
-L(ATR64main):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- jne L(ATR64main)
-
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
-
-L(ATR32res):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %r10, %rdi
- jne L(ATR32res)
-
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
- /* Align to 16byte to improve instruction fetch. */
- .p2align 4,, 4
-END(memcmp)
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 2)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 3)(%rsi), %xmm2
+ movups (VEC_SIZE * 3)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
+ jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
-libc_hidden_def (memcmp)
+L(ret_nonzero_loop):
+ ret
#else
-# undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+
+ .p2align 4
+L(ret_nonzero_vec_start_2_3):
+ pmovmskb %xmm1, %edx
+ sall $16, %eax
+ leal 1(%rax, %rdx), %eax
+
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(ret_nonzero_loop):
+ pmovmskb %xmm0, %ecx
+ pmovmskb %xmm1, %edx
+ sall $(VEC_SIZE * 1), %edx
+ leal 1(%rcx, %rdx), %edx
+ pmovmskb %xmm2, %ecx
+	/* High 16 bits of eax guaranteed to be all ones.  Rotate them in
+	   so we can do `or + not` with just `xor`.  */
+ rorl $16, %eax
+ xorl %ecx, %eax
+
+ salq $32, %rax
+ orq %rdx, %rax
+
+ bsfq %rax, %rax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+#endif
+END(MEMCMP)
+
+#ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_MEMCMPEQ
+libc_hidden_def (MEMCMP)
+# else
+# undef bcmp
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
+# endif
#endif
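
At the C level, the strategy of the rewritten routine is: compare VEC_SIZE
(16) byte chunks with a packed equality test and, in the first chunk whose
mask is not all-ones, locate the mismatching byte and return its difference.
A rough scalar sketch (illustrative only; the assembly above additionally
handles short lengths, the wmemcmp/__memcmpeq variants and a 4x-unrolled
loop):

static int
example_memcmp_by_chunks (const unsigned char *s1, const unsigned char *s2,
			  size_t n)
{
  /* One iteration mirrors a PCMPEQ + PMOVMSKB step on a 16-byte vector.  */
  while (n >= 16)
    {
      for (size_t i = 0; i < 16; i++)
	if (s1[i] != s2[i])
	  return s1[i] - s2[i];	/* First differing byte decides.  */
      s1 += 16;
      s2 += 16;
      n -= 16;
    }
  for (size_t i = 0; i < n; i++)
    if (s1[i] != s2[i])
      return s1[i] - s2[i];
  return 0;
}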
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define memcmp __memcmpeq
+#define MEMCMP __memcmpeq
#define USE_AS_MEMCMPEQ 1
#include "multiarch/memcmp-sse2.S"
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#define VEC_SIZE 16
+#define PAGE_SIZE 4096
.text
-ENTRY (__memrchr)
- movd %esi, %xmm1
-
- sub $16, %RDX_LP
- jbe L(length_less16)
-
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add %RDX_LP, %RDI_LP
- pshufd $0, %xmm1, %xmm1
-
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
-
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- mov %edi, %ecx
- and $15, %ecx
- jz L(loop_prolog)
-
- add $16, %rdi
- add $16, %rdx
- and $-16, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(loop_prolog):
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches0)
-
- mov %edi, %ecx
- and $63, %ecx
- jz L(align64_loop)
-
- add $64, %rdi
- add $64, %rdx
- and $-64, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(align64_loop):
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
-
- pmaxub %xmm3, %xmm0
- pmaxub %xmm4, %xmm2
- pmaxub %xmm0, %xmm2
- pmovmskb %xmm2, %eax
-
- test %eax, %eax
- jz L(align64_loop)
-
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches48)
-
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm2
-
- pcmpeqb %xmm1, %xmm2
- pcmpeqb (%rdi), %xmm1
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- pmovmskb %xmm1, %eax
- bsr %eax, %eax
-
- add %rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+ /* Clear upper bits. */
+ mov %RDX_LP, %RDX_LP
+#endif
+ movd %esi, %xmm0
+
+ /* Get end pointer. */
+ leaq (%rdx, %rdi), %rcx
+
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
+ jz L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero.  Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte, this load must be safe.  For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
+	   into 2 cache lines.  */
+ movups -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+ zero. */
+ bsrl %eax, %eax
+ jz L(ret_0)
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+ if out of bounds. */
+ addl %edx, %eax
+ jl L(zero_0)
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+ ptr. */
+ addq %rdi, %rax
+L(ret_0):
ret
- .p2align 4
-L(exit_loop):
- add $64, %edx
- cmp $32, %edx
- jbe L(exit_loop_32)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16_1)
- cmp $48, %edx
- jbe L(return_null)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches0_1)
- xor %eax, %eax
+ .p2align 4,, 5
+L(ret_vec_x0):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
ret
- .p2align 4
-L(exit_loop_32):
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48_1)
- cmp $16, %edx
- jbe L(return_null)
-
- pcmpeqb 32(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches32_1)
- xor %eax, %eax
+ .p2align 4,, 2
+L(zero_0):
+ xorl %eax, %eax
ret
- .p2align 4
-L(matches0):
- bsr %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret
- .p2align 4
-L(matches32):
- bsr %eax, %eax
- lea 32(%rax, %rdi), %rax
+ .p2align 4,, 8
+L(more_1x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ /* Align rcx (pointer to string). */
+ decq %rcx
+ andq $-VEC_SIZE, %rcx
+
+ movq %rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and some percentage of the time actual backend uops.  */
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ subq %rdi, %rdx
+ pmovmskb %xmm1, %eax
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ subl $VEC_SIZE, %edx
+ jbe L(ret_vec_x0_test)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_1)
+ addl %edx, %eax
+ jl L(zero_0)
+ addq %rdi, %rax
+L(ret_1):
ret
- .p2align 4
-L(matches48):
- bsr %eax, %eax
- lea 48(%rax, %rdi), %rax
+	/* Don't align.  Otherwise losing the 2-byte encoding in the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+ /* Zero length check. */
+ testq %rdx, %rdx
+ jz L(zero_0)
+
+ leaq -1(%rcx), %r8
+ andq $-(VEC_SIZE), %r8
+
+ movaps (%r8), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ negl %ecx
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+ explicitly. */
+ andl $(VEC_SIZE - 1), %ecx
+ shl %cl, %esi
+ movzwl %si, %eax
+ leaq (%rdi, %rdx), %rcx
+ cmpq %rdi, %r8
+ ja L(more_1x_vec)
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_2)
+ addl %edx, %eax
+ jl L(zero_1)
+ addq %rdi, %rax
+L(ret_2):
ret
- .p2align 4
-L(matches0_1):
- bsr %eax, %eax
- sub $64, %rdx
- add %rax, %rdx
- jl L(return_null)
- add %rdi, %rax
+	/* Fits in aligning bytes.  */
+L(zero_1):
+ xorl %eax, %eax
ret
- .p2align 4
-L(matches16_1):
- bsr %eax, %eax
- sub $48, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 16(%rdi, %rax), %rax
+ .p2align 4,, 5
+L(ret_vec_x1):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
ret
- .p2align 4
-L(matches32_1):
- bsr %eax, %eax
- sub $32, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 32(%rdi, %rax), %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
- .p2align 4
-L(matches48_1):
- bsr %eax, %eax
- sub $16, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 48(%rdi, %rax), %rax
- ret
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_x1)
- .p2align 4
-L(return_null):
- xor %eax, %eax
- ret
- .p2align 4
-L(length_less16_offset0):
- test %edx, %edx
- jz L(return_null)
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
- mov %dl, %cl
- pcmpeqb (%rdi), %xmm1
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
+ addl $(VEC_SIZE), %edx
+ jle L(ret_vec_x2_test)
- pmovmskb %xmm1, %eax
+L(last_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
- bsr %eax, %eax
- add %rdi, %rax
+ subl $(VEC_SIZE), %edx
+ bsrl %eax, %eax
+ jz L(ret_3)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+L(ret_3):
ret
- .p2align 4
-L(length_less16):
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add $16, %edx
-
- pshufd $0, %xmm1, %xmm1
-
- mov %edi, %ecx
- and $15, %ecx
- jz L(length_less16_offset0)
-
- mov %cl, %dh
- mov %ecx, %esi
- add %dl, %dh
- and $-16, %rdi
-
- sub $16, %dh
- ja L(length_less16_part2)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
-
- sar %cl, %eax
- mov %dl, %cl
-
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
-
- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 6
+L(ret_vec_x2_test):
+ bsrl %eax, %eax
+ jz L(zero_2)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
ret
- .p2align 4
-L(length_less16_part2):
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
-
- mov %dh, %cl
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
+L(zero_2):
+ xorl %eax, %eax
+ ret
- test %eax, %eax
- jnz L(length_less16_part2_return)
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
+ .p2align 4,, 5
+L(ret_vec_x2):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
- mov %esi, %ecx
- sar %cl, %eax
- test %eax, %eax
- jz L(return_null)
+ .p2align 4,, 5
+L(ret_vec_x3):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_x3)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the
+	   end, keeping the code from spilling to the next cache line.  */
+ addq $(VEC_SIZE * 4 - 1), %rcx
+ andq $-(VEC_SIZE * 4), %rcx
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
+ pcmpeqb %xmm0, %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
+
+ por %xmm1, %xmm2
+ por %xmm3, %xmm4
+ por %xmm2, %xmm4
+
+ pmovmskb %xmm4, %esi
+ testl %esi, %esi
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq %rdx, %rcx
+ jne L(loop_4x_vec)
+
+ subl %edi, %edx
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 2
+L(last_4x_vec):
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+ bsrl %eax, %eax
+ jz L(ret_4)
+ addl %edx, %eax
+ jl L(zero_3)
+ addq %rdi, %rax
+L(ret_4):
ret
- .p2align 4
-L(length_less16_part2_return):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 3
+L(loop_end):
+ pmovmskb %xmm1, %eax
+ sall $16, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches.  If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4).  If eax is non-zero
+	   then CHAR is in VEC3 and bsrl will use that position.  */
+ sall $16, %eax
+ orl %esi, %eax
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
ret
-END (__memrchr)
+L(ret_vec_end):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
+ ret
+	/* Used in L(last_4x_vec).  In the same cache line.  These are just
+	   spare aligning bytes.  */
+L(zero_3):
+ xorl %eax, %eax
+ ret
+ /* 2-bytes from next cache line. */
+END(__memrchr)
weak_alias (__memrchr, memrchr)
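
The central idea of the memrchr rewrite is to work backwards from the end
pointer and to load a full vector ending at the last byte whenever that load
cannot cross a page, pushing the zero-length check into the cold page-cross
path.  A scalar sketch of the core backwards scan (illustrative only):

void *
example_memrchr (const void *s, int c, size_t n)
{
  const unsigned char *end = (const unsigned char *) s + n;
  /* The assembly scans 16-byte vectors back from `end'; this is the
     byte-at-a-time equivalent.  */
  while (end != (const unsigned char *) s)
    if (*--end == (unsigned char) c)
      return (void *) end;
  return NULL;
}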
-/* memset/bzero -- set memory area to CH/0
+/* memset -- set memory area to CH/0
Optimized version for x86-64.
Copyright (C) 2002-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
#define VMOVU movups
#define VMOVA movaps
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
movq r, %rax; \
punpcklbw %xmm0, %xmm0; \
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- pshufd $0, %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
#define SECTION(p) p
ifeq ($(subdir),string)
-sysdep_routines += strncat-c stpncpy-c strncpy-c \
- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \
- strcmp-sse4_2 strcmp-avx2 \
- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
- memrchr-sse2 memrchr-avx2 \
- memcmp-sse2 \
- memcmpeq-sse2 \
- memcmp-avx2-movbe \
- memcmpeq-avx2 \
- memcmp-sse4 memcpy-ssse3 \
- memmove-ssse3 \
- memcpy-ssse3-back \
- memmove-ssse3-back \
- memmove-avx512-no-vzeroupper \
- strcasecmp_l-sse2 strcasecmp_l-ssse3 \
- strcasecmp_l-sse4_2 strcasecmp_l-avx \
- strncase_l-sse2 strncase_l-ssse3 \
- strncase_l-sse4_2 strncase_l-avx \
- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
- strrchr-sse2 strrchr-avx2 \
- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
- strcat-avx2 strncat-avx2 \
- strcat-ssse3 strncat-ssse3\
- strcpy-avx2 strncpy-avx2 \
- strcpy-sse2 stpcpy-sse2 \
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
- stpcpy-avx2 stpncpy-avx2 \
- strcat-sse2 \
- strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
- strcspn-sse2 strpbrk-sse2 strspn-sse2 \
- strcspn-c strpbrk-c strspn-c varshift \
- memset-avx512-no-vzeroupper \
- memmove-sse2-unaligned-erms \
- memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms \
- memset-sse2-unaligned-erms \
- memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms \
- memchr-avx2-rtm \
- memcmp-avx2-movbe-rtm \
- memcmpeq-avx2-rtm \
- memmove-avx-unaligned-erms-rtm \
- memrchr-avx2-rtm \
- memset-avx2-unaligned-erms-rtm \
- rawmemchr-avx2-rtm \
- strchr-avx2-rtm \
- strcmp-avx2-rtm \
- strchrnul-avx2-rtm \
- stpcpy-avx2-rtm \
- stpncpy-avx2-rtm \
- strcat-avx2-rtm \
- strcpy-avx2-rtm \
- strlen-avx2-rtm \
- strncat-avx2-rtm \
- strncmp-avx2-rtm \
- strncpy-avx2-rtm \
- strnlen-avx2-rtm \
- strrchr-avx2-rtm \
- memchr-evex \
- memcmp-evex-movbe \
- memcmpeq-evex \
- memmove-evex-unaligned-erms \
- memrchr-evex \
- memset-evex-unaligned-erms \
- rawmemchr-evex \
- stpcpy-evex \
- stpncpy-evex \
- strcat-evex \
- strchr-evex \
- strchrnul-evex \
- strcmp-evex \
- strcpy-evex \
- strlen-evex \
- strncat-evex \
- strncmp-evex \
- strncpy-evex \
- strnlen-evex \
- strrchr-evex \
- memchr-evex-rtm \
- rawmemchr-evex-rtm
+sysdep_routines += \
+ memchr-avx2 \
+ memchr-avx2-rtm \
+ memchr-evex \
+ memchr-evex-rtm \
+ memchr-sse2 \
+ memcmp-avx2-movbe \
+ memcmp-avx2-movbe-rtm \
+ memcmp-evex-movbe \
+ memcmp-sse2 \
+ memcmp-ssse3 \
+ memcmpeq-avx2 \
+ memcmpeq-avx2-rtm \
+ memcmpeq-evex \
+ memcmpeq-sse2 \
+ memcpy-ssse3 \
+ memcpy-ssse3-back \
+ memmove-avx-unaligned-erms \
+ memmove-avx-unaligned-erms-rtm \
+ memmove-avx512-no-vzeroupper \
+ memmove-avx512-unaligned-erms \
+ memmove-erms \
+ memmove-evex-unaligned-erms \
+ memmove-sse2-unaligned-erms \
+ memmove-ssse3 \
+ memmove-ssse3-back \
+ memrchr-avx2 \
+ memrchr-avx2-rtm \
+ memrchr-evex \
+ memrchr-sse2 \
+ memset-avx2-unaligned-erms \
+ memset-avx2-unaligned-erms-rtm \
+ memset-avx512-no-vzeroupper \
+ memset-avx512-unaligned-erms \
+ memset-erms \
+ memset-evex-unaligned-erms \
+ memset-sse2-unaligned-erms \
+ rawmemchr-avx2 \
+ rawmemchr-avx2-rtm \
+ rawmemchr-evex \
+ rawmemchr-evex-rtm \
+ rawmemchr-sse2 \
+ stpcpy-avx2 \
+ stpcpy-avx2-rtm \
+ stpcpy-evex \
+ stpcpy-sse2 \
+ stpcpy-sse2-unaligned \
+ stpcpy-ssse3 \
+ stpncpy-avx2 \
+ stpncpy-avx2-rtm \
+ stpncpy-c \
+ stpncpy-evex \
+ stpncpy-sse2-unaligned \
+ stpncpy-ssse3 \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
+ strcasecmp_l-evex \
+ strcasecmp_l-sse2 \
+ strcasecmp_l-sse4_2 \
+ strcasecmp_l-ssse3 \
+ strcat-avx2 \
+ strcat-avx2-rtm \
+ strcat-evex \
+ strcat-sse2 \
+ strcat-sse2-unaligned \
+ strcat-ssse3 \
+ strchr-avx2 \
+ strchr-avx2-rtm \
+ strchr-evex \
+ strchr-sse2 \
+ strchr-sse2-no-bsf \
+ strchrnul-avx2 \
+ strchrnul-avx2-rtm \
+ strchrnul-evex \
+ strchrnul-sse2 \
+ strcmp-avx2 \
+ strcmp-avx2-rtm \
+ strcmp-evex \
+ strcmp-sse2 \
+ strcmp-sse2-unaligned \
+ strcmp-sse4_2 \
+ strcmp-ssse3 \
+ strcpy-avx2 \
+ strcpy-avx2-rtm \
+ strcpy-evex \
+ strcpy-sse2 \
+ strcpy-sse2-unaligned \
+ strcpy-ssse3 \
+ strcspn-c \
+ strcspn-sse2 \
+ strlen-avx2 \
+ strlen-avx2-rtm \
+ strlen-evex \
+ strlen-evex512 \
+ strlen-sse2 \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
+ strncase_l-evex \
+ strncase_l-sse2 \
+ strncase_l-sse4_2 \
+ strncase_l-ssse3 \
+ strncat-avx2 \
+ strncat-avx2-rtm \
+ strncat-c \
+ strncat-evex \
+ strncat-sse2-unaligned \
+ strncat-ssse3 \
+ strncmp-avx2 \
+ strncmp-avx2-rtm \
+ strncmp-evex \
+ strncmp-sse2 \
+ strncmp-sse4_2 \
+ strncmp-ssse3 \
+ strncpy-avx2 \
+ strncpy-avx2-rtm \
+ strncpy-c \
+ strncpy-evex \
+ strncpy-sse2-unaligned \
+ strncpy-ssse3 \
+ strnlen-avx2 \
+ strnlen-avx2-rtm \
+ strnlen-evex \
+ strnlen-evex512 \
+ strnlen-sse2 \
+ strpbrk-c \
+ strpbrk-sse2 \
+ strrchr-avx2 \
+ strrchr-avx2-rtm \
+ strrchr-evex \
+ strrchr-sse2 \
+ strspn-c \
+ strspn-sse2 \
+ strstr-avx512 \
+ strstr-sse2-unaligned \
+ varshift \
+# sysdep_routines
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
endif
ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
- wmemcmp-avx2-movbe \
- wmemchr-sse2 wmemchr-avx2 \
- wcscmp-sse2 wcscmp-avx2 \
- wcsncmp-sse2 wcsncmp-avx2 \
- wcscpy-ssse3 wcscpy-c \
- wcschr-sse2 wcschr-avx2 \
- wcsrchr-sse2 wcsrchr-avx2 \
- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
- wcschr-avx2-rtm \
- wcscmp-avx2-rtm \
- wcslen-avx2-rtm \
- wcsncmp-avx2-rtm \
- wcsnlen-avx2-rtm \
- wcsrchr-avx2-rtm \
- wmemchr-avx2-rtm \
- wmemcmp-avx2-movbe-rtm \
- wcschr-evex \
- wcscmp-evex \
- wcslen-evex \
- wcsncmp-evex \
- wcsnlen-evex \
- wcsrchr-evex \
- wmemchr-evex \
- wmemcmp-evex-movbe \
- wmemchr-evex-rtm
+sysdep_routines += \
+ wcschr-avx2 \
+ wcschr-avx2-rtm \
+ wcschr-evex \
+ wcschr-sse2 \
+ wcscmp-avx2 \
+ wcscmp-avx2-rtm \
+ wcscmp-evex \
+ wcscmp-sse2 \
+ wcscpy-c \
+ wcscpy-ssse3 \
+ wcslen-avx2 \
+ wcslen-avx2-rtm \
+ wcslen-evex \
+ wcslen-evex512 \
+ wcslen-sse2 \
+ wcslen-sse4_1 \
+ wcsncmp-avx2 \
+ wcsncmp-avx2-rtm \
+ wcsncmp-evex \
+ wcsncmp-sse2 \
+ wcsnlen-avx2 \
+ wcsnlen-avx2-rtm \
+ wcsnlen-c \
+ wcsnlen-evex \
+ wcsnlen-evex512 \
+ wcsnlen-sse4_1 \
+ wcsrchr-avx2 \
+ wcsrchr-avx2-rtm \
+ wcsrchr-evex \
+ wcsrchr-sse2 \
+ wmemchr-avx2 \
+ wmemchr-avx2-rtm \
+ wmemchr-evex \
+ wmemchr-evex-rtm \
+ wmemchr-sse2 \
+ wmemcmp-avx2-movbe \
+ wmemcmp-avx2-movbe-rtm \
+ wmemcmp-evex-movbe \
+ wmemcmp-sse2 \
+ wmemcmp-ssse3 \
+# sysdep_routines
endif
ifeq ($(subdir),debug)
-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
- memmove_chk-nonshared memset_chk-nonshared \
- wmemset_chk-nonshared
+sysdep_routines += \
+ memcpy_chk-nonshared \
+ memmove_chk-nonshared \
+ mempcpy_chk-nonshared \
+ memset_chk-nonshared \
+ wmemset_chk-nonshared \
+# sysdep_routines
endif
--- /dev/null
+/* Common config for AVX-RTM VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_RTM_VECS_H
+#define _AVX_RTM_VECS_H 1
+
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define USE_WITH_RTM 1
+#include "avx-vecs.h"
+
+#undef SECTION
+#define SECTION(p) p##.avx.rtm
+
+#endif
--- /dev/null
+/* Common config for AVX VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "vec-macros.h"
+
+#define USE_WITH_AVX 1
+#define SECTION(p) p##.avx
+
+/* 4-byte mov instructions with AVX2. */
+#define MOV_SIZE 4
+/* 1 (ret) + 3 (vzeroupper). */
+#define RET_SIZE 4
+#define VZEROUPPER vzeroupper
+
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm portion. */
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_ymm
+
+#endif
+++ /dev/null
-#include <sysdep.h>
-
- .text
-ENTRY(bcopy)
- xchg %rdi, %rsi
- jmp __libc_memmove /* Branch to IFUNC memmove. */
-END(bcopy)
--- /dev/null
+/* Common config for EVEX256 and EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX_VECS_COMMON_H
+#define _EVEX_VECS_COMMON_H 1
+
+#include "vec-macros.h"
+
+/* 6-byte mov instructions with EVEX. */
+#define MOV_SIZE 6
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+#define VMOVNT vmovntdq
+
+#define VEC_xmm VEC_hi_xmm
+#define VEC_ymm VEC_hi_ymm
+#define VEC_zmm VEC_hi_zmm
+
+#endif
--- /dev/null
+/* Common config for EVEX256 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX256 1
+#define SECTION(p) p##.evex
+
+#define VEC VEC_ymm
+
+#endif
--- /dev/null
+/* Common config for EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 64
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX512 1
+#define SECTION(p) p##.evex512
+
+#define VEC VEC_zmm
+
+#endif
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
- __memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
__memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (AVX),
- __strcasecmp_avx)
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_evex)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (AVX),
- __strcasecmp_l_avx)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strcasecmp_l_evex)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (AVX),
- __strncasecmp_avx)
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_evex)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (AVX),
- __strncasecmp_l_avx)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __strncasecmp_l_evex)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
/* Support sysdeps/x86_64/multiarch/strstr.c. */
IFUNC_IMPL (i, name, strstr,
+ IFUNC_IMPL_ADD (array, i, strstr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (AVX512DQ)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strstr_avx512)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
- __wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __wmemset_chk_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
return OPTIMIZE (avx2_movbe);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
return OPTIMIZE (ssse3);
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
- return OPTIMIZE (avx);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
# define MEMCHR __memchr_avx2_rtm
#endif
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
# ifdef __ILP32__
# endif
testl %eax, %eax
jz L(aligned_more)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
# ifndef USE_AS_RAWMEMCHR
- .p2align 5
+ .p2align 4
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
/* NB: Multiply length by 4 to get byte count. */
sall $2, %edx
# endif
- xorl %ecx, %ecx
+ COND_VZEROUPPER
+	/* Use a branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+	   block. Branching here as opposed to cmovcc is not that costly.
+	   Common usage of memchr is to check if the return was NULL (if the
+	   string was known to contain CHAR the user would use rawmemchr).
+	   This branch will be highly correlated with the user's branch and
+	   can be used by most modern branch predictors to predict it. */
cmpl %eax, %edx
- leaq (%rdi, %rax), %rax
- cmovle %rcx, %rax
- VZEROUPPER_RETURN
-
-L(null):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
-L(cross_page_boundary):
- /* Save pointer before aligning as its original value is
- necessary for computer return address if byte is found or
- adjusting length if it is not and this is memchr. */
- movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
- and rdi for rawmemchr. */
- orq $(VEC_SIZE - 1), %ALGN_PTR_REG
- VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate length until end of page (length checked for a
- match). */
- leaq 1(%ALGN_PTR_REG), %rsi
- subq %RRAW_PTR_REG, %rsi
-# ifdef USE_AS_WMEMCHR
- /* NB: Divide bytes by 4 to get wchar_t count. */
- shrl $2, %esi
-# endif
-# endif
- /* Remove the leading bytes. */
- sarxl %ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Check the end of data. */
- cmpq %rsi, %rdx
- jbe L(first_vec_x0)
+ jle L(null)
+ addq %rdi, %rax
+ ret
# endif
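Editorial note (not part of the patch): the comment above relies on the typical caller immediately branching on a NULL result. A C sketch of that usage pattern, with made-up buffer and helper names:

#include <string.h>
#include <stddef.h>

/* Sketch only: the common memchr usage pattern the comment refers to.
   The caller branches on NULL/non-NULL right away, so the in-library
   branch selecting the NULL return tends to be predicted together with
   this user-level branch.  */
static size_t
offset_of_delim (const char *buf, size_t len, int delim)
{
  const char *p = memchr (buf, delim, len);
  if (p == NULL)		/* the user-level branch */
    return len;			/* not found: caller handles it */
  return (size_t) (p - buf);	/* found: use the position */
}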
- testl %eax, %eax
- jz L(cross_page_continue)
- tzcntl %eax, %eax
- addq %RRAW_PTR_REG, %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x1):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+ /* First in aligning bytes here. */
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 6
L(set_zero_end):
xorl %eax, %eax
VZEROUPPER_RETURN
VZEROUPPER_RETURN
# endif
+ .p2align 4
+L(cross_page_boundary):
+	/* Save the pointer before aligning, as its original value is needed
+	   for computing the return address if a byte is found, or for
+	   adjusting the length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
+# endif
+ /* Remove the leading bytes. */
+ sarxl %ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ bsfl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
+ VZEROUPPER_RETURN
+
+
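Editorial note (not part of the patch): a short C model of the cross-page handling above; the helper name and mask convention are assumptions. The pointer is rounded down to the enclosing VEC_SIZE block, the whole block is compared, and match bits for bytes before the original pointer are shifted out.

#include <stdint.h>

#define VEC_SIZE 32

/* Sketch only: drop matches that precede the original, unaligned
   pointer within the aligned block.  After the shift, bit 0 of the
   returned mask corresponds to the byte at ptr itself.  The assembly
   uses an arithmetic shift (sarx); any bits smeared in from the top are
   harmless there because only the lowest set bit is consulted.  */
static uint32_t
mask_from_unaligned_start (uint32_t aligned_block_mask, uintptr_t ptr)
{
  unsigned int misalign = (unsigned int) (ptr & (VEC_SIZE - 1));
  return aligned_block_mask >> misalign;
}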
END (MEMCHR)
#endif
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
xorl %eax, %eax
ret
- .p2align 5
+ .p2align 4
L(first_vec_x0):
- /* Check if first match was before length. */
- tzcntl %eax, %eax
- xorl %ecx, %ecx
- cmpl %eax, %edx
- leaq (%rdi, %rax, CHAR_SIZE), %rax
- cmovle %rcx, %rax
+	/* Check if the first match was before length. NB: tzcnt has a false
+	   data dependency on its destination. eax already had a data
+	   dependency on esi, so this should have no effect here. */
+ tzcntl %eax, %esi
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rsi, CHAR_SIZE), %rdi
+# else
+ addq %rsi, %rdi
+# endif
+ xorl %eax, %eax
+ cmpl %esi, %edx
+ cmovg %rdi, %rax
ret
-# else
- /* NB: first_vec_x0 is 17 bytes which will leave
- cross_page_boundary (which is relatively cold) close enough
- to ideal alignment. So only realign L(cross_page_boundary) if
- rawmemchr. */
- .p2align 4
# endif
+
+ .p2align 4
L(cross_page_boundary):
/* Save pointer before aligning as its original value is
	   necessary for computing the return address if byte is found or
L(zero_end):
ret
+L(set_zero_end):
+ xorl %eax, %eax
+ ret
.p2align 4
L(first_vec_x1_check):
- tzcntl %eax, %eax
+ /* eax must be non-zero. Use bsfl to save code size. */
+ bsfl %eax, %eax
/* Adjust length. */
subl $-(CHAR_PER_VEC * 4), %edx
/* Check if match within remaining length. */
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
ret
-L(set_zero_end):
- xorl %eax, %eax
- ret
.p2align 4
L(loop_4x_vec_end):
# endif
ret
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1_return):
tzcntl %eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
# endif
# ifndef USE_AS_RAWMEMCHR
+ .p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
kmovd %k0, %eax
# endif
andl %ecx, %eax
jz L(zero_end2)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
ret
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
# endif
-
+ /* 7 bytes from next cache line. */
END (MEMCHR)
#endif
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
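Editorial note (not part of the patch): the [4, 7] byte path above avoids branches with two overlapping 4-byte big-endian loads per input packed into one 64-bit word, so a single subtraction orders the inputs. A C model of the same idea; the helper name is an assumption:

#include <stdint.h>
#include <string.h>

/* Sketch only: compare 4 to 7 bytes branchlessly.  The two 4-byte loads
   overlap when n < 8, exactly as the movbe pair in the assembly.  */
static int
memcmp_4_to_7 (const unsigned char *a, const unsigned char *b, size_t n)
{
  uint32_t a0, a1, b0, b1;
  memcpy (&a0, a, 4);			/* first 4 bytes */
  memcpy (&a1, a + n - 4, 4);		/* last 4 bytes (may overlap) */
  memcpy (&b0, b, 4);
  memcpy (&b1, b + n - 4, 4);
  /* Big-endian packing mirrors the shlq $32 / orq sequence.  */
  uint64_t va = ((uint64_t) __builtin_bswap32 (a0) << 32)
		| __builtin_bswap32 (a1);
  uint64_t vb = ((uint64_t) __builtin_bswap32 (b0) << 32)
		| __builtin_bswap32 (b1);
  return va == vb ? 0 : (va > vb ? 1 : -1);
}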
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# ifndef memcmp
-# define memcmp __memcmp_sse2
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse2
# endif
# ifdef SHARED
+++ /dev/null
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ pcmpeqd
-# define CHAR_SIZE 4
-#else
-# define CMPEQ pcmpeqb
-# define CHAR_SIZE 1
-#endif
-
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- cmp $79, %RDX_LP
- ja L(79bytesormore)
-
- cmp $CHAR_SIZE, %RDX_LP
- jbe L(firstbyte)
-
- /* N in (CHAR_SIZE, 79) bytes. */
- cmpl $32, %edx
- ja L(more_32_bytes)
-
- cmpl $16, %edx
- jae L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(8_to_16_bytes)
-
- cmpl $4, %edx
- jb L(2_to_3_bytes)
-
- movl (%rdi), %eax
- movl (%rsi), %ecx
-
- bswap %eax
- bswap %ecx
-
- shlq $32, %rax
- shlq $32, %rcx
-
- movl -4(%rdi, %rdx), %edi
- movl -4(%rsi, %rdx), %esi
-
- bswap %edi
- bswap %esi
-
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(2_to_3_bytes):
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(8_to_16_bytes):
- movq (%rdi), %rax
- movq (%rsi), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
- jne L(8_to_16_bytes_done)
-
- movq -8(%rdi, %rdx), %rax
- movq -8(%rsi, %rdx), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
-
-L(8_to_16_bytes_done):
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-# else
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl 4(%rdi), %ecx
- cmpl 4(%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl -4(%rdi, %rdx), %ecx
- cmpl -4(%rsi, %rdx), %ecx
- jne L(8_to_16_bytes_done)
- ret
-# endif
-
- .p2align 4,, 3
-L(ret_zero):
- xorl %eax, %eax
-L(zero):
- ret
-
- .p2align 4,, 8
-L(firstbyte):
- jb L(ret_zero)
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- je L(zero)
-L(8_to_16_bytes_done):
- setg %al
- leal -1(%rax, %rax), %eax
-# else
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_48):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin_32):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl 32(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl 32(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl 32(%rsi, %rax), %ecx
- movzbl 32(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_16):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_end_16):
- subl $16, %edx
-L(vec_return_end):
- bsfl %eax, %eax
- addl %edx, %eax
-# ifdef USE_AS_WMEMCMP
- movl -16(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl -16(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl -16(%rsi, %rax), %ecx
- movzbl -16(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4,, 8
-L(more_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm0
- movdqu 16(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- cmpl $64, %edx
- jbe L(32_to_64_bytes)
- movdqu 32(%rdi), %xmm0
- movdqu 32(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- .p2align 4,, 6
-L(32_to_64_bytes):
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(16_to_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
-
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-
- .p2align 4,, 6
-L(less128bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(last_64_bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormore):
- cmp $256, %rdx
- ja L(unaligned_loop)
-L(less256bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytes)
-
- cmp $32, %rdx
- ja L(last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_unaligned)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loop)
-
- .p2align 4,, 6
-L(loop_tail):
- addq %rdx, %rdi
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- addq %rdx, %rsi
- movdqu (%rsi), %xmm4
- movdqu 16(%rsi), %xmm5
- movdqu 32(%rsi), %xmm6
- movdqu 48(%rsi), %xmm7
-
- CMPEQ %xmm4, %xmm0
- CMPEQ %xmm5, %xmm1
- CMPEQ %xmm6, %xmm2
- CMPEQ %xmm7, %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- ret
-
-L(L2_L3_cache_unaligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
-
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(L2_L3_unaligned_128bytes_loop)
- jmp L(loop_tail)
-
-
- /* This case is for machines which are sensitive for unaligned
- * instructions. */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(aligned_last_64_bytes):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $256, %rdx
- ja L(aligned_loop)
-L(less256bytesin2alinged):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytesin2aligned)
-
- cmp $32, %rdx
- ja L(aligned_last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_aligned)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loopin2aligned)
- jmp L(loop_tail)
-
-L(L2_L3_cache_aligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- addq $64, %rsi
- addq $64, %rdi
- subq $64, %rdx
- ja L(L2_L3_aligned_128bytes_loop)
- jmp L(loop_tail)
-
- .p2align 4
-L(64bytesormore_loop_end):
- pmovmskb %xmm0, %ecx
- incw %cx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm1, %ecx
- notw %cx
- sall $16, %ecx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm2, %ecx
- notw %cx
- shlq $32, %rcx
- jnz L(loop_end_ret)
-
- addq $48, %rdi
- addq $48, %rsi
- movq %rax, %rcx
-
- .p2align 4,, 6
-L(loop_end_ret):
- bsfq %rcx, %rcx
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rcx), %eax
- xorl %edx, %edx
- cmpl (%rsi, %rcx), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-END (MEMCMP)
-#endif
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#ifndef memcmp
-# define memcmp __memcmpeq_sse2
+#if IS_IN (libc)
+# define MEMCMP __memcmpeq_sse2
+#else
+# define MEMCMP __memcmpeq
#endif
#define USE_AS_MEMCMPEQ 1
#include "memcmp-sse2.S"
--- /dev/null
+/* memcpy/mempcpy/memmove implemented with rep movsb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__mempcpy_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_erms)
+
+/* Only used to measure performance of REP MOVSB. */
+ENTRY (__mempcpy_erms)
+ mov %RDI_LP, %RAX_LP
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+ add %RDX_LP, %RAX_LP
+ jmp L(start_movsb)
+END (__mempcpy_erms)
+
+ENTRY (__memmove_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_erms)
+
+ENTRY (__memmove_erms)
+ movq %rdi, %rax
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+L(start_movsb):
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
+ jb 1f
+ /* Source == destination is less common. */
+ je 2f
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
+ jb L(movsb_backward)
+1:
+ rep movsb
+2:
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+#endif
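Editorial note (not part of the patch): the new __memmove_erms above picks a copy direction before `rep movsb`: forward unless the destination starts inside the source range. The decision expressed in C as a model of what the assembly does (pointer comparisons across objects are shown for illustration, not as strictly portable C):

#include <stddef.h>

/* Sketch only: the direction choice made by __memmove_erms, with
   rep movsb modelled as plain byte loops.  */
static void *
memmove_erms_model (void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if (len == 0 || d == s)
    return dst;
  if (d < s || d >= s + len)
    for (size_t i = 0; i < len; i++)	/* forward: rep movsb */
      d[i] = s[i];
  else
    for (size_t i = len; i-- > 0; )	/* backward: std; rep movsb; cld */
      d[i] = s[i];
  return dst;
}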
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif
-/* Amount to shift rdx by to compare for memcpy_large_4x. */
+/* Amount to shift __x86_shared_non_temporal_threshold by to get the
+   bound for memcpy_large_4x. This is essentially used to indicate
+   that the copy is far beyond the scope of L3 (assuming no
+   user-configured x86_non_temporal_threshold) and that a more
+   aggressively unrolled loop should be used. NB: before increasing
+   the value also update the initialization of
+   x86_non_temporal_threshold. */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
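Editorial note (not part of the patch): the relation between the two thresholds described above can be written as a one-line check. The tunable name is the real one; its type here and the helper are assumptions for illustration:

#include <stddef.h>

/* Type assumed for illustration only.  */
extern long __x86_shared_non_temporal_threshold;
#define LOG_4X_MEMCPY_THRESH 4

/* Sketch only: the 2x non-temporal path is taken once len reaches the
   shared non-temporal threshold; the 4x path once len reaches that
   threshold shifted left by LOG_4X_MEMCPY_THRESH (the shlq in
   L(large_memcpy_2x)).  */
static int
use_large_memcpy_4x (size_t len)
{
  return len >= ((size_t) __x86_shared_non_temporal_threshold
		 << LOG_4X_MEMCPY_THRESH);
}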
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__mempcpy_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_erms)
-
-/* Only used to measure performance of REP MOVSB. */
-ENTRY (__mempcpy_erms)
- mov %RDI_LP, %RAX_LP
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
- add %RDX_LP, %RAX_LP
- jmp L(start_movsb)
-END (__mempcpy_erms)
-
-ENTRY (__memmove_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_erms)
-
-ENTRY (__memmove_erms)
- movq %rdi, %rax
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
-L(start_movsb):
- mov %RDX_LP, %RCX_LP
- cmp %RSI_LP, %RDI_LP
- jb 1f
- /* Source == destination is less common. */
- je 2f
- lea (%rsi,%rcx), %RDX_LP
- cmp %RDX_LP, %RDI_LP
- jb L(movsb_backward)
-1:
- rep movsb
-2:
- ret
-L(movsb_backward):
- leaq -1(%rdi,%rcx), %rdi
- leaq -1(%rsi,%rcx), %rsi
- std
- rep movsb
- cld
- ret
-END (__memmove_erms)
-strong_alias (__memmove_erms, __memcpy_erms)
-strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
-# endif
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
- cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
- jb L(more_8x_vec_check)
+	/* Entry from L(large_memcpy_2x) has a redundant load of
+	   __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x)
+	   is only used for the non-erms memmove, which is generally less
+	   common. */
L(large_memcpy_2x):
+ mov __x86_shared_non_temporal_threshold(%rip), %R11_LP
+ cmp %R11_LP, %RDX_LP
+ jb L(more_8x_vec_check)
/* To reach this point it is impossible for dst > src and
overlap. Remaining to check is src > dst and overlap. rcx
already contains dst - src. Negate rcx to get src - dst. If
/* ecx contains -(dst - src). not ecx will return dst - src - 1
which works for testing aliasing. */
notl %ecx
+ movq %rdx, %r10
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
- movq %rdx, %r10
- shrq $LOG_4X_MEMCPY_THRESH, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ /* r11 has __x86_shared_non_temporal_threshold. Shift it left
+ by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.
+ */
+ shlq $LOG_4X_MEMCPY_THRESH, %r11
+ cmp %r11, %rdx
jae L(large_memcpy_4x)
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 2 - 1), %edx
/* r10 stores outer loop counter. */
- shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+ shrq $(LOG_PAGE_SIZE + 1), %r10
/* Copy 4x VEC at a time from 2 pages. */
.p2align 4
L(loop_large_memcpy_2x_outer):
.p2align 4
L(large_memcpy_4x):
- movq %rdx, %r10
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 4 - 1), %edx
/* r10 stores outer loop counter. */
# define MEMRCHR __memrchr_avx2_rtm
#endif
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
# include <sysdep.h>
# ifndef MEMRCHR
-# define MEMRCHR __memrchr_avx2
+# define MEMRCHR __memrchr_avx2
# endif
# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
+# define VZEROUPPER vzeroupper
# endif
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
-# define VEC_SIZE 32
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)
- .section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
- /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
- vpbroadcastb %xmm0, %ymm0
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
+	/* Get the end pointer. Minus one for two reasons: 1) it is necessary
+	   for a correct page cross check and 2) it correctly sets up the end
+	   pointer so that the lzcnt result can be subtracted from it
+	   directly. */
+ leaq -1(%rdx, %rdi), %rax
- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
+ vpbroadcastb %xmm0, %ymm0
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
+ /* Check if we can load 1x VEC without cross a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which guarantees that edx (len) is no greater than it. */
+ lzcntl %ecx, %ecx
+
+ /* Hoist vzeroupper (not great for RTM) to save code size. This allows
+ all logic for edx (len) <= VEC_SIZE to fit in first cache line. */
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
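Editorial note (not part of the patch): the end-pointer-minus-one setup and lzcnt subtraction above reduce to simple arithmetic: the last match in a 32-byte block ending at the biased end pointer is end_minus_1 - lzcnt(mask). A C model; the helper name and mask convention are assumptions:

#include <stdint.h>
#include <stddef.h>

/* Sketch only: with the end pointer biased by -1, the last match is
   end_minus_1 - lzcnt(mask).  When mask is 0, lzcnt yields 32, which
   the length check rejects (the jle to L(zero_0) above).  Bit i of
   mask marks a match at byte i of the 32-byte block that ends at
   end_minus_1.  */
static const unsigned char *
last_match_in_vec (const unsigned char *end_minus_1, uint32_t mask,
		   unsigned int len)
{
  unsigned int lz = mask ? (unsigned int) __builtin_clz (mask) : 32;
  if (len <= lz)
    return NULL;
  return end_minus_1 - lz;
}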
- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
-
- vpcmpeqb %ymm1, %ymm0, %ymm1
- vpcmpeqb %ymm2, %ymm0, %ymm2
- vpcmpeqb %ymm3, %ymm0, %ymm3
- vpcmpeqb %ymm4, %ymm0, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jz L(loop_4x_vec)
-
- /* There is a match. */
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpmovmskb %ymm1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
-
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Align rax (string pointer). */
+ andq $-VEC_SIZE, %rax
+
+ /* Recompute remaining length after aligning. */
+ movq %rax, %rdx
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+ subq %rdi, %rdx
+ decq %rax
+ vpmovmskb %ymm1, %ecx
+ /* Fall through for short (hotter than length). */
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+	/* This is an inexpensive place to put this code with regard to code
+	   size / target alignment / ICache NLP. It is necessary for the
+	   2-byte encoding of the jump to the page cross case, which in turn
+	   is necessary for the hot path (len <= VEC_SIZE) to fit in the
+	   first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpeqb (%rsi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %r8d
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %r8d
+ shlxl %r8d, %ecx, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
+ .p2align 4,, 11
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- .p2align 4
-L(zero):
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret
- .p2align 4
-L(null):
+ /* First in aligning bytes. */
+L(zero_2):
xorl %eax, %eax
ret
- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
+ .p2align 4,, 4
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret
- vpcmpeqb (%rdi), %ymm0, %ymm1
- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
+ .p2align 4,, 11
+L(ret_vec_x2):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- vpmovmskb %ymm1, %eax
+ .p2align 4,, 14
+L(ret_vec_x3):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
.p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
- /* Check for zero length. */
- testl %edx, %edx
- jz L(null)
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ /* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ /* Align rax to (VEC_SIZE - 1). */
+ orq $(VEC_SIZE * 4 - 1), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ orq $(VEC_SIZE * 4 - 1), %rdx
- /* Check the last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-
- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ .p2align 4
+L(loop_4x_vec):
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ vpor %ymm1, %ymm2, %ymm2
+ vpor %ymm3, %ymm4, %ymm4
+ vpor %ymm2, %ymm4, %ymm4
+ vpmovmskb %ymm4, %esi
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ testl %esi, %esi
+ jnz L(loop_end)
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- VZEROUPPER_RETURN
+ addq $(VEC_SIZE * -4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)
- .p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
+ subl %edi, %edx
+ incl %edx
- /* Check the last VEC. */
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
- vpmovmskb %ymm1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
- /* Remove the trailing bytes. */
- andl %edx, %eax
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
- testl %eax, %eax
- jnz L(last_vec_x1)
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
- /* Check the second last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret0)
+ xorl %eax, %eax
+L(ret0):
+ ret
- movl %r8d, %ecx
- vpmovmskb %ymm1, %eax
+ .p2align 4
+L(loop_end):
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ vpmovmskb %ymm3, %ecx
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN
- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ .p2align 4,, 4
+L(ret_vec_x1_end):
+ /* 64-bit version will automatically add 32 (VEC_SIZE). */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
+ VZEROUPPER_RETURN
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
VZEROUPPER_RETURN
-END (MEMRCHR)
+
+ /* 2 bytes until next cache line. */
+END(MEMRCHR)
#endif
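
A minimal C sketch (editorial illustration, not part of the patch) of the end-pointer/lzcnt addressing the rewritten memrchr above relies on: the vector compare yields a byte mask whose bit i corresponds to end[i - 31], so the last match is at end - lzcnt(mask). The function name is hypothetical; it assumes 1 <= len <= 32 and that the 32 bytes ending at s[len - 1] are readable (the real code handles the page-cross case separately).

    #include <stddef.h>

    static void *
    memrchr_one_vec_model (const unsigned char *s, int c, size_t len)
    {
      const unsigned char *end = s + len - 1;
      unsigned int mask = 0;

      /* Model of vpcmpeqb + vpmovmskb over the 32 bytes ending at *end.  */
      for (int i = 0; i < 32; i++)
        if (end[i - 31] == (unsigned char) c)
          mask |= 1u << i;

      /* Hardware lzcnt yields 32 for a zero mask, so the length check below
         also rejects the no-match case.  */
      unsigned int lz = mask == 0 ? 32 : (unsigned int) __builtin_clz (mask);
      if (lz >= len)              /* Match (if any) lies before s.  */
        return NULL;
      return (void *) (end - lz);
    }
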
#if IS_IN (libc)
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_evex
+# endif
+
+# define PAGE_SIZE 4096
+# define VECMATCH VEC(0)
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)
+
+ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end ptr to
+	   be subtracted by lzcnt.  */
+ leaq -1(%rdi, %rdx), %rax
+ vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+
+ /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE) which
+ will guarantee edx (len) is less than it. */
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
-# define VMOVA vmovdqa64
-
-# define YMMMATCH ymm16
-
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
- /* Broadcast CHAR to YMMMATCH. */
- vpbroadcastb %esi, %YMMMATCH
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
-
- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jz L(loop_4x_vec)
-
- /* There is a match. */
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- kmovd %k1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0_dec):
+ decq %rax
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
+ /* Align rax (pointer to string). */
+ andq $-VEC_SIZE, %rax
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
+ /* Recompute length after aligning. */
+ movq %rax, %rdx
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- ret
+ subq %rdi, %rdx
- .p2align 4
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
+
+ /* Must dec rax because L(ret_vec_x0_test) expects it. */
+ decq %rax
cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ /* NB: 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
+ /* Inexpensive place to put this regarding code size / target alignments
+ / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+ in first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpb $0, (%rsi), %VECMATCH, %k0
+ kmovd %k0, %r8d
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %ecx
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %ecx
+ shlxl %ecx, %r8d, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_1)
+ subq %rcx, %rax
ret
- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
+ /* Continue creating zero labels that fit in aligning bytes and get
+ 2-byte encoding / are in the same cache line as condition. */
+L(zero_1):
+ xorl %eax, %eax
ret
- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
ret
- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)
- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- ret
+ vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- .p2align 4
-L(zero):
- xorl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
ret
- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
-
- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
-
- kmovd %k1, %eax
-
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
-
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
ret
- .p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
-
- /* Check for zero length. */
- testl %edx, %edx
- jz L(zero)
-
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
-
- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ .p2align 4,, 8
+L(ret_vec_x2):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
- /* Check the last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ /* Check if near end before re-aligning (otherwise might do an
+ unnecessary loop iteration). */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- ret
+ decq %rax
+ andq $-(VEC_SIZE * 4), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ andq $-(VEC_SIZE * 4), %rdx
.p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
-
- /* Check the last VEC. */
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 where not-equals and 0 where equals in k1 (used to mask later
+ on). */
+ vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+ /* VEC(2/3) will have zero-byte where we found a CHAR. */
+ vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+ vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+ vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+ /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found).  */
+ vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}
+ vptestnmb %VEC(3), %VEC(3), %k2
+
+ /* Any 1s and we found CHAR. */
+ kortestd %k2, %k4
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)
+
+ /* Need to re-adjust rdx / rax for L(last_4x_vec). */
+ subq $-(VEC_SIZE * 4), %rdx
+ movq %rdx, %rax
+ subl %edi, %edx
+L(last_4x_vec):
+
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
- kmovd %k1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
+ vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- /* Check the second last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
- movl %r8d, %ecx
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
- kmovd %k1, %eax
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret_1)
+ xorl %eax, %eax
+L(ret_1):
+ ret
- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 6
+L(loop_end):
+ kmovd %k1, %ecx
+ notl %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vptestnmb %VEC(2), %VEC(2), %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ kmovd %k2, %ecx
+ kmovd %k4, %esi
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ addq %rcx, %rax
+ ret
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ addq $(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * 2)(%rax, %rcx), %rax
ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
#endif
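
A minimal C sketch (editorial illustration, not part of the patch) of the mask-combining step in L(loop_end) above: the match mask of the higher-addressed vector is shifted into the upper 32 bits so that a single bsr selects the match closest to the end of the 64-byte window. The function name is hypothetical.

    #include <stdint.h>

    /* mask_hi covers window bytes 32..63, mask_lo covers bytes 0..31.
       Returns the offset of the last matching byte, or -1 if none.  */
    static int
    last_match_offset (uint32_t mask_hi, uint32_t mask_lo)
    {
      uint64_t combined = ((uint64_t) mask_hi << 32) | mask_lo;
      if (combined == 0)
        return -1;
      return 63 - __builtin_clzll (combined);   /* == bsr of combined.  */
    }
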
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
+ movq r, %rax;
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
+# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
--- /dev/null
+/* memset implemented with rep stosb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__memset_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz L(stosb_return_zero)
+ mov %RDX_LP, %RCX_LP
+ movzbl %sil, %eax
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ ret
+L(stosb_return_zero):
+ movq %rdi, %rax
+ ret
+END (__memset_erms)
+#endif
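
A minimal C sketch (editorial illustration, not part of the patch) of what the __memset_erms entry above computes: rep stosb stores AL to [RDI] RCX times, and the original destination pointer is preserved so it can be returned. The function name is hypothetical; the inline asm is x86-64 specific.

    #include <stddef.h>

    static void *
    memset_erms_model (void *dest, int c, size_t n)
    {
      void *ret = dest;               /* memset returns its first argument.  */
      if (n != 0)
        __asm__ volatile ("rep stosb"
                          : "+D" (dest), "+c" (n)    /* rdi and rcx are updated.  */
                          : "a" ((unsigned char) c)  /* Byte value in al.  */
                          : "memory");
      return ret;
    }
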
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
# endif
# undef weak_alias
-# define weak_alias(original, alias) \
- .weak bzero; bzero = __bzero
-
+# define weak_alias(original, alias)
# undef strong_alias
# define strong_alias(ignored1, ignored2)
#endif
-/* memset/bzero with unaligned store and rep stosb
+/* memset with unaligned store and rep stosb
Copyright (C) 2016-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
+# define MOVD vmovd
# else
# define MOVQ movq
+# define MOVD movd
# endif
#endif
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
+# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
+# define LESS_VEC_REG rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL 1
+#else
+# define XMM_SMALL 0
+#endif
+
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64 rcx
+# define SET_REG32 ecx
+# define SET_REG16 cx
+# define SET_REG8 cl
+#else
+# define SET_REG64 rsi
+# define SET_REG32 esi
+# define SET_REG16 si
+# define SET_REG8 sil
#endif
#define PAGE_SIZE 4096
# error SECTION is not defined!
#endif
- .section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
- mov %RDI_LP, %RAX_LP /* Set return value. */
- mov %RSI_LP, %RDX_LP /* Set n. */
- xorl %esi, %esi
- pxor %XMM0, %XMM0
- jmp L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
-#endif
-
+ .section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
shl $2, %RDX_LP
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ WMEMSET_VDUP_TO_VEC0_LOW()
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec_from_wmemset)
+ WMEMSET_VDUP_TO_VEC0_HIGH()
+ jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
#endif
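
A minimal C sketch (editorial illustration, not part of the patch) of how the __wmemset entry above reuses the byte-oriented memset body: the wide character becomes a broadcast 4-byte pattern and the element count is scaled to bytes (the shl $2). The function name is hypothetical and it assumes the 4-byte wchar_t used on these targets.

    #include <stddef.h>
    #include <string.h>
    #include <wchar.h>

    static wchar_t *
    wmemset_model (wchar_t *s, wchar_t c, size_t n)
    {
      unsigned char *p = (unsigned char *) s;
      for (size_t i = 0; i < n; i++)    /* n elements == n * 4 bytes.  */
        memcpy (p + 4 * i, &c, 4);      /* Store the 4-byte pattern.  */
      return s;
    }
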
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
-L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB. */
-ENTRY (__memset_erms)
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jnz L(stosb)
- movq %rdi, %rax
- ret
-# else
-/* Provide a hidden symbol to debugger. */
- .hidden MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
- mov %RDX_LP, %RCX_LP
- movzbl %sil, %eax
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
# endif
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH ()
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
- */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
- .p2align 4,, 10
+ .p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
+L(less_vec_from_wmemset):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
and (4x, 8x] jump to target. */
L(more_2x_vec):
-
- /* Two different methods of setting up pointers / compare. The
- two methods are based on the fact that EVEX/AVX512 mov
- instructions take more bytes then AVX2/SSE2 mov instructions. As
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
- this saves code size and keeps a few targets in one fetch block.
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
- LOOP_4X_OFFSET) with LEA_BID. */
-
- /* END_REG is rcx for EVEX/AVX512. */
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), VEC_SIZE(%rax)
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes than AVX2/SSE2 mov instructions, and that EVEX/AVX512
+	   machines also have fast LEA_BID. Both set up END_REG to avoid a
+	   complex address mode. For EVEX/AVX512 this saves code size and keeps a
+	   few targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_2x_vec)
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+ LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
/* Define L(less_vec) only if not otherwise defined. */
.p2align 4
L(less_vec):
+	/* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcasts to
+	   xmm). This only does anything for AVX2.  */
+ MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
- jae L(between_32_63)
+ jge L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
- jae L(between_16_31)
+ jge L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+ MOVQ %XMM0, %SET_REG64
#endif
- MOVQ %XMM0, %rdi
cmpl $8, %edx
- jae L(between_8_15)
+ jge L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
+ jge L(between_4_7)
cmpl $1, %edx
- ja L(between_2_3)
- jb L(return)
- movb %sil, (%rax)
- VZEROUPPER_RETURN
+ jg L(between_2_3)
+ jl L(between_0_0)
+ movb %SET_REG8, (%LESS_VEC_REG)
+L(between_0_0):
+ ret
- /* Align small targets only if not doing so would cross a fetch
- line. */
+ /* Align small targets only if not doing so would cross a fetch line.
+ */
#if VEC_SIZE > 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, (%rax)
- VMOVU %YMM0, -32(%rax, %rdx)
+ VMOVU %YMM0, (%LESS_VEC_REG)
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE >= 32
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
- VMOVU %XMM0, (%rax)
- VMOVU %XMM0, -16(%rax, %rdx)
- VZEROUPPER_RETURN
+ VMOVU %XMM0, (%LESS_VEC_REG)
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+ ret
#endif
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq %rdi, (%rax)
- movq %rdi, -8(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVQ %XMM0, (%rdi)
+ MOVQ %XMM0, -8(%rdi, %rdx)
+#else
+ movq %SET_REG64, (%LESS_VEC_REG)
+ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %edi, (%rax)
- movl %edi, -4(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVD %XMM0, (%rdi)
+ MOVD %XMM0, -4(%rdi, %rdx)
+#else
+ movl %SET_REG32, (%LESS_VEC_REG)
+ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %di, (%rax)
- movb %dil, -1(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ movb %SET_REG8, (%rdi)
+ movb %SET_REG8, 1(%rdi)
+ movb %SET_REG8, -1(%rdi, %rdx)
+#else
+ movw %SET_REG16, (%LESS_VEC_REG)
+ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
+#endif
+ ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
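
A minimal C sketch (editorial illustration, not part of the patch) of the branchless overlapping-store technique used by the small-size cases above: one store at the start and one at the end cover every length in [N, 2N] with exactly two stores, mirroring L(between_8_15). The function name is hypothetical.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Works for any 8 <= n <= 16; the dispatch above routes 8..15 here.  */
    static void
    memset_8_to_16_model (unsigned char *dst, int c, size_t n)
    {
      uint64_t v = 0x0101010101010101ULL * (unsigned char) c;  /* Broadcast byte.  */
      memcpy (dst, &v, 8);             /* First 8 bytes.  */
      memcpy (dst + n - 8, &v, 8);     /* Last 8 bytes; overlaps when n < 16.  */
    }
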
--- /dev/null
+/* Common config for SSE2 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 16
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2 1
+#define SECTION(p) p
+
+/* 3-byte mov instructions with SSE2. */
+#define MOV_SIZE 3
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU movups
+#define VMOVA movaps
+#define VMOVNT movntdq
+
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_xmm
+
+
+#endif
+++ /dev/null
-/* strcasecmp_l optimized with AVX.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#define STRCMP_SSE42 __strcasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
--- /dev/null
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
--- /dev/null
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
--- /dev/null
+/* strcasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
VPBROADCAST %xmm0, %ymm0
- vpxor %xmm9, %xmm9, %xmm9
+ vpxor %xmm1, %xmm1, %xmm1
/* Check if we cross page boundary with one vector load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqu (%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rdi, %rax), %CHAR_REG
- jne L(zero)
-# endif
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x4):
- tzcntl %eax, %eax
- addq $(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
+ /* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch a branch miss here should save what otherwise would
+ be branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
jne L(zero)
# endif
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
# ifndef USE_AS_STRCHRNUL
L(zero):
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
incq %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 8
L(first_vec_x3):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x4):
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE - 1. This is the same number of
L(cross_page_continue):
/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- vmovdqa 1(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x4)
- /* Align data to VEC_SIZE * 4 - 1. */
- addq $(VEC_SIZE * 4 + 1), %rdi
- andq $-(VEC_SIZE * 4), %rdi
+ /* Align data to VEC_SIZE * 4 - 1. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa (VEC_SIZE)(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ vmovdqa 1(%rdi), %ymm6
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
/* Leaves only CHARS matching esi as 0. */
- vpxor %ymm5, %ymm0, %ymm1
vpxor %ymm6, %ymm0, %ymm2
vpxor %ymm7, %ymm0, %ymm3
- vpxor %ymm8, %ymm0, %ymm4
- VPMINU %ymm1, %ymm5, %ymm1
VPMINU %ymm2, %ymm6, %ymm2
VPMINU %ymm3, %ymm7, %ymm3
- VPMINU %ymm4, %ymm8, %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+ vpxor %ymm6, %ymm0, %ymm4
+ vpxor %ymm7, %ymm0, %ymm5
+
+ VPMINU %ymm4, %ymm6, %ymm4
+ VPMINU %ymm5, %ymm7, %ymm5
- VPMINU %ymm5, %ymm6, %ymm6
+ VPMINU %ymm2, %ymm3, %ymm6
+ VPMINU %ymm4, %ymm5, %ymm7
- VPCMPEQ %ymm6, %ymm9, %ymm6
- vpmovmskb %ymm6, %ecx
+ VPMINU %ymm6, %ymm7, %ymm7
+
+ VPCMPEQ %ymm7, %ymm1, %ymm7
+ vpmovmskb %ymm7, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
-
- VPCMPEQ %ymm1, %ymm9, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_x0)
- VPCMPEQ %ymm5, %ymm9, %ymm2
- vpmovmskb %ymm2, %eax
+ VPCMPEQ %ymm3, %ymm1, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMPEQ %ymm3, %ymm9, %ymm3
- vpmovmskb %ymm3, %eax
+ VPCMPEQ %ymm4, %ymm1, %ymm4
+ vpmovmskb %ymm4, %eax
/* rcx has combined result from all 4 VEC. It will only be used
if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
- subq $(VEC_SIZE * 2), %rdi
+ subq $(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x0):
- tzcntl %eax, %eax
- addq $-(VEC_SIZE * 4), %rdi
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
addq %rdi, %rax
VZEROUPPER_RETURN
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- VZEROUPPER_RETURN
-# endif
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1):
tzcntl %eax, %eax
- subq $(VEC_SIZE * 3), %rdi
+ subq $(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod edx. */
sarxl %edx, %eax, %eax
xorl %ecx, %ecx
/* Found CHAR or the null byte. */
cmp (%rdx, %rax), %CHAR_REG
- leaq (%rdx, %rax), %rax
- cmovne %rcx, %rax
-# else
- addq %rdx, %rax
+ jne L(zero_end)
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ addq %rdx, %rax
+ VZEROUPPER_RETURN
END (STRCHR)
-# endif
+#endif
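
A minimal C sketch (editorial illustration, not part of the patch) of the per-byte xor/min trick used in the 4x loop above: data ^ CHAR is zero exactly where data == CHAR, and the unsigned minimum of that with data is zero where either CHAR or the null terminator occurs, so a single compare against zero finds both. The function name is hypothetical.

    #include <stdint.h>

    static int
    matches_char_or_nul (uint8_t byte, uint8_t c)
    {
      uint8_t x = byte ^ c;                /* 0 iff byte == c.  */
      uint8_t m = x < byte ? x : byte;     /* vpminub: 0 iff byte == c or byte == 0.  */
      return m == 0;
    }
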
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_REG esi
# define SHIFT_REG ecx
# else
# define VPBROADCAST vpbroadcastb
# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_REG sil
# define SHIFT_REG edx
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
VPBROADCAST %esi, %YMM0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
/* Check if we cross page boundary with one vector load.
Otherwise it is safe to use an unaligned load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch a branch miss here should save what otherwise would
+ be branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
+ jne L(zero)
+# endif
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of bytes.
*/
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rax), %CHAR_REG
- jne L(zero)
# endif
ret
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x3):
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero)
-# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
- ret
-# ifndef USE_AS_STRCHRNUL
-L(zero):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x4):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax guaranteed non-zero.  */
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
+ .p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax guaranteed non-zero.  */
+ bsfl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero)
+# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE. */
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x2)
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x4)
VPMINU %YMM3, %YMM4, %YMM4
VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
- VPCMP $0, %YMMZERO, %YMM4, %k1
+ VPTESTN %YMM4, %YMM4, %k1
kmovd %k1, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
- VPCMP $0, %YMMZERO, %YMM1, %k0
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
- VPCMP $0, %YMMZERO, %YMM3, %k0
+ VPTESTN %YMM3, %YMM3, %k0
kmovd %k0, %eax
/* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
# ifdef USE_AS_WCSCHR
sall $8, %ecx
orl %ecx, %eax
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# else
salq $32, %rcx
orq %rcx, %rax
- tzcntq %rax, %rax
+ bsfq %rax, %rax
# endif
# ifndef USE_AS_STRCHRNUL
/* Check if match was CHAR or null. */
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- ret
+ .p2align 4,, 8
+L(last_vec_x1):
+ bsfl %eax, %eax
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
- .p2align 4
-L(last_vec_x1):
- tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ cmp (%rax), %CHAR_REG
jne L(zero_end)
# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
+
ret
- .p2align 4
+ .p2align 4,, 8
L(last_vec_x2):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
ret
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi. */
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
- /* Remove the leading bits. */
+ /* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
movl %edx, %SHIFT_REG
/* NB: Divide shift count by 4 since each bit in K1 represent 4
/* If eax is zero continue. */
testl %eax, %eax
jz L(cross_page_continue)
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Check to see if match was CHAR or null. */
- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero_end)
-# endif
+ bsfl %eax, %eax
+
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of
bytes. */
leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
addq %rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if match was CHAR or null. */
+ cmp (%rax), %CHAR_REG
+ je L(cross_page_ret)
+L(zero_end):
+ xorl %eax, %eax
+L(cross_page_ret):
# endif
ret
END (STRCHR)
-# endif
+#endif
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
# define PAGE_SIZE 4096
-/* VEC_SIZE = Number of bytes in a ymm register */
+ /* VEC_SIZE = Number of bytes in a ymm register. */
# define VEC_SIZE 32
-/* Shift for dividing by (VEC_SIZE * 4). */
-# define DIVIDE_BY_VEC_4_SHIFT 7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
+# define VMOVU vmovdqu
+# define VMOVA vmovdqa
# ifdef USE_AS_WCSCMP
-/* Compare packed dwords. */
+ /* Compare packed dwords. */
# define VPCMPEQ vpcmpeqd
-/* Compare packed dwords and store minimum. */
+ /* Compare packed dwords and store minimum. */
# define VPMINU vpminud
-/* 1 dword char == 4 bytes. */
+ /* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
-/* Compare packed bytes. */
+ /* Compare packed bytes. */
# define VPCMPEQ vpcmpeqb
-/* Compare packed bytes and store minimum. */
+ /* Compare packed bytes and store minimum. */
# define VPMINU vpminub
-/* 1 byte char == 1 byte. */
+ /* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
+# ifdef USE_AS_STRNCMP
+# define LOOP_REG r9d
+# define LOOP_REG64 r9
+
+# define OFFSET_REG8 r9b
+# define OFFSET_REG r9d
+# define OFFSET_REG64 r9
+# else
+# define LOOP_REG edx
+# define LOOP_REG64 rdx
+
+# define OFFSET_REG8 dl
+# define OFFSET_REG edx
+# define OFFSET_REG64 rdx
+# endif
+
# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif
+# if defined USE_AS_STRNCMP
+# define VEC_OFFSET 0
+# else
+# define VEC_OFFSET (-VEC_SIZE)
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
+# define xmmZERO xmm15
+# define ymmZERO ymm15
+
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+	/* r11 is never used elsewhere so this is safe to maintain.  */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
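A minimal C sketch (editorial illustration, not part of the patch) of the case-folding range check behind the LCASE_MIN/LCASE_MAX/CASE_ADD constants used by TOLOWER above (per-byte values 0x3f, 0x99 and 0x20, loaded further down): adding 0x3f moves 'A'..'Z', and only those bytes, into the signed-byte range [0x80, 0x99], so one signed compare decides whether 0x20 is added, just as the vpaddb/vpcmpgtb/vpandn/vpaddb sequence does. The function name is hypothetical.

    #include <stdint.h>

    static uint8_t
    tolower_ascii_model (uint8_t c)
    {
      /* paddb wraps modulo 256; 'A' + 0x3f == 0x80, 'Z' + 0x3f == 0x99.  */
      int8_t shifted = (int8_t) (uint8_t) (c + 0x3f);
      uint8_t add = shifted <= (int8_t) 0x99 ? 0x20 : 0;   /* CASE_ADD.  */
      return (uint8_t) (c + add);
    }
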
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
the maximum offset is reached before a difference is found, zero is
returned. */
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRCMP)
+ .section SECTION(.text), "ax", @progbits
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
- /* Check for simple cases (0 or 1) in offset. */
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
cmp $1, %RDX_LP
- je L(char0)
- jb L(zero)
+ /* Signed comparison intentional. We use this branch to also
+ test cases where length >= 2^63. These very large sizes can be
+ handled with strcmp as there is no way for that length to
+ actually bound the buffer. */
+ jle L(one_or_less)
# ifdef USE_AS_WCSCMP
-# ifndef __ILP32__
movq %rdx, %rcx
- /* Check if length could overflow when multiplied by
- sizeof(wchar_t). Checking top 8 bits will cover all potential
- overflow cases as well as redirect cases where its impossible to
- length to bound a valid memory region. In these cases just use
- 'wcscmp'. */
+
+ /* Multiplying length by sizeof(wchar_t) can result in overflow.
+	   Check if that is possible. All cases where overflow is possible
+ are cases where length is large enough that it can never be a
+ bound on valid memory so just use wcscmp. */
shrq $56, %rcx
- jnz __wcscmp_avx2
-# endif
- /* Convert units: from wide to byte char. */
- shl $2, %RDX_LP
+ jnz OVERFLOW_STRCMP
+
+ leaq (, %rdx, 4), %rdx
# endif
- /* Register %r11 tracks the maximum offset. */
- mov %RDX_LP, %R11_LP
+# endif
+ vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
# endif
movl %edi, %eax
- xorl %edx, %edx
- /* Make %xmm7 (%ymm7) all zeros in this function. */
- vpxor %xmm7, %xmm7, %xmm7
orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
- jg L(cross_page)
- /* Start comparing 4 vectors. */
- vmovdqu (%rdi), %ymm1
- VPCMPEQ (%rsi), %ymm1, %ymm0
- VPMINU %ymm1, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm0, %ymm0
- vpmovmskb %ymm0, %ecx
- testl %ecx, %ecx
- je L(next_3_vectors)
- tzcntl %ecx, %edx
+ sall $20, %eax
+ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
+ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+ ja L(page_cross)
+
+L(no_page_cross):
+ /* Safe to compare 4x vectors. */
+ VMOVU (%rdi), %ymm0
+	/* 1s where s1 and s2 are equal.  Just VPCMPEQ if it's not
+	   strcasecmp.  Otherwise converts ymm0 and the load from rsi to
+	   lowercase.  ymm2 is scratch and ymm1 is the return.  */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+ /* 1s at null CHAR. */
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ /* 1s where s1 and s2 equal AND not null CHAR. */
+ vpandn %ymm1, %ymm2, %ymm1
+
+ /* All 1s -> keep going, any 0s -> return. */
+ vpmovmskb %ymm1, %ecx
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx) is after the maximum
- offset (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $VEC_SIZE, %rdx
+ jbe L(vec_0_test_len)
# endif
+
+	/* All 1s means all CHARs were equal (and non-null).  incl will
+	   overflow to zero in the all-equal case.  Otherwise the carry
+	   from incl stops at the position of the first mismatch.  */
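+	/* For example, ecx == 0xffffffff (all 32 bytes equal and
+	   non-null) wraps to 0, while ecx == 0x00ffffff (first mismatch
+	   or null at byte 24) becomes 0x01000000, so the tzcnt in
+	   L(return_vec_0) returns 24.  */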
+ incl %ecx
+ jz L(more_3x_vec)
+
+ .p2align 4,, 4
+L(return_vec_0):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rcx), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- je L(return)
-L(wcscmp_return):
+ cmpl (%rsi, %rcx), %edx
+ je L(ret0)
setl %al
negl %eax
orl $1, %eax
-L(return):
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
+L(ret0):
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
-L(return_vec_size):
- tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
- the maximum offset (%r11). */
- addq $VEC_SIZE, %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+ .p2align 4,, 8
+L(vec_0_test_len):
+ notl %ecx
+ bzhil %edx, %ecx, %eax
+ jnz L(return_vec_0)
+	/* Align if it will cross a fetch block.  */
+ .p2align 4,, 2
+L(ret_zero):
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 5
+L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
# endif
-# else
+ jb L(ret_zero)
+ /* 'nbe' covers the case where length is negative (large
+ unsigned). */
+ jnbe OVERFLOW_STRCMP
# ifdef USE_AS_WCSCMP
+ movl (%rdi), %edx
xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rdx), %ecx
- cmpl VEC_SIZE(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (%rsi), %edx
+ je L(ret1)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl VEC_SIZE(%rdi, %rdx), %eax
- movzbl VEC_SIZE(%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
+L(ret1):
+ ret
# endif
- VZEROUPPER_RETURN
- .p2align 4
-L(return_2_vec_size):
- tzcntl %ecx, %edx
+ .p2align 4,, 10
+L(return_vec_1):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 2), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+	/* rdx must be > CHAR_PER_VEC so it is safe to subtract without
+	   fear of overflow.  */
+ addq $-VEC_SIZE, %rdx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ movl VEC_SIZE(%rdi, %rcx), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ cmpl VEC_SIZE(%rsi, %rcx), %edx
+ je L(ret2)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
+L(ret2):
VZEROUPPER_RETURN
- .p2align 4
-L(return_3_vec_size):
- tzcntl %ecx, %edx
+ .p2align 4,, 10
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 3), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+L(return_vec_3):
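+	/* Shift the VEC 3 mask into bits [63:32] so that the
+	   fall-through tzcntq in L(return_vec_2) yields the mismatch
+	   index plus VEC_SIZE, making the shared (VEC_SIZE * 2) offset
+	   below address VEC 3.  */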
+ salq $32, %rcx
+# endif
+
+L(return_vec_2):
+# ifndef USE_AS_STRNCMP
+ tzcntl %ecx, %ecx
+# else
+ tzcntq %rcx, %rcx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ je L(ret3)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+# endif
+L(ret3):
+ VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+ .p2align 4,, 10
+L(return_vec_3):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ je L(ret4)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
-# endif
+L(ret4):
VZEROUPPER_RETURN
+# endif
+
+ .p2align 4,, 10
+L(more_3x_vec):
+ /* Safe to compare 4x vectors. */
+ VMOVU VEC_SIZE(%rdi), %ymm0
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_1)
- .p2align 4
-L(next_3_vectors):
- vmovdqu VEC_SIZE(%rdi), %ymm6
- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
- VPMINU %ymm6, %ymm3, %ymm3
- VPCMPEQ %ymm7, %ymm3, %ymm3
- vpmovmskb %ymm3, %ecx
- testl %ecx, %ecx
- jne L(return_vec_size)
- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
- VPMINU %ymm5, %ymm2, %ymm2
- VPCMPEQ %ymm4, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm2, %ymm2
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jne L(return_2_vec_size)
- VPMINU %ymm4, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm0, %ymm0
- vpmovmskb %ymm0, %ecx
- testl %ecx, %ecx
- jne L(return_3_vec_size)
-L(main_loop_header):
- leaq (VEC_SIZE * 4)(%rdi), %rdx
- movl $PAGE_SIZE, %ecx
- /* Align load via RAX. */
- andq $-(VEC_SIZE * 4), %rdx
- subq %rdi, %rdx
- leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
- /* Starting from this point, the maximum offset, or simply the
- 'offset', DECREASES by the same amount when base pointers are
- moved forward. Return 0 when:
- 1) On match: offset <= the matched vector index.
- 2) On mistmach, offset is before the mistmatched index.
+ subq $(VEC_SIZE * 2), %rdx
+ jbe L(ret_zero)
+# endif
+
+ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_2)
+
+ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
- subq %rdx, %r11
- jbe L(zero)
-# endif
- addq %rsi, %rdx
- movq %rdx, %rsi
- andl $(PAGE_SIZE - 1), %esi
- /* Number of bytes before page crossing. */
- subq %rsi, %rcx
- /* Number of VEC_SIZE * 4 blocks before page crossing. */
- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
- movl %ecx, %esi
- jmp L(loop_start)
+ movl $2, %r8d
+
+# else
+ xorl %r8d, %r8d
+# endif
+
+ /* The prepare labels are various entry points from the page
+ cross logic. */
+L(prepare_loop):
+
+# ifdef USE_AS_STRNCMP
+	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
+ the loop. */
+ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
+# endif
+L(prepare_loop_no_len):
+
+ /* Align s1 and adjust s2 accordingly. */
+ subq %rdi, %rsi
+ andq $-(VEC_SIZE * 4), %rdi
+ addq %rdi, %rsi
+
+# ifdef USE_AS_STRNCMP
+ subq %rdi, %rdx
+# endif
+L(prepare_loop_aligned):
+ /* eax stores distance from rsi to next page cross. These cases
+ need to be handled specially as the 4x loop could potentially
+ read memory past the length of s1 or s2 and across a page
+ boundary. */
+ movl $-(VEC_SIZE * 4), %eax
+ subl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+
+ /* Loop 4x comparisons at a time. */
.p2align 4
L(loop):
+
+ /* End condition for strncmp. */
# ifdef USE_AS_STRNCMP
- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
- the maximum offset (%r11) by the same amount. */
- subq $(VEC_SIZE * 4), %r11
- jbe L(zero)
-# endif
- addq $(VEC_SIZE * 4), %rax
- addq $(VEC_SIZE * 4), %rdx
-L(loop_start):
- testl %esi, %esi
- leal -1(%esi), %esi
- je L(loop_cross_page)
-L(back_to_loop):
- /* Main loop, comparing 4 vectors are a time. */
- vmovdqa (%rax), %ymm0
- vmovdqa VEC_SIZE(%rax), %ymm3
- VPCMPEQ (%rdx), %ymm0, %ymm4
- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
- VPMINU %ymm0, %ymm4, %ymm4
- VPMINU %ymm3, %ymm1, %ymm1
- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
- VPMINU %ymm1, %ymm4, %ymm0
- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
- VPMINU %ymm2, %ymm5, %ymm5
- VPMINU %ymm3, %ymm6, %ymm6
- VPMINU %ymm5, %ymm0, %ymm0
- VPMINU %ymm6, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm0, %ymm0
-
- /* Test each mask (32 bits) individually because for VEC_SIZE
- == 32 is not possible to OR the four masks and keep all bits
- in a 64-bit integer register, differing from SSE2 strcmp
- where ORing is possible. */
- vpmovmskb %ymm0, %ecx
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(ret_zero)
+# endif
+
+ subq $-(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rsi
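+	/* NB: subtracting -(VEC_SIZE * 4) == -128 lets the two subq
+	   above use a sign-extended 8-bit immediate, whereas adding
+	   +128 would need a 32-bit immediate.  */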
+
+ /* Check if rsi loads will cross a page boundary. */
+ addl $-(VEC_SIZE * 4), %eax
+ jnb L(page_cross_during_loop)
+
+ /* Loop entry after handling page cross during loop. */
+L(loop_skip_page_cross_check):
+ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0
+ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2
+ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
+ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
+
+ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+
+	/* Where there is a mismatch or a null CHAR the result is a 0
+	   CHAR, otherwise it is non-zero.  */
+ vpand %ymm0, %ymm1, %ymm1
+
+
+ vpand %ymm2, %ymm3, %ymm3
+ vpand %ymm4, %ymm5, %ymm5
+ vpand %ymm6, %ymm7, %ymm7
+
+ VPMINU %ymm1, %ymm3, %ymm3
+ VPMINU %ymm5, %ymm7, %ymm7
+
+ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */
+ VPMINU %ymm3, %ymm7, %ymm7
+
+ /* If any 0 CHAR then done. */
+ VPCMPEQ %ymm7, %ymmZERO, %ymm7
+ vpmovmskb %ymm7, %LOOP_REG
+ testl %LOOP_REG, %LOOP_REG
+ jz L(loop)
+
+	/* Find which VEC has the mismatch or end of string.  */
+ VPCMPEQ %ymm1, %ymmZERO, %ymm1
+ vpmovmskb %ymm1, %ecx
testl %ecx, %ecx
- je L(loop)
- VPCMPEQ %ymm7, %ymm4, %ymm0
- vpmovmskb %ymm0, %edi
- testl %edi, %edi
- je L(test_vec)
- tzcntl %edi, %ecx
+ jnz L(return_vec_0_end)
+
+
+ VPCMPEQ %ymm3, %ymmZERO, %ymm3
+ vpmovmskb %ymm3, %ecx
+ testl %ecx, %ecx
+ jnz L(return_vec_1_end)
+
+L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
- cmpq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ subq $(VEC_SIZE * 2), %rdx
+ jbe L(ret_zero_end)
+# endif
+
+ VPCMPEQ %ymm5, %ymmZERO, %ymm5
+ vpmovmskb %ymm5, %ecx
+ testl %ecx, %ecx
+ jnz L(return_vec_2_end)
+
+	/* LOOP_REG contains matches for null/mismatch from the loop.
+	   If VEC 0, 1, and 2 all have no null and no mismatches then the
+	   mismatch must be entirely from VEC 3, which is fully
+	   represented by LOOP_REG.  */
+ tzcntl %LOOP_REG, %LOOP_REG
+
+# ifdef USE_AS_STRNCMP
+ subl $-(VEC_SIZE), %LOOP_REG
+ cmpq %LOOP_REG64, %rdx
+ jbe L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ je L(ret5)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
+ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
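+	/* With %r8d == 0 the xor/sub pair below is a no-op; with
+	   %r8d == -1 it computes ~eax + 1 == -eax, negating the result
+	   because the page cross code swapped rdi and rsi.  */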
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret5):
VZEROUPPER_RETURN
- .p2align 4
-L(test_vec):
# ifdef USE_AS_STRNCMP
- /* The first vector matched. Return 0 if the maximum offset
- (%r11) <= VEC_SIZE. */
- cmpq $VEC_SIZE, %r11
- jbe L(zero)
+ .p2align 4,, 2
+L(ret_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
# endif
- VPCMPEQ %ymm7, %ymm1, %ymm1
- vpmovmskb %ymm1, %ecx
- testl %ecx, %ecx
- je L(test_2_vec)
- tzcntl %ecx, %edi
+
+
+	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
+	   that they use the value of `r8` to negate the return value.
+	   This is because the page cross logic can swap `rdi` and
+	   `rsi`.  */
+ .p2align 4,, 10
# ifdef USE_AS_STRNCMP
- addq $VEC_SIZE, %rdi
- cmpq %rdi, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+L(return_vec_1_end):
+ salq $32, %rcx
+# endif
+L(return_vec_0_end):
+# ifndef USE_AS_STRNCMP
+ tzcntl %ecx, %ecx
+# else
+ tzcntq %rcx, %rcx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rcx), %edx
xorl %eax, %eax
- movl (%rsi, %rdi), %ecx
- cmpl (%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rdi), %eax
- movzbl (%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
+ cmpl (%rsi, %rcx), %edx
+ je L(ret6)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
+# endif
+L(ret6):
+ VZEROUPPER_RETURN
+
+# ifndef USE_AS_STRNCMP
+ .p2align 4,, 10
+L(return_vec_1_end):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ movl VEC_SIZE(%rdi, %rcx), %edx
xorl %eax, %eax
- movl VEC_SIZE(%rsi, %rdi), %ecx
- cmpl VEC_SIZE(%rdx, %rdi), %ecx
- jne L(wcscmp_return)
+ cmpl VEC_SIZE(%rsi, %rcx), %edx
+ je L(ret7)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
- movzbl VEC_SIZE(%rax, %rdi), %eax
- movzbl VEC_SIZE(%rdx, %rdi), %edx
- subl %edx, %eax
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
-# endif
+L(ret7):
VZEROUPPER_RETURN
+# endif
- .p2align 4
-L(test_2_vec):
+ .p2align 4,, 10
+L(return_vec_2_end):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
- /* The first 2 vectors matched. Return 0 if the maximum offset
- (%r11) <= 2 * VEC_SIZE. */
- cmpq $(VEC_SIZE * 2), %r11
- jbe L(zero)
+ cmpq %rcx, %rdx
+ jbe L(ret_zero_page_cross)
# endif
- VPCMPEQ %ymm7, %ymm5, %ymm5
- vpmovmskb %ymm5, %ecx
- testl %ecx, %ecx
- je L(test_3_vec)
- tzcntl %ecx, %edi
-# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 2), %rdi
- cmpq %rdi, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
xorl %eax, %eax
- movl (%rsi, %rdi), %ecx
- cmpl (%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rdi), %eax
- movzbl (%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ je L(ret11)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret11):
VZEROUPPER_RETURN
- .p2align 4
-L(test_3_vec):
+
+ /* Page cross in rsi in next 4x VEC. */
+
+ /* TODO: Improve logic here. */
+ .p2align 4,, 10
+L(page_cross_during_loop):
+ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
+
+	/* Optimistically rsi and rdi are both aligned, in which case we
+	   don't need any logic here.  */
+ cmpl $-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to the loop, so we will
+	   never hit the page cross case again.  */
+ je L(loop_skip_page_cross_check)
+
+ /* Check if we can safely load a VEC. */
+ cmpl $-(VEC_SIZE * 3), %eax
+ jle L(less_1x_vec_till_page_cross)
+
+ VMOVA (%rdi), %ymm0
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_0_end)
+
+ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
+ cmpl $-(VEC_SIZE * 2), %eax
+ jg L(more_2x_vec_till_page_cross)
+
+ .p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+ subl $-(VEC_SIZE * 4), %eax
+	/* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
+	   concerning case is the first iteration if the incoming s1 was
+	   near the start of a page and s2 near the end.  If s1 was near
+	   the start of the page we already aligned up to the nearest
+	   VEC_SIZE * 4, so it is guaranteed safe to read back -VEC_SIZE.
+	   If rdi is truly at the start of a page here, it means the
+	   previous page (rdi - VEC_SIZE) has already been loaded earlier
+	   so it must be valid.  */
+ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+
+ /* Mask of potentially valid bits. The lower bits can be out of
+ range comparisons (but safe regarding page crosses). */
+ movl $-1, %r10d
+ shlxl %esi, %r10d, %r10d
+ notl %ecx
+
# ifdef USE_AS_STRNCMP
- /* The first 3 vectors matched. Return 0 if the maximum offset
- (%r11) <= 3 * VEC_SIZE. */
- cmpq $(VEC_SIZE * 3), %r11
- jbe L(zero)
-# endif
- VPCMPEQ %ymm7, %ymm6, %ymm6
- vpmovmskb %ymm6, %esi
- tzcntl %esi, %ecx
+ cmpq %rax, %rdx
+ jbe L(return_page_cross_end_check)
+# endif
+ movl %eax, %OFFSET_REG
+ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+ andl %r10d, %ecx
+ jz L(loop_skip_page_cross_check)
+
+ .p2align 4,, 3
+L(return_page_cross_end):
+ tzcntl %ecx, %ecx
+
# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 3), %rcx
- cmpq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %esi
- cmpl (%rdx, %rcx), %esi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
+L(return_page_cross_cmp_mem):
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ addl %OFFSET_REG, %ecx
+# endif
+# ifdef USE_AS_WCSCMP
+ movl VEC_OFFSET(%rdi, %rcx), %edx
xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ cmpl VEC_OFFSET(%rsi, %rcx), %edx
+ je L(ret8)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
+# else
+ movzbl VEC_OFFSET(%rdi, %rcx), %eax
+ movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret8):
VZEROUPPER_RETURN
- .p2align 4
-L(loop_cross_page):
- xorl %r10d, %r10d
- movq %rdx, %rcx
- /* Align load via RDX. We load the extra ECX bytes which should
- be ignored. */
- andl $((VEC_SIZE * 4) - 1), %ecx
- /* R10 is -RCX. */
- subq %rcx, %r10
-
- /* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-# error (VEC_SIZE * 2) != 64
-# endif
-
- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
- cmpl $(VEC_SIZE * 2), %ecx
- jge L(loop_cross_page_2_vec)
-
- vmovdqu (%rax, %r10), %ymm2
- vmovdqu VEC_SIZE(%rax, %r10), %ymm3
- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
- VPMINU %ymm2, %ymm0, %ymm0
- VPMINU %ymm3, %ymm1, %ymm1
- VPCMPEQ %ymm7, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm1, %ymm1
-
- vpmovmskb %ymm0, %edi
- vpmovmskb %ymm1, %esi
-
- salq $32, %rsi
- xorq %rsi, %rdi
-
- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
- shrq %cl, %rdi
-
- testq %rdi, %rdi
- je L(loop_cross_page_2_vec)
- tzcntq %rdi, %rcx
# ifdef USE_AS_STRNCMP
- cmpq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ .p2align 4,, 10
+L(return_page_cross_end_check):
+ andl %r10d, %ecx
+ tzcntl %ecx, %ecx
+ leal -VEC_SIZE(%rax, %rcx), %ecx
+ cmpl %ecx, %edx
+ ja L(return_page_cross_cmp_mem)
xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# endif
VZEROUPPER_RETURN
+# endif
- .p2align 4
-L(loop_cross_page_2_vec):
- /* The first VEC_SIZE * 2 bytes match or are ignored. */
- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
- VPMINU %ymm2, %ymm5, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
- VPCMPEQ %ymm7, %ymm5, %ymm5
- VPMINU %ymm3, %ymm6, %ymm6
- VPCMPEQ %ymm7, %ymm6, %ymm6
-
- vpmovmskb %ymm5, %edi
- vpmovmskb %ymm6, %esi
-
- salq $32, %rsi
- xorq %rsi, %rdi
- xorl %r8d, %r8d
- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
- subl $(VEC_SIZE * 2), %ecx
- jle 1f
- /* Skip ECX bytes. */
- shrq %cl, %rdi
- /* R8 has number of bytes skipped. */
- movl %ecx, %r8d
-1:
- /* Before jumping back to the loop, set ESI to the number of
- VEC_SIZE * 4 blocks before page crossing. */
- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
-
- testq %rdi, %rdi
+ .p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If there are more than 2x VEC till the page cross we will
+	   complete a full loop iteration here.  */
+
+ VMOVU VEC_SIZE(%rdi), %ymm0
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_1_end)
+
# ifdef USE_AS_STRNCMP
- /* At this point, if %rdi value is 0, it already tested
- VEC_SIZE*4+%r10 byte starting from %rax. This label
- checks whether strncmp maximum offset reached or not. */
- je L(string_nbyte_offset_check)
-# else
- je L(back_to_loop)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(ret_zero_in_loop_page_cross)
# endif
- tzcntq %rdi, %rcx
- addq %r10, %rcx
- /* Adjust for number of bytes skipped. */
- addq %r8, %rcx
+
+ subl $-(VEC_SIZE * 4), %eax
+
+ /* Safe to include comparisons from lower bytes. */
+ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_page_cross_0)
+
+ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+ jnz L(return_vec_page_cross_1)
+
# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 2), %rcx
- subq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+	/* Must check length here as the length might preclude reading
+	   the next page.  */
+ cmpq %rax, %rdx
+ jbe L(ret_zero_in_loop_page_cross)
+# endif
+
+ /* Finish the loop. */
+ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
+ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
+
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
+ vpand %ymm4, %ymm5, %ymm5
+ vpand %ymm6, %ymm7, %ymm7
+ VPMINU %ymm5, %ymm7, %ymm7
+ VPCMPEQ %ymm7, %ymmZERO, %ymm7
+ vpmovmskb %ymm7, %LOOP_REG
+ testl %LOOP_REG, %LOOP_REG
+ jnz L(return_vec_2_3_end)
+
+	/* Best for code size to include an unconditional jump here.  If
+	   this case were hot it would be faster to duplicate the
+	   L(return_vec_2_3_end) code as the fall-through and jump back
+	   to the loop on a mismatching comparison.  */
+ subq $-(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ VZEROUPPER_RETURN
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ jmp L(loop_skip_page_cross_check)
# endif
- VZEROUPPER_RETURN
+
+ .p2align 4,, 10
+L(return_vec_page_cross_0):
+ addl $-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_STRNCMP
-L(string_nbyte_offset_check):
- leaq (VEC_SIZE * 4)(%r10), %r10
- cmpq %r10, %r11
- jbe L(zero)
- jmp L(back_to_loop)
+ leal -VEC_SIZE(%rax, %rcx), %ecx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero_in_loop_page_cross)
+# else
+ addl %eax, %ecx
# endif
- .p2align 4
-L(cross_page_loop):
- /* Check one byte/dword at a time. */
# ifdef USE_AS_WCSCMP
- cmpl %ecx, %eax
+ movl VEC_OFFSET(%rdi, %rcx), %edx
+ xorl %eax, %eax
+ cmpl VEC_OFFSET(%rsi, %rcx), %edx
+ je L(ret9)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
+ movzbl VEC_OFFSET(%rdi, %rcx), %eax
+ movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
- jne L(different)
- addl $SIZE_OF_CHAR, %edx
- cmpl $(VEC_SIZE * 4), %edx
- je L(main_loop_header)
-# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
+L(ret9):
+ VZEROUPPER_RETURN
+
+
+ .p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+ /* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where the stop condition is guaranteed to be
+ reachable by just reading memory. */
+ testl $((VEC_SIZE - 1) << 20), %eax
+ jz L(no_page_cross)
# endif
+
+ movl %edi, %eax
+ movl %esi, %ecx
+ andl $(PAGE_SIZE - 1), %eax
+ andl $(PAGE_SIZE - 1), %ecx
+
+ xorl %OFFSET_REG, %OFFSET_REG
+
+ /* Check which is closer to page cross, s1 or s2. */
+ cmpl %eax, %ecx
+ jg L(page_cross_s2)
+
+ /* The previous page cross check has false positives. Check for
+ true positive as page cross logic is very expensive. */
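+	/* We only reach here when s1's page offset (eax) is >= s2's
+	   (ecx), so if the next 4x VEC loads from s1 stay within the
+	   page then so do the loads from s2.  */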
+ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
+ jbe L(no_page_cross)
+
+ /* Set r8 to not interfere with normal return value (rdi and rsi
+ did not swap). */
# ifdef USE_AS_WCSCMP
- movl (%rdi, %rdx), %eax
- movl (%rsi, %rdx), %ecx
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
+ movl $2, %r8d
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
+ xorl %r8d, %r8d
# endif
- /* Check null char. */
- testl %eax, %eax
- jne L(cross_page_loop)
- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
- comparisons. */
- subl %ecx, %eax
-# ifndef USE_AS_WCSCMP
-L(different):
+
+ /* Check if less than 1x VEC till page cross. */
+ subl $(VEC_SIZE * 3), %eax
+ jg L(less_1x_vec_till_page)
+
+	/* If more than 1x VEC till page cross, loop through safely
+	   loadable memory until within 1x VEC of the page cross.  */
+
+ .p2align 4,, 10
+L(page_cross_loop):
+
+ VMOVU (%rdi, %OFFSET_REG64), %ymm0
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+ incl %ecx
+
+ jnz L(check_ret_vec_page_cross)
+ addl $VEC_SIZE, %OFFSET_REG
+# ifdef USE_AS_STRNCMP
+ cmpq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross)
# endif
- VZEROUPPER_RETURN
+ addl $VEC_SIZE, %eax
+ jl L(page_cross_loop)
+
+ subl %eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE.  Guaranteed
+	   to not cross the page so it is safe to load.  Since we have
+	   already loaded at least 1 VEC from rsi it is also guaranteed
+	   to be safe.  */
+
+ VMOVU (%rdi, %OFFSET_REG64), %ymm0
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
+ VPCMPEQ %ymm0, %ymmZERO, %ymm2
+ vpandn %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %ecx
+
+# ifdef USE_AS_STRNCMP
+ leal VEC_SIZE(%OFFSET_REG64), %eax
+ cmpq %rax, %rdx
+ jbe L(check_ret_vec_page_cross2)
+ addq %rdi, %rdx
+# endif
+ incl %ecx
+ jz L(prepare_loop_no_len)
+ .p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
+# endif
+ tzcntl %ecx, %ecx
+ addl %OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
- .p2align 4
-L(different):
- /* Use movl to avoid modifying EFLAGS. */
- movl $0, %eax
+ movl (%rdi, %rcx), %edx
+ xorl %eax, %eax
+ cmpl (%rsi, %rcx), %edx
+ je L(ret12)
setl %al
negl %eax
- orl $1, %eax
- VZEROUPPER_RETURN
+ xorl %r8d, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret12):
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNCMP
- .p2align 4
-L(zero):
+ .p2align 4,, 10
+L(check_ret_vec_page_cross2):
+ incl %ecx
+L(check_ret_vec_page_cross):
+ tzcntl %ecx, %ecx
+ addl %OFFSET_REG, %ecx
+ cmpq %rcx, %rdx
+ ja L(ret_vec_page_cross_cont)
+ .p2align 4,, 2
+L(ret_zero_page_cross):
xorl %eax, %eax
VZEROUPPER_RETURN
+# endif
- .p2align 4
-L(char0):
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
-# endif
- VZEROUPPER_RETURN
+ .p2align 4,, 4
+L(page_cross_s2):
+ /* Ensure this is a true page cross. */
+ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
+ jbe L(no_page_cross)
+
+
+ movl %ecx, %eax
+ movq %rdi, %rcx
+ movq %rsi, %rdi
+ movq %rcx, %rsi
+
+	/* Set r8 to negate the return value, as rdi and rsi were swapped.  */
+# ifdef USE_AS_WCSCMP
+ movl $-4, %r8d
+# else
+ movl $-1, %r8d
# endif
+ xorl %OFFSET_REG, %OFFSET_REG
- .p2align 4
-L(last_vector):
- addq %rdx, %rdi
- addq %rdx, %rsi
+ /* Check if more than 1x VEC till page cross. */
+ subl $(VEC_SIZE * 3), %eax
+ jle L(page_cross_loop)
+
+ .p2align 4,, 6
+L(less_1x_vec_till_page):
+ /* Find largest load size we can use. */
+ cmpl $16, %eax
+ ja L(less_16_till_page)
+
+ VMOVU (%rdi), %xmm0
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
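+	/* Only the low 16 bits of the mask are meaningful here (xmm
+	   compare); incw wraps to zero exactly when all 16 bytes are
+	   equal and non-null.  */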
+ incw %cx
+ jnz L(check_ret_vec_page_cross)
+ movl $16, %OFFSET_REG
# ifdef USE_AS_STRNCMP
- subq %rdx, %r11
+ cmpq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subl %eax, %OFFSET_REG
+# else
+ /* Explicit check for 16 byte alignment. */
+ subl %eax, %OFFSET_REG
+ jz L(prepare_loop)
# endif
- tzcntl %ecx, %edx
+
+ VMOVU (%rdi, %OFFSET_REG64), %xmm0
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
+ incw %cx
+ jnz L(check_ret_vec_page_cross)
+
# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
+ addl $16, %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subq $-(VEC_SIZE * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
# endif
-# ifdef USE_AS_WCSCMP
+ jmp L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+ .p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ ret
# endif
- VZEROUPPER_RETURN
- /* Comparing on page boundary region requires special treatment:
- It must done one vector at the time, starting with the wider
- ymm vector if possible, if not, with xmm. If fetching 16 bytes
- (xmm) still passes the boundary, byte comparison must be done.
- */
- .p2align 4
-L(cross_page):
- /* Try one ymm vector at a time. */
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(cross_page_1_vector)
-L(loop_1_vector):
- vmovdqu (%rdi, %rdx), %ymm1
- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
- VPMINU %ymm1, %ymm0, %ymm0
- VPCMPEQ %ymm7, %ymm0, %ymm0
- vpmovmskb %ymm0, %ecx
- testl %ecx, %ecx
- jne L(last_vector)
- addl $VEC_SIZE, %edx
+ .p2align 4,, 10
+L(less_16_till_page):
+ /* Find largest load size we can use. */
+ cmpl $24, %eax
+ ja L(less_8_till_page)
- addl $VEC_SIZE, %eax
-# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
-# endif
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jle L(loop_1_vector)
-L(cross_page_1_vector):
- /* Less than 32 bytes to check, try one xmm vector. */
- cmpl $(PAGE_SIZE - 16), %eax
- jg L(cross_page_1_xmm)
- vmovdqu (%rdi, %rdx), %xmm1
- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
- VPMINU %xmm1, %xmm0, %xmm0
- VPCMPEQ %xmm7, %xmm0, %xmm0
- vpmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- jne L(last_vector)
+ vmovq (%rdi), %xmm0
+ vmovq (%rsi), %xmm1
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
+ incb %cl
+ jnz L(check_ret_vec_page_cross)
- addl $16, %edx
-# ifndef USE_AS_WCSCMP
- addl $16, %eax
+
+# ifdef USE_AS_STRNCMP
+ cmpq $8, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
# endif
+ movl $24, %OFFSET_REG
+ /* Explicit check for 16 byte alignment. */
+ subl %eax, %OFFSET_REG
+
+
+
+ vmovq (%rdi, %OFFSET_REG64), %xmm0
+ vmovq (%rsi, %OFFSET_REG64), %xmm1
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
+ incb %cl
+ jnz L(check_ret_vec_page_cross)
+
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
-# endif
-
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
- /* Less than 16 bytes to check, try 8 byte vector. NB: No need
- for wcscmp nor wcsncmp since wide char is 4 bytes. */
- cmpl $(PAGE_SIZE - 8), %eax
- jg L(cross_page_8bytes)
- vmovq (%rdi, %rdx), %xmm1
- vmovq (%rsi, %rdx), %xmm0
- VPCMPEQ %xmm0, %xmm1, %xmm0
- VPMINU %xmm1, %xmm0, %xmm0
- VPCMPEQ %xmm7, %xmm0, %xmm0
- vpmovmskb %xmm0, %ecx
- /* Only last 8 bits are valid. */
- andl $0xff, %ecx
- testl %ecx, %ecx
- jne L(last_vector)
+ addl $8, %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subq $-(VEC_SIZE * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# endif
+ jmp L(prepare_loop_aligned)
+
- addl $8, %edx
- addl $8, %eax
+ .p2align 4,, 10
+L(less_8_till_page):
+# ifdef USE_AS_WCSCMP
+ /* If using wchar then this is the only check before we reach
+ the page boundary. */
+ movl (%rdi), %eax
+ movl (%rsi), %ecx
+ cmpl %ecx, %eax
+ jnz L(ret_less_8_wcs)
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ addq %rdi, %rdx
+ /* We already checked for len <= 1 so cannot hit that case here.
+ */
# endif
+ testl %eax, %eax
+ jnz L(prepare_loop_no_len)
+ ret
-L(cross_page_8bytes):
- /* Less than 8 bytes to check, try 4 byte vector. */
- cmpl $(PAGE_SIZE - 4), %eax
- jg L(cross_page_4bytes)
- vmovd (%rdi, %rdx), %xmm1
- vmovd (%rsi, %rdx), %xmm0
- VPCMPEQ %xmm0, %xmm1, %xmm0
- VPMINU %xmm1, %xmm0, %xmm0
- VPCMPEQ %xmm7, %xmm0, %xmm0
- vpmovmskb %xmm0, %ecx
- /* Only last 4 bits are valid. */
- andl $0xf, %ecx
- testl %ecx, %ecx
- jne L(last_vector)
+ .p2align 4,, 8
+L(ret_less_8_wcs):
+ setl %OFFSET_REG8
+ negl %OFFSET_REG
+ movl %OFFSET_REG, %eax
+ xorl %r8d, %eax
+ ret
+
+# else
+
+ /* Find largest load size we can use. */
+ cmpl $28, %eax
+ ja L(less_4_till_page)
+
+ vmovd (%rdi), %xmm0
+ vmovd (%rsi), %xmm1
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
+ subl $0xf, %ecx
+ jnz L(check_ret_vec_page_cross)
- addl $4, %edx
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $4, %rdx
+ jbe L(ret_zero_page_cross_slow_case1)
# endif
+ movl $28, %OFFSET_REG
+ /* Explicit check for 16 byte alignment. */
+ subl %eax, %OFFSET_REG
-L(cross_page_4bytes):
-# endif
- /* Less than 4 bytes to check, try one byte/dword at a time. */
-# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
- movl (%rdi, %rdx), %eax
- movl (%rsi, %rdx), %ecx
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
+
+
+ vmovd (%rdi, %OFFSET_REG64), %xmm0
+ vmovd (%rsi, %OFFSET_REG64), %xmm1
+ VPCMPEQ %xmm0, %xmmZERO, %xmm2
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
+ vpandn %xmm1, %xmm2, %xmm1
+ vpmovmskb %ymm1, %ecx
+ subl $0xf, %ecx
+ jnz L(check_ret_vec_page_cross)
+
+# ifdef USE_AS_STRNCMP
+ addl $4, %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case1)
+ subq $-(VEC_SIZE * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# else
+ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
+ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
+# endif
+ jmp L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+ .p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4,, 10
+L(less_4_till_page):
+ subq %rdi, %rsi
+ /* Extremely slow byte comparison loop. */
+L(less_4_loop):
+ movzbl (%rdi), %eax
+ movzbl (%rsi, %rdi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
+ jnz L(ret_less_4_loop)
+ testl %ecx, %ecx
+ jz L(ret_zero_4_loop)
+# ifdef USE_AS_STRNCMP
+ decq %rdx
+ jz L(ret_zero_4_loop)
+# endif
+ incq %rdi
+	/* The end condition is reaching the page boundary (rdi is aligned).  */
+ testl $31, %edi
+ jnz L(less_4_loop)
+ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+ addq $-(VEC_SIZE * 4), %rdi
+# ifdef USE_AS_STRNCMP
+ subq $-(VEC_SIZE * 4), %rdx
+# endif
+ jmp L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+ xorl %eax, %eax
+ ret
+L(ret_less_4_loop):
+ xorl %r8d, %eax
+ subl %r8d, %eax
+ ret
# endif
- testl %eax, %eax
- jne L(cross_page_loop)
- subl %ecx, %eax
- VZEROUPPER_RETURN
-END (STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
#if IS_IN (libc)
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
# ifndef STRCMP
# define STRCMP __strcmp_evex
# define PAGE_SIZE 4096
-/* VEC_SIZE = Number of bytes in a ymm register */
+ /* VEC_SIZE = Number of bytes in a ymm register. */
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
-/* Shift for dividing by (VEC_SIZE * 4). */
-# define DIVIDE_BY_VEC_4_SHIFT 7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSCMP
-/* Compare packed dwords. */
-# define VPCMP vpcmpd
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __wcscmp_evex
+# endif
+
+# define TESTEQ subl $0xff,
+ /* Compare packed dwords. */
+# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
-# define SHIFT_REG32 r8d
-# define SHIFT_REG64 r8
-/* 1 dword char == 4 bytes. */
+# define VPTESTNM vptestnmd
+ /* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
-/* Compare packed bytes. */
-# define VPCMP vpcmpb
+# ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcmp_evex
+# endif
+
+# define TESTEQ incl
+ /* Compare packed bytes. */
+# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
-# define SHIFT_REG32 ecx
-# define SHIFT_REG64 rcx
-/* 1 byte char == 1 byte. */
+# define VPTESTNM vptestnmb
+ /* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
-# define XMMZERO xmm16
-# define XMM0 xmm17
-# define XMM1 xmm18
+# ifdef USE_AS_STRNCMP
+# define LOOP_REG r9d
+# define LOOP_REG64 r9
+
+# define OFFSET_REG8 r9b
+# define OFFSET_REG r9d
+# define OFFSET_REG64 r9
+# else
+# define LOOP_REG edx
+# define LOOP_REG64 rdx
+
+# define OFFSET_REG8 dl
+# define OFFSET_REG edx
+# define OFFSET_REG64 rdx
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+# define VEC_OFFSET 0
+# else
+# define VEC_OFFSET (-VEC_SIZE)
+# endif
+
+# define XMM0 xmm17
+# define XMM1 xmm18
+
+# define XMM10 xmm27
+# define XMM11 xmm28
+# define XMM12 xmm29
+# define XMM13 xmm30
+# define XMM14 xmm31
+
+
+# define YMM0 ymm17
+# define YMM1 ymm18
+# define YMM2 ymm19
+# define YMM3 ymm20
+# define YMM4 ymm21
+# define YMM5 ymm22
+# define YMM6 ymm23
+# define YMM7 ymm24
+# define YMM8 ymm25
+# define YMM9 ymm26
+# define YMM10 ymm27
+# define YMM11 ymm28
+# define YMM12 ymm29
+# define YMM13 ymm30
+# define YMM14 ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_evex
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_evex
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
+# define LCASE_MIN_YMM %YMM12
+# define LCASE_MAX_YMM %YMM13
+# define CASE_ADD_YMM %YMM14
+
+# define LCASE_MIN_XMM %XMM12
+# define LCASE_MAX_XMM %XMM13
+# define CASE_ADD_XMM %XMM14
+
+ /* NB: wcsncmp uses r11 but strcasecmp is never used in
+ conjunction with wcscmp. */
+# define TOLOWER_BASE %r11
+
+# ifdef USE_AS_STRCASECMP_L
+# define _REG(x, y) x ## y
+# define REG(x, y) _REG(x, y)
+# define TOLOWER(reg1, reg2, ext) \
+ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
+ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
+ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
+ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
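+	/* TOLOWER converts the bytes of both vectors to lowercase:
+	   subtract L(lcase_min) ('A'), unsigned-compare the result
+	   below L(lcase_max) (26) to build a mask of uppercase ASCII
+	   letters, then add L(case_add) (0x20) only to those bytes.  */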
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
+# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
+
+# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
+ TOLOWER (s1_reg, s2_reg, ext); \
+ VPCMP $0, s1_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
+ VMOVU s2_mem, s2_reg; \
+ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_YMM(...)
+# define TOLOWER_XMM(...)
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-# define YMM2 ymm19
-# define YMM3 ymm20
-# define YMM4 ymm21
-# define YMM5 ymm22
-# define YMM6 ymm23
-# define YMM7 ymm24
-# define YMM8 ymm25
-# define YMM9 ymm26
-# define YMM10 ymm27
+# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
+ VPCMP $0, s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
+ VPCMP $0, s2_mem, s1_reg, reg_out
+
+# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
the maximum offset is reached before a difference is found, zero is
returned. */
- .section .text.evex,"ax",@progbits
-ENTRY (STRCMP)
-# ifdef USE_AS_STRNCMP
- /* Check for simple cases (0 or 1) in offset. */
- cmp $1, %RDX_LP
- je L(char0)
- jb L(zero)
-# ifdef USE_AS_WCSCMP
-# ifndef __ILP32__
- movq %rdx, %rcx
- /* Check if length could overflow when multiplied by
- sizeof(wchar_t). Checking top 8 bits will cover all potential
- overflow cases as well as redirect cases where its impossible to
- length to bound a valid memory region. In these cases just use
- 'wcscmp'. */
- shrq $56, %rcx
- jnz __wcscmp_evex
+ .section .text.evex, "ax", @progbits
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
# endif
- /* Convert units: from wide to byte char. */
- shl $2, %RDX_LP
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
+# ifdef USE_AS_STRNCMP
+	/* Don't overwrite LOCALE_REG (rcx) until we have passed
+ L(one_or_less). Otherwise we might use the wrong locale in
+ the OVERFLOW_STRCMP (strcasecmp_l). */
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
# endif
- /* Register %r11 tracks the maximum offset. */
- mov %RDX_LP, %R11_LP
+ cmp $1, %RDX_LP
+ /* Signed comparison intentional. We use this branch to also
+ test cases where length >= 2^63. These very large sizes can be
+ handled with strcmp as there is no way for that length to
+ actually bound the buffer. */
+ jle L(one_or_less)
+# endif
+
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+ .quad 0x4141414141414141
+L(lcase_max):
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+ .quad 0x1a1a1a1a1a1a1a1a
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
# endif
+
movl %edi, %eax
- xorl %edx, %edx
- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
- jg L(cross_page)
- /* Start comparing 4 vectors. */
+	/* Shift out the bits irrelevant to the page boundary ([63:12]).  */
+ sall $20, %eax
+ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
+ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+ ja L(page_cross)
+
+L(no_page_cross):
+ /* Safe to compare 4x vectors. */
VMOVU (%rdi), %YMM0
-
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
-
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi). */
- VPCMP $0, (%rsi), %YMM0, %k1{%k2}
-
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(next_3_vectors)
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx) is after the maximum
- offset (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(vec_0_test_len)
# endif
+
+ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
+ wcscmp/wcsncmp. */
+
+	/* All 1s means all CHARs were equal (and non-null).  TESTEQ will
+	   overflow to zero in the all-equal case.  Otherwise the carry
+	   stops at the position of the first mismatch.  */
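+	/* For example, with wcsncmp CHAR_PER_VEC is 8, so an all-equal
+	   vector gives a mask of 0xff and `subl $0xff' yields zero,
+	   while for strcmp an all-equal vector gives 0xffffffff and
+	   `incl' wraps to zero.  */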
+ TESTEQ %ecx
+ jz L(more_3x_vec)
+
+ .p2align 4,, 4
+L(return_vec_0):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- je L(return)
-L(wcscmp_return):
+ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret0)
setl %al
negl %eax
orl $1, %eax
-L(return):
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
+L(ret0):
ret
-L(return_vec_size):
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
- the maximum offset (%r11). */
- addq $VEC_SIZE, %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+ .p2align 4,, 4
+L(vec_0_test_len):
+ notl %ecx
+ bzhil %edx, %ecx, %eax
+ jnz L(return_vec_0)
+	/* Align if it will cross a fetch block.  */
+ .p2align 4,, 2
+L(ret_zero):
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ ret
+
+ .p2align 4,, 5
+L(one_or_less):
+# ifdef USE_AS_STRCASECMP_L
+ /* Set locale argument for strcasecmp. */
+ movq %LOCALE_REG, %rdx
# endif
-# else
+ jb L(ret_zero)
+ /* 'nbe' covers the case where length is negative (large
+ unsigned). */
+ jnbe OVERFLOW_STRCMP
# ifdef USE_AS_WCSCMP
+ movl (%rdi), %edx
xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rdx), %ecx
- cmpl VEC_SIZE(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (%rsi), %edx
+ je L(ret1)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl VEC_SIZE(%rdi, %rdx), %eax
- movzbl VEC_SIZE(%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
-# endif
+L(ret1):
ret
+# endif
-L(return_2_vec_size):
- tzcntl %ecx, %edx
+ .p2align 4,, 10
+L(return_vec_1):
+ tzcntl %ecx, %ecx
+# ifdef USE_AS_STRNCMP
+	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
+	   worrying about underflow.  */
+ addq $-CHAR_PER_VEC, %rdx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
+ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ xorl %eax, %eax
+ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret2)
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
+L(ret2):
+ ret
+
+ .p2align 4,, 10
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 2), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+L(return_vec_3):
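+	/* Shift the VEC 3 mask up by CHAR_PER_VEC so the fall-through
+	   tzcnt in L(return_vec_2) yields the mismatch index plus
+	   CHAR_PER_VEC, making the shared (VEC_SIZE * 2) offset below
+	   address VEC 3.  */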
+# if CHAR_PER_VEC <= 16
+ sall $CHAR_PER_VEC, %ecx
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ salq $CHAR_PER_VEC, %rcx
# endif
+# endif
+L(return_vec_2):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+ tzcntl %ecx, %ecx
# else
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ tzcntq %rcx, %rcx
# endif
- ret
-L(return_3_vec_size):
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 3), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret3)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+# endif
+L(ret3):
+ ret
+
+# ifndef USE_AS_STRNCMP
+ .p2align 4,, 10
+L(return_vec_3):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret4)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
# endif
-# endif
+L(ret4):
ret
+# endif
- .p2align 4
-L(next_3_vectors):
- VMOVU VEC_SIZE(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+	/* 32 byte alignment here ensures the main loop is ideally
+	   aligned for the DSB.  */
+ .p2align 5
+L(more_3x_vec):
+ /* Safe to compare 4x vectors. */
+ VMOVU (VEC_SIZE)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_1)
+
+# ifdef USE_AS_STRNCMP
+ subq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero)
# endif
- jne L(return_vec_size)
VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- jne L(return_2_vec_size)
+ TESTEQ %ecx
+ jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero)
+# endif
+
+
# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
+ movl $2, %r8d
+
# else
- incl %ecx
-# endif
- jne L(return_3_vec_size)
-L(main_loop_header):
- leaq (VEC_SIZE * 4)(%rdi), %rdx
- movl $PAGE_SIZE, %ecx
- /* Align load via RAX. */
- andq $-(VEC_SIZE * 4), %rdx
- subq %rdi, %rdx
- leaq (%rdi, %rdx), %rax
+ xorl %r8d, %r8d
+# endif
+
+ /* The prepare labels are various entry points from the page
+ cross logic. */
+L(prepare_loop):
+
# ifdef USE_AS_STRNCMP
- /* Starting from this point, the maximum offset, or simply the
- 'offset', DECREASES by the same amount when base pointers are
- moved forward. Return 0 when:
- 1) On match: offset <= the matched vector index.
- 2) On mistmach, offset is before the mistmatched index.
- */
- subq %rdx, %r11
- jbe L(zero)
-# endif
- addq %rsi, %rdx
- movq %rdx, %rsi
- andl $(PAGE_SIZE - 1), %esi
- /* Number of bytes before page crossing. */
- subq %rsi, %rcx
- /* Number of VEC_SIZE * 4 blocks before page crossing. */
- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
- movl %ecx, %esi
- jmp L(loop_start)
+# ifdef USE_AS_WCSCMP
+L(prepare_loop_no_len):
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ shrl $2, %ecx
+ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
+# else
+	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
+	   the loop.  */
+ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
+L(prepare_loop_no_len):
+# endif
+# else
+L(prepare_loop_no_len):
+# endif
+
+ /* Align s1 and adjust s2 accordingly. */
+ subq %rdi, %rsi
+ andq $-(VEC_SIZE * 4), %rdi
+L(prepare_loop_readj):
+ addq %rdi, %rsi
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
+ subq %rdi, %rdx
+# endif
+
+L(prepare_loop_aligned):
+ /* eax stores distance from rsi to next page cross. These cases
+ need to be handled specially as the 4x loop could potentially
+ read memory past the length of s1 or s2 and across a page
+ boundary. */
+ movl $-(VEC_SIZE * 4), %eax
+ subl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
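
As the comment above notes, `eax` here ends up holding the distance from `rsi` to its next page boundary, biased by -(VEC_SIZE * 4) and reduced modulo PAGE_SIZE, so the main loop only needs one add and one branch per iteration to notice an upcoming page cross. A rough C model of that initialisation (assuming PAGE_SIZE is a power of two; the address is hypothetical):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

int
main (void)
{
  uintptr_t s2 = 0x7f0000001fc0;   /* hypothetical second-string pointer */

  /* movl $-(VEC_SIZE * 4), %eax; subl %esi, %eax; andl $(PAGE_SIZE - 1), %eax  */
  unsigned eax = (unsigned) (-(VEC_SIZE * 4) - (uint32_t) s2) & (PAGE_SIZE - 1);

  /* Equivalent formulation: bytes until the next page boundary, minus
     the 4 * VEC_SIZE the loop consumes per iteration, mod PAGE_SIZE.  */
  unsigned to_boundary = (unsigned) (0 - (uint32_t) s2) & (PAGE_SIZE - 1);
  assert (eax == ((to_boundary - VEC_SIZE * 4) & (PAGE_SIZE - 1)));
  return 0;
}
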
+
+ /* Loop 4x comparisons at a time. */
.p2align 4
L(loop):
+
+ /* End condition for strncmp. */
# ifdef USE_AS_STRNCMP
- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
- the maximum offset (%r11) by the same amount. */
- subq $(VEC_SIZE * 4), %r11
- jbe L(zero)
-# endif
- addq $(VEC_SIZE * 4), %rax
- addq $(VEC_SIZE * 4), %rdx
-L(loop_start):
- testl %esi, %esi
- leal -1(%esi), %esi
- je L(loop_cross_page)
-L(back_to_loop):
- /* Main loop, comparing 4 vectors are a time. */
- VMOVA (%rax), %YMM0
- VMOVA VEC_SIZE(%rax), %YMM2
- VMOVA (VEC_SIZE * 2)(%rax), %YMM4
- VMOVA (VEC_SIZE * 3)(%rax), %YMM6
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_zero)
+# endif
+
+ subq $-(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+
+ /* Check if rsi loads will cross a page boundary. */
+ addl $-(VEC_SIZE * 4), %eax
+ jnb L(page_cross_during_loop)
+
+ /* Loop entry after handling page cross during loop. */
+L(loop_skip_page_cross_check):
+ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0
+ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM0, %YMM2, %YMM8
VPMINU %YMM4, %YMM6, %YMM9
- /* A zero CHAR in YMM8 means that there is a null CHAR. */
- VPMINU %YMM8, %YMM9, %YMM8
+ /* A zero CHAR in YMM9 means that there is a null CHAR. */
+ VPMINU %YMM8, %YMM9, %YMM9
+
+ /* Each bit set in K1 represents a non-null CHAR in YMM9. */
+ VPTESTM %YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
+ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
+ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
+ oring with YMM1. Result is stored in YMM6. */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+# else
+ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
+ TOLOWER_YMM (%YMM0, %YMM1)
+ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
+ TOLOWER_YMM (%YMM2, %YMM3)
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM0, %YMM1, %YMM1
+ vpxorq %YMM2, %YMM3, %YMM3
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
+ /* Or together YMM3, YMM5, and YMM6. */
+ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
- /* Each bit set in K1 represents a non-null CHAR in YMM8. */
- VPTESTM %YMM8, %YMM8, %k1
- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
- vpxorq (%rdx), %YMM0, %YMM1
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+ /* A non-zero CHAR in YMM6 represents a mismatch. */
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ kmovd %k0, %LOOP_REG
- vporq %YMM1, %YMM3, %YMM9
- vporq %YMM5, %YMM7, %YMM10
+ TESTEQ %LOOP_REG
+ jz L(loop)
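
The mismatch accumulation above folds the four per-vector XOR results together with two `vpternlogd` instructions instead of three `vpor`s. The 8-bit immediate is a truth table indexed by the (destination, second-source, third-source) bit triple: 0xde computes dst = src2 | (dst ^ src3) and 0xfe computes dst = dst | src2 | src3, matching the comments in the loop. A small stand-alone C check of those two truth tables (the helper is illustrative, not glibc code):

#include <assert.h>

/* Evaluate a VPTERNLOG-style truth table bit by bit: for each bit
   position the result bit is imm8[(a << 2) | (b << 1) | c], where a is
   the destination bit, b the second source and c the third source.  */
static unsigned
ternlog (unsigned char imm8, unsigned a, unsigned b, unsigned c)
{
  unsigned r = 0;
  for (int bit = 0; bit < 32; bit++)
    {
      unsigned idx = (((a >> bit) & 1) << 2)
		     | (((b >> bit) & 1) << 1)
		     | ((c >> bit) & 1);
      r |= (unsigned) ((imm8 >> idx) & 1) << bit;
    }
  return r;
}

int
main (void)
{
  unsigned a = 0xdeadbeef, b = 0x12345678, c = 0x0f0f0f0f;

  /* vpternlogd $0xde, c, b, a  ==>  a = b | (a ^ c).  */
  assert (ternlog (0xde, a, b, c) == (b | (a ^ c)));

  /* vpternlogd $0xfe, c, b, a  ==>  a = a | b | c.  */
  assert (ternlog (0xfe, a, b, c) == (a | b | c));
  return 0;
}
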
- /* A non-zero CHAR in YMM9 represents a mismatch. */
- vporq %YMM9, %YMM10, %YMM9
- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
- kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
+	/* Find which VEC has the mismatch or end of string.  */
+ VPTESTM %YMM0, %YMM0, %k1
+ VPTESTNM %YMM1, %YMM1, %k0{%k1}
+ kmovd %k0, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_0_end)
+
+ VPTESTM %YMM2, %YMM2, %k1
+ VPTESTNM %YMM3, %YMM3, %k0{%k1}
+ kmovd %k0, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_1_end)
+
+
+ /* Handle VEC 2 and 3 without branches. */
+L(return_vec_2_3_end):
+# ifdef USE_AS_STRNCMP
+ subq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero_end)
# endif
- je L(loop)
- /* Each bit set in K1 represents a non-null CHAR in YMM0. */
- VPTESTM %YMM0, %YMM0, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM0 and (%rdx). */
- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ VPTESTM %YMM4, %YMM4, %k1
+ VPTESTNM %YMM5, %YMM5, %k0{%k1}
kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ TESTEQ %ecx
+# if CHAR_PER_VEC <= 16
+ sall $CHAR_PER_VEC, %LOOP_REG
+ orl %ecx, %LOOP_REG
# else
- incl %ecx
-# endif
- je L(test_vec)
- tzcntl %ecx, %ecx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %ecx
+ salq $CHAR_PER_VEC, %LOOP_REG64
+ orq %rcx, %LOOP_REG64
+# endif
+L(return_vec_3_end):
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
+	   VEC 0, 1, and 2 all have no null and no mismatches then the
+	   mismatch must entirely be from VEC 3 which is fully represented
+	   by LOOP_REG.  */
+# if CHAR_PER_VEC <= 16
+ tzcntl %LOOP_REG, %LOOP_REG
+# else
+ tzcntq %LOOP_REG64, %LOOP_REG64
# endif
# ifdef USE_AS_STRNCMP
- cmpq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ cmpq %LOOP_REG64, %rdx
+ jbe L(ret_zero_end)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+ je L(ret5)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
+ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret5):
ret
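
The `xorl %r8d, %eax; subl %r8d, %eax` pairs in these return paths conditionally negate the byte-difference result: `r8` is 0 when `rdi`/`rsi` kept their original roles and -1 when the page cross logic swapped them (see L(page_cross_s2) below), and (x ^ -1) - (-1) is exactly two's-complement negation while (x ^ 0) - 0 is the identity. A minimal C sketch of this branch-free conditional negate (byte case only; the wide-character paths combine a different flag value with plain xor):

#include <assert.h>

/* Branch-free conditional negation: flag is 0 (keep) or -1 (negate),
   mirroring the xorl %r8d, %eax / subl %r8d, %eax sequence.  */
static int
maybe_negate (int x, int flag)
{
  return (x ^ flag) - flag;
}

int
main (void)
{
  assert (maybe_negate (7, 0) == 7);
  assert (maybe_negate (7, -1) == -7);
  assert (maybe_negate (-3, -1) == 3);
  assert (maybe_negate (0, -1) == 0);
  return 0;
}
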
- .p2align 4
-L(test_vec):
# ifdef USE_AS_STRNCMP
- /* The first vector matched. Return 0 if the maximum offset
- (%r11) <= VEC_SIZE. */
- cmpq $VEC_SIZE, %r11
- jbe L(zero)
-# endif
- /* Each bit set in K1 represents a non-null CHAR in YMM2. */
- VPTESTM %YMM2, %YMM2, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM2 and VEC_SIZE(%rdx). */
- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
- kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(test_2_vec)
- tzcntl %ecx, %edi
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edi
+ .p2align 4,, 2
+L(ret_zero_end):
+ xorl %eax, %eax
+ ret
# endif
+
+
+	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in
+	   that they use the value of `r8` to negate the return value.
+	   This is because the page cross logic can swap `rdi` and
+	   `rsi`.  */
+ .p2align 4,, 10
# ifdef USE_AS_STRNCMP
- addq $VEC_SIZE, %rdi
- cmpq %rdi, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rdi), %ecx
- cmpl (%rdx, %rdi), %ecx
- jne L(wcscmp_return)
+L(return_vec_1_end):
+# if CHAR_PER_VEC <= 16
+ sall $CHAR_PER_VEC, %ecx
# else
- movzbl (%rax, %rdi), %eax
- movzbl (%rdx, %rdi), %edx
- subl %edx, %eax
+ salq $CHAR_PER_VEC, %rcx
# endif
+# endif
+L(return_vec_0_end):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+ tzcntl %ecx, %ecx
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl VEC_SIZE(%rsi, %rdi), %ecx
- cmpl VEC_SIZE(%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl VEC_SIZE(%rax, %rdi), %eax
- movzbl VEC_SIZE(%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
+ tzcntq %rcx, %rcx
# endif
- ret
- .p2align 4
-L(test_2_vec):
# ifdef USE_AS_STRNCMP
- /* The first 2 vectors matched. Return 0 if the maximum offset
- (%r11) <= 2 * VEC_SIZE. */
- cmpq $(VEC_SIZE * 2), %r11
- jbe L(zero)
+ cmpq %rcx, %rdx
+ jbe L(ret_zero_end)
# endif
- /* Each bit set in K1 represents a non-null CHAR in YMM4. */
- VPTESTM %YMM4, %YMM4, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM4 and (VEC_SIZE * 2)(%rdx). */
- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
- kmovd %k0, %ecx
+
# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
+ xorl %eax, %eax
+ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret6)
+ setl %al
+ negl %eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   to flip the sign if `rdi` and `rsi` were swapped.  */
+ xorl %r8d, %eax
# else
- incl %ecx
-# endif
- je L(test_3_vec)
- tzcntl %ecx, %edi
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edi
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
+	   logic. Subtract `r8d` after the xor for the zero case.  */
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
-# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 2), %rdi
- cmpq %rdi, %r11
- jbe L(zero)
+L(ret6):
+ ret
+
+# ifndef USE_AS_STRNCMP
+ .p2align 4,, 10
+L(return_vec_1_end):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (%rsi, %rdi), %ecx
- cmpl (%rdx, %rdi), %ecx
- jne L(wcscmp_return)
+ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret7)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
- movzbl (%rax, %rdi), %eax
- movzbl (%rdx, %rdi), %edx
- subl %edx, %eax
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret7):
+ ret
+# endif
+
+
+ /* Page cross in rsi in next 4x VEC. */
+
+ /* TODO: Improve logic here. */
+ .p2align 4,, 10
+L(page_cross_during_loop):
+ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
+
+	/* Optimistically rsi and rdi are both aligned, in which case we
+	   don't need any logic here.  */
+ cmpl $-(VEC_SIZE * 4), %eax
+	/* Don't adjust eax before jumping back to the loop, so we will
+	   never hit the page cross case again.  */
+ je L(loop_skip_page_cross_check)
+
+ /* Check if we can safely load a VEC. */
+ cmpl $-(VEC_SIZE * 3), %eax
+ jle L(less_1x_vec_till_page_cross)
+
+ VMOVA (%rdi), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+ kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_0_end)
+
+ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
+ cmpl $-(VEC_SIZE * 2), %eax
+ jg L(more_2x_vec_till_page_cross)
+
+ .p2align 4,, 4
+L(less_1x_vec_till_page_cross):
+ subl $-(VEC_SIZE * 4), %eax
+	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
+ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	/* Mask of potentially valid bits. The lower bits can come from
+	   out-of-range comparisons (but are safe regarding page
+	   crosses).  */
+
+# ifdef USE_AS_WCSCMP
+ movl $-1, %r10d
+ movl %esi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ shrl $2, %ecx
+ shlxl %ecx, %r10d, %ecx
+ movzbl %cl, %r10d
# else
+ movl $-1, %ecx
+ shlxl %esi, %ecx, %r10d
+# endif
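
Because the load just above covers the last VEC_SIZE bytes before the page boundary, its low lanes can refer to characters before the current `rsi` position; `r10d` is built as an all-ones value shifted left by that misalignment so the stale low compare bits can be ANDed away. A small C sketch of the byte case (the wide-character case divides the shift count by 4 and keeps only 8 bits; names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Sketch of the shlx-based valid-lane mask: ignore compare bits for
   the first (s2 % 32) lanes of a 32-byte vector.  Shift counts are
   taken modulo 32, matching shlxl on 32-bit operands.  */
static uint32_t
valid_lane_mask (uintptr_t s2)
{
  return (uint32_t) -1 << (s2 & 31);
}

int
main (void)
{
  assert (valid_lane_mask (0x1000) == 0xffffffff);  /* aligned: all lanes valid */
  assert (valid_lane_mask (0x1004) == 0xfffffff0);  /* first 4 lanes ignored */
  uint32_t cmp_bits = 0x0000000f;                   /* stale bits in low lanes only */
  assert ((cmp_bits & valid_lane_mask (0x1004)) == 0);
  return 0;
}
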
+
+ kmovd %k1, %ecx
+ notl %ecx
+
+
+# ifdef USE_AS_STRNCMP
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
- jne L(wcscmp_return)
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
+ movl %eax, %r11d
+ shrl $2, %r11d
+ cmpq %r11, %rdx
# else
- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
- subl %edx, %eax
+ cmpq %rax, %rdx
# endif
+ jbe L(return_page_cross_end_check)
# endif
- ret
+ movl %eax, %OFFSET_REG
- .p2align 4
-L(test_3_vec):
-# ifdef USE_AS_STRNCMP
- /* The first 3 vectors matched. Return 0 if the maximum offset
- (%r11) <= 3 * VEC_SIZE. */
- cmpq $(VEC_SIZE * 3), %r11
- jbe L(zero)
-# endif
- /* Each bit set in K1 represents a non-null CHAR in YMM6. */
- VPTESTM %YMM6, %YMM6, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM6 and (VEC_SIZE * 3)(%rdx). */
- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1}
- kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ /* Readjust eax before potentially returning to the loop. */
+ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
+
+ andl %r10d, %ecx
+ jz L(loop_skip_page_cross_check)
+
+ .p2align 4,, 3
+L(return_page_cross_end):
+ tzcntl %ecx, %ecx
+
+# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
+ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
+L(return_page_cross_cmp_mem):
# else
- incl %ecx
+ addl %OFFSET_REG, %ecx
# endif
- tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %ecx
+ movl VEC_OFFSET(%rdi, %rcx), %edx
+ xorl %eax, %eax
+ cmpl VEC_OFFSET(%rsi, %rcx), %edx
+ je L(ret8)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
+# else
+ movzbl VEC_OFFSET(%rdi, %rcx), %eax
+ movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret8):
+ ret
+
# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 3), %rcx
- cmpq %rcx, %r11
- jbe L(zero)
+ .p2align 4,, 10
+L(return_page_cross_end_check):
+ andl %r10d, %ecx
+ tzcntl %ecx, %ecx
+ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %esi
- cmpl (%rdx, %rcx), %esi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
+ sall $2, %edx
# endif
-# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ cmpl %ecx, %edx
+ ja L(return_page_cross_cmp_mem)
xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# endif
ret
-
- .p2align 4
-L(loop_cross_page):
- xorl %r10d, %r10d
- movq %rdx, %rcx
- /* Align load via RDX. We load the extra ECX bytes which should
- be ignored. */
- andl $((VEC_SIZE * 4) - 1), %ecx
- /* R10 is -RCX. */
- subq %rcx, %r10
-
- /* This works only if VEC_SIZE * 2 == 64. */
-# if (VEC_SIZE * 2) != 64
-# error (VEC_SIZE * 2) != 64
# endif
- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
- cmpl $(VEC_SIZE * 2), %ecx
- jge L(loop_cross_page_2_vec)
- VMOVU (%rax, %r10), %YMM2
- VMOVU VEC_SIZE(%rax, %r10), %YMM3
+ .p2align 4,, 10
+L(more_2x_vec_till_page_cross):
+	/* If more than 2x VEC till page cross, we will complete a full
+	   loop iteration here.  */
- /* Each bit set in K2 represents a non-null CHAR in YMM2. */
- VPTESTM %YMM2, %YMM2, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM2 and 32 bytes at (%rdx, %r10). */
- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2}
- kmovd %k1, %r9d
- /* Don't use subl since it is the lower 16/32 bits of RDI
- below. */
- notl %r9d
-# ifdef USE_AS_WCSCMP
- /* Only last 8 bits are valid. */
- andl $0xff, %r9d
-# endif
-
- /* Each bit set in K4 represents a non-null CHAR in YMM3. */
- VPTESTM %YMM3, %YMM3, %k4
- /* Each bit cleared in K3 represents a mismatch or a null CHAR
- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
- kmovd %k3, %edi
- /* Must use notl %edi here as lower bits are for CHAR
- comparisons potentially out of range thus can be 0 without
- indicating mismatch. */
- notl %edi
-# ifdef USE_AS_WCSCMP
- /* Don't use subl since it is the upper 8 bits of EDI below. */
- andl $0xff, %edi
+ VMOVA VEC_SIZE(%rdi), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+ kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_1_end)
+
+# ifdef USE_AS_STRNCMP
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero_in_loop_page_cross)
# endif
-# ifdef USE_AS_WCSCMP
- /* NB: Each bit in EDI/R9D represents 4-byte element. */
- sall $8, %edi
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG32
- sarl $2, %SHIFT_REG32
-
- /* Each bit in EDI represents a null CHAR or a mismatch. */
- orl %r9d, %edi
-# else
- salq $32, %rdi
+ subl $-(VEC_SIZE * 4), %eax
- /* Each bit in RDI represents a null CHAR or a mismatch. */
- orq %r9, %rdi
-# endif
+ /* Safe to include comparisons from lower bytes. */
+ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+ kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_page_cross_0)
+
+ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+ kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_page_cross_1)
- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
- shrxq %SHIFT_REG64, %rdi, %rdi
- testq %rdi, %rdi
- je L(loop_cross_page_2_vec)
- tzcntq %rdi, %rcx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %ecx
-# endif
# ifdef USE_AS_STRNCMP
- cmpq %rcx, %r11
- jbe L(zero)
+	/* Must check length here as length might preclude reading the
+	   next page.  */
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
+ /* NB: strcasecmp not used with WCSCMP so this access to r11 is
+ safe. */
+ movl %eax, %r11d
+ shrl $2, %r11d
+ cmpq %r11, %rdx
# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
+ cmpq %rax, %rdx
# endif
+ jbe L(ret_zero_in_loop_page_cross)
+# endif
+
+ /* Finish the loop. */
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
+ VPMINU %YMM4, %YMM6, %YMM9
+ VPTESTM %YMM9, %YMM9, %k1
+# ifndef USE_AS_STRCASECMP_L
+ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
+ TOLOWER_YMM (%YMM4, %YMM5)
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
+ TOLOWER_YMM (%YMM6, %YMM7)
+ vpxorq %YMM4, %YMM5, %YMM5
+ vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+ VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ kmovd %k0, %LOOP_REG
+ TESTEQ %LOOP_REG
+ jnz L(return_vec_2_3_end)
+
+	/* Best for code size to include an unconditional jmp here. If
+	   this case is hot it would be faster to duplicate the
+	   L(return_vec_2_3_end) code as fall-through and jump back to
+	   the loop on the mismatch comparison.  */
+ subq $-(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
+# ifdef USE_AS_STRNCMP
+ subq $(CHAR_PER_VEC * 4), %rdx
+ ja L(loop_skip_page_cross_check)
+L(ret_zero_in_loop_page_cross):
xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# endif
ret
+# else
+ jmp L(loop_skip_page_cross_check)
+# endif
- .p2align 4
-L(loop_cross_page_2_vec):
- /* The first VEC_SIZE * 2 bytes match or are ignored. */
- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
- VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */
- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
- kmovd %k1, %r9d
- /* Don't use subl since it is the lower 16/32 bits of RDI
- below. */
- notl %r9d
-# ifdef USE_AS_WCSCMP
- /* Only last 8 bits are valid. */
- andl $0xff, %r9d
-# endif
-
- VPTESTM %YMM1, %YMM1, %k4
- /* Each bit cleared in K3 represents a mismatch or a null CHAR
- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
- kmovd %k3, %edi
- /* Must use notl %edi here as lower bits are for CHAR
- comparisons potentially out of range thus can be 0 without
- indicating mismatch. */
- notl %edi
-# ifdef USE_AS_WCSCMP
- /* Don't use subl since it is the upper 8 bits of EDI below. */
- andl $0xff, %edi
+ .p2align 4,, 10
+L(return_vec_page_cross_0):
+ addl $-VEC_SIZE, %eax
+L(return_vec_page_cross_1):
+ tzcntl %ecx, %ecx
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
+# ifdef USE_AS_STRNCMP
+# ifdef USE_AS_WCSCMP
+	   /* Must divide ecx instead of multiplying rdx due to overflow.  */
+ movl %ecx, %eax
+ shrl $2, %eax
+ cmpq %rax, %rdx
+# else
+ cmpq %rcx, %rdx
+# endif
+ jbe L(ret_zero_in_loop_page_cross)
+# endif
+# else
+ addl %eax, %ecx
# endif
# ifdef USE_AS_WCSCMP
- /* NB: Each bit in EDI/R9D represents 4-byte element. */
- sall $8, %edi
-
- /* Each bit in EDI represents a null CHAR or a mismatch. */
- orl %r9d, %edi
+ movl VEC_OFFSET(%rdi, %rcx), %edx
+ xorl %eax, %eax
+ cmpl VEC_OFFSET(%rsi, %rcx), %edx
+ je L(ret9)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
- salq $32, %rdi
+ movzbl VEC_OFFSET(%rdi, %rcx), %eax
+ movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
+ subl %ecx, %eax
+ xorl %r8d, %eax
+ subl %r8d, %eax
+# endif
+L(ret9):
+ ret
- /* Each bit in RDI represents a null CHAR or a mismatch. */
- orq %r9, %rdi
+
+ .p2align 4,, 10
+L(page_cross):
+# ifndef USE_AS_STRNCMP
+	/* If both are VEC aligned we don't need any special logic here.
+	   Only valid for strcmp where stop condition is guaranteed to be
+	   reachable by just reading memory.  */
+ testl $((VEC_SIZE - 1) << 20), %eax
+ jz L(no_page_cross)
# endif
- xorl %r8d, %r8d
- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
- subl $(VEC_SIZE * 2), %ecx
- jle 1f
- /* R8 has number of bytes skipped. */
- movl %ecx, %r8d
+ movl %edi, %eax
+ movl %esi, %ecx
+ andl $(PAGE_SIZE - 1), %eax
+ andl $(PAGE_SIZE - 1), %ecx
+
+ xorl %OFFSET_REG, %OFFSET_REG
+
+ /* Check which is closer to page cross, s1 or s2. */
+ cmpl %eax, %ecx
+ jg L(page_cross_s2)
+
+	/* The previous page cross check has false positives. Check for
+	   a true positive as the page cross logic is very expensive.  */
+ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
+ jbe L(no_page_cross)
+
+
+ /* Set r8 to not interfere with normal return value (rdi and rsi
+ did not swap). */
# ifdef USE_AS_WCSCMP
- /* NB: Divide shift count by 4 since each bit in RDI represent 4
- bytes. */
- sarl $2, %ecx
- /* Skip ECX bytes. */
- shrl %cl, %edi
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
+ movl $2, %r8d
# else
- /* Skip ECX bytes. */
- shrq %cl, %rdi
+ xorl %r8d, %r8d
# endif
-1:
- /* Before jumping back to the loop, set ESI to the number of
- VEC_SIZE * 4 blocks before page crossing. */
- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
- testq %rdi, %rdi
+ /* Check if less than 1x VEC till page cross. */
+ subl $(VEC_SIZE * 3), %eax
+ jg L(less_1x_vec_till_page)
+
+
+	/* If more than 1x VEC till page cross, loop through safely
+	   loadable memory until within 1x VEC of page cross.  */
+ .p2align 4,, 8
+L(page_cross_loop):
+ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+ kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(check_ret_vec_page_cross)
+ addl $CHAR_PER_VEC, %OFFSET_REG
# ifdef USE_AS_STRNCMP
- /* At this point, if %rdi value is 0, it already tested
- VEC_SIZE*4+%r10 byte starting from %rax. This label
- checks whether strncmp maximum offset reached or not. */
- je L(string_nbyte_offset_check)
-# else
- je L(back_to_loop)
+ cmpq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross)
# endif
- tzcntq %rdi, %rcx
+ addl $VEC_SIZE, %eax
+ jl L(page_cross_loop)
+
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %ecx
+ shrl $2, %eax
# endif
- addq %r10, %rcx
- /* Adjust for number of bytes skipped. */
- addq %r8, %rcx
+
+
+ subl %eax, %OFFSET_REG
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
+	 */
+ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
+ VPTESTM %YMM0, %YMM0, %k2
+ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+
+ kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
- addq $(VEC_SIZE * 2), %rcx
- subq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# else
+ leal CHAR_PER_VEC(%OFFSET_REG64), %eax
+ cmpq %rax, %rdx
+ jbe L(check_ret_vec_page_cross2)
# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
- jne L(wcscmp_return)
+ addq $-(CHAR_PER_VEC * 2), %rdx
# else
- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
- subl %edx, %eax
+ addq %rdi, %rdx
# endif
# endif
- ret
+ TESTEQ %ecx
+ jz L(prepare_loop_no_len)
-# ifdef USE_AS_STRNCMP
-L(string_nbyte_offset_check):
- leaq (VEC_SIZE * 4)(%r10), %r10
- cmpq %r10, %r11
- jbe L(zero)
- jmp L(back_to_loop)
-# endif
-
- .p2align 4
-L(cross_page_loop):
- /* Check one byte/dword at a time. */
-# ifdef USE_AS_WCSCMP
- cmpl %ecx, %eax
-# else
- subl %ecx, %eax
-# endif
- jne L(different)
- addl $SIZE_OF_CHAR, %edx
- cmpl $(VEC_SIZE * 4), %edx
- je L(main_loop_header)
-# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
+ .p2align 4,, 4
+L(ret_vec_page_cross):
+# ifndef USE_AS_STRNCMP
+L(check_ret_vec_page_cross):
# endif
+ tzcntl %ecx, %ecx
+ addl %OFFSET_REG, %ecx
+L(ret_vec_page_cross_cont):
# ifdef USE_AS_WCSCMP
- movl (%rdi, %rdx), %eax
- movl (%rsi, %rdx), %ecx
+ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
+ xorl %eax, %eax
+ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret12)
+ setl %al
+ negl %eax
+ xorl %r8d, %eax
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
-# endif
- /* Check null CHAR. */
- testl %eax, %eax
- jne L(cross_page_loop)
- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
- comparisons. */
+ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
+ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
-# ifndef USE_AS_WCSCMP
-L(different):
+ xorl %r8d, %eax
+ subl %r8d, %eax
# endif
+L(ret12):
ret
-# ifdef USE_AS_WCSCMP
- .p2align 4
-L(different):
- /* Use movl to avoid modifying EFLAGS. */
- movl $0, %eax
- setl %al
- negl %eax
- orl $1, %eax
- ret
-# endif
# ifdef USE_AS_STRNCMP
- .p2align 4
-L(zero):
+ .p2align 4,, 10
+L(check_ret_vec_page_cross2):
+ TESTEQ %ecx
+L(check_ret_vec_page_cross):
+ tzcntl %ecx, %ecx
+ addl %OFFSET_REG, %ecx
+ cmpq %rcx, %rdx
+ ja L(ret_vec_page_cross_cont)
+ .p2align 4,, 2
+L(ret_zero_page_cross):
xorl %eax, %eax
ret
+# endif
- .p2align 4
-L(char0):
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
-# endif
- ret
+ .p2align 4,, 4
+L(page_cross_s2):
+ /* Ensure this is a true page cross. */
+ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
+ jbe L(no_page_cross)
+
+
+ movl %ecx, %eax
+ movq %rdi, %rcx
+ movq %rsi, %rdi
+ movq %rcx, %rsi
+
+	/* Set r8 to negate the return value, as rdi and rsi are swapped.  */
+# ifdef USE_AS_WCSCMP
+ movl $-4, %r8d
+# else
+ movl $-1, %r8d
# endif
+ xorl %OFFSET_REG, %OFFSET_REG
- .p2align 4
-L(last_vector):
- addq %rdx, %rdi
- addq %rdx, %rsi
-# ifdef USE_AS_STRNCMP
- subq %rdx, %r11
+ /* Check if more than 1x VEC till page cross. */
+ subl $(VEC_SIZE * 3), %eax
+ jle L(page_cross_loop)
+
+ .p2align 4,, 6
+L(less_1x_vec_till_page):
+# ifdef USE_AS_WCSCMP
+ shrl $2, %eax
# endif
- tzcntl %ecx, %edx
+ /* Find largest load size we can use. */
+ cmpl $(16 / SIZE_OF_CHAR), %eax
+ ja L(less_16_till_page)
+
+ /* Use 16 byte comparison. */
+ vmovdqu (%rdi), %xmm0
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
+ kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
+ subl $0xf, %ecx
+# else
+ incw %cx
# endif
+ jnz L(check_ret_vec_page_cross)
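
For these 16-byte (xmm) probes the compare mask only has 16 meaningful bits (4 for wide characters), so the "all lanes matched" test uses `incw %cx`, which wraps 0xffff to zero and sets ZF, or `subl $0xf, %ecx` for the 4-bit wide-character mask, instead of the full-width TESTEQ used on ymm masks. A tiny C illustration of the 16-bit wrap test (the helper name is illustrative):

#include <assert.h>
#include <stdint.h>

/* A 16-lane compare mask has every lane set iff adding 1 to its low
   16 bits wraps to zero, which is what `incw %cx` tests via ZF.  */
static int
all_16_lanes_match (uint32_t mask)
{
  return (uint16_t) (mask + 1) == 0;
}

int
main (void)
{
  assert (all_16_lanes_match (0xffff));    /* every byte equal and non-null */
  assert (!all_16_lanes_match (0xfffe));   /* lane 0 mismatched or null */
  assert (!all_16_lanes_match (0x7fff));   /* lane 15 mismatched or null */
  return 0;
}
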
+ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG
# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subl %eax, %OFFSET_REG
+# else
+ /* Explicit check for 16 byte alignment. */
+ subl %eax, %OFFSET_REG
+ jz L(prepare_loop)
# endif
+ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
+ kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ subl $0xf, %ecx
+# else
+ incw %cx
+# endif
+ jnz L(check_ret_vec_page_cross)
+# ifdef USE_AS_STRNCMP
+ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subq $-(CHAR_PER_VEC * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
+ jmp L(prepare_loop_aligned)
+
+# ifdef USE_AS_STRNCMP
+ .p2align 4,, 2
+L(ret_zero_page_cross_slow_case0):
+ xorl %eax, %eax
ret
+# endif
- /* Comparing on page boundary region requires special treatment:
- It must done one vector at the time, starting with the wider
- ymm vector if possible, if not, with xmm. If fetching 16 bytes
- (xmm) still passes the boundary, byte comparison must be done.
- */
- .p2align 4
-L(cross_page):
- /* Try one ymm vector at a time. */
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(cross_page_1_vector)
-L(loop_1_vector):
- VMOVU (%rdi, %rdx), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (%rsi, %rdx). */
- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+ .p2align 4,, 10
+L(less_16_till_page):
+ cmpl $(24 / SIZE_OF_CHAR), %eax
+ ja L(less_8_till_page)
+
+ /* Use 8 byte comparison. */
+ vmovq (%rdi), %xmm0
+ vmovq (%rsi), %xmm1
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ subl $0x3, %ecx
# else
- incl %ecx
+ incb %cl
# endif
- jne L(last_vector)
+ jnz L(check_ret_vec_page_cross)
- addl $VEC_SIZE, %edx
- addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $(8 / SIZE_OF_CHAR), %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
# endif
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jle L(loop_1_vector)
-L(cross_page_1_vector):
- /* Less than 32 bytes to check, try one xmm vector. */
- cmpl $(PAGE_SIZE - 16), %eax
- jg L(cross_page_1_xmm)
- VMOVU (%rdi, %rdx), %XMM0
+ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG
+ subl %eax, %OFFSET_REG
- VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in XMM0 and 16 bytes at (%rsi, %rdx). */
- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2}
+ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_WCSCMP
- subl $0xf, %ecx
+ subl $0x3, %ecx
# else
- subl $0xffff, %ecx
+ incb %cl
# endif
- jne L(last_vector)
+ jnz L(check_ret_vec_page_cross)
+
- addl $16, %edx
-# ifndef USE_AS_WCSCMP
- addl $16, %eax
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case0)
+ subq $-(CHAR_PER_VEC * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
# endif
+ jmp L(prepare_loop_aligned)
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
- /* Less than 16 bytes to check, try 8 byte vector. NB: No need
- for wcscmp nor wcsncmp since wide char is 4 bytes. */
- cmpl $(PAGE_SIZE - 8), %eax
- jg L(cross_page_8bytes)
- vmovq (%rdi, %rdx), %XMM0
- vmovq (%rsi, %rdx), %XMM1
- VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in XMM0 and XMM1. */
- VPCMP $0, %XMM1, %XMM0, %k1{%k2}
- kmovb %k1, %ecx
+
+
+ .p2align 4,, 10
+L(less_8_till_page):
# ifdef USE_AS_WCSCMP
- subl $0x3, %ecx
+ /* If using wchar then this is the only check before we reach
+ the page boundary. */
+ movl (%rdi), %eax
+ movl (%rsi), %ecx
+ cmpl %ecx, %eax
+ jnz L(ret_less_8_wcs)
+# ifdef USE_AS_STRNCMP
+ addq $-(CHAR_PER_VEC * 2), %rdx
+ /* We already checked for len <= 1 so cannot hit that case here.
+ */
+# endif
+ testl %eax, %eax
+ jnz L(prepare_loop)
+ ret
+
+ .p2align 4,, 8
+L(ret_less_8_wcs):
+ setl %OFFSET_REG8
+ negl %OFFSET_REG
+ movl %OFFSET_REG, %eax
+ xorl %r8d, %eax
+ ret
+
# else
- subl $0xff, %ecx
-# endif
- jne L(last_vector)
+ cmpl $28, %eax
+ ja L(less_4_till_page)
+
+ vmovd (%rdi), %xmm0
+ vmovd (%rsi), %xmm1
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
+ kmovd %k1, %ecx
+ subl $0xf, %ecx
+ jnz L(check_ret_vec_page_cross)
- addl $8, %edx
- addl $8, %eax
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $4, %rdx
+ jbe L(ret_zero_page_cross_slow_case1)
# endif
+ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG
+ subl %eax, %OFFSET_REG
-L(cross_page_8bytes):
- /* Less than 8 bytes to check, try 4 byte vector. */
- cmpl $(PAGE_SIZE - 4), %eax
- jg L(cross_page_4bytes)
- vmovd (%rdi, %rdx), %XMM0
- vmovd (%rsi, %rdx), %XMM1
-
- VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in XMM0 and XMM1. */
- VPCMP $0, %XMM1, %XMM0, %k1{%k2}
+ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+ VPTESTM %xmm0, %xmm0, %k2
+ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0x1, %ecx
-# else
subl $0xf, %ecx
-# endif
- jne L(last_vector)
+ jnz L(check_ret_vec_page_cross)
+# ifdef USE_AS_STRNCMP
+ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
+ subq %OFFSET_REG64, %rdx
+ jbe L(ret_zero_page_cross_slow_case1)
+ subq $-(CHAR_PER_VEC * 4), %rdx
+
+ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# endif
+ jmp L(prepare_loop_aligned)
+
- addl $4, %edx
# ifdef USE_AS_STRNCMP
- /* Return 0 if the current offset (%rdx) >= the maximum offset
- (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ .p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+ xorl %eax, %eax
+ ret
# endif
-L(cross_page_4bytes):
-# endif
- /* Less than 4 bytes to check, try one byte/dword at a time. */
-# ifdef USE_AS_STRNCMP
- cmpq %r11, %rdx
- jae L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
- movl (%rdi, %rdx), %eax
- movl (%rsi, %rdx), %ecx
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %ecx
-# endif
- testl %eax, %eax
- jne L(cross_page_loop)
- subl %ecx, %eax
+ .p2align 4,, 10
+L(less_4_till_page):
+ subq %rdi, %rsi
+ /* Extremely slow byte comparison loop. */
+L(less_4_loop):
+ movzbl (%rdi), %eax
+ movzbl (%rsi, %rdi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
+ jnz L(ret_less_4_loop)
+ testl %ecx, %ecx
+ jz L(ret_zero_4_loop)
+# ifdef USE_AS_STRNCMP
+ decq %rdx
+ jz L(ret_zero_4_loop)
+# endif
+ incq %rdi
+	/* End condition is reaching the page boundary (rdi is aligned).  */
+ testl $31, %edi
+ jnz L(less_4_loop)
+ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+ addq $-(VEC_SIZE * 4), %rdi
+# ifdef USE_AS_STRNCMP
+ subq $-(CHAR_PER_VEC * 4), %rdx
+# endif
+ jmp L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+ xorl %eax, %eax
+ ret
+L(ret_less_4_loop):
+ xorl %r8d, %eax
+ subl %r8d, %eax
ret
-END (STRCMP)
+# endif
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
# define UPDATE_STRNCMP_COUNTER
#endif
-#ifdef USE_AVX
-# define SECTION avx
-# define GLABEL(l) l##_avx
-#else
-# define SECTION sse4.2
-# define GLABEL(l) l##_sse42
-#endif
+#define SECTION sse4.2
+#define GLABEL(l) l##_sse42
#define LABEL(l) .L##l
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
END (GLABEL(__strcasecmp))
/* FALLTHROUGH to strcasecmp_l. */
#endif
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
END (GLABEL(__strncasecmp))
/* FALLTHROUGH to strncasecmp_l. */
#endif
-#ifdef USE_AVX
-# define movdqa vmovdqa
-# define movdqu vmovdqu
-# define pmovmskb vpmovmskb
-# define pcmpistri vpcmpistri
-# define psubb vpsubb
-# define pcmpeqb vpcmpeqb
-# define psrldq vpsrldq
-# define pslldq vpslldq
-# define palignr vpalignr
-# define pxor vpxor
-# define D(arg) arg, arg
-#else
-# define D(arg) arg
-#endif
+#define arg arg
STRCMP_SSE42:
cfi_startproc
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-LABEL(belowupper):
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
- .quad 0x5a5a5a5a5a5a5a5a
- .quad 0x5a5a5a5a5a5a5a5a
-# else
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+LABEL(case_add):
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
- movdqa LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
- movdqa LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+ movdqa LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+ movdqa LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+ movdqa LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
#endif
cmp $0x30, %ecx
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# ifdef USE_AVX
-# define TOLOWER(reg1, reg2) \
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
- vpandn %xmm7, %xmm8, %xmm8; \
- vpandn %xmm9, %xmm10, %xmm10; \
- vpand LCQWORD_reg, %xmm8, %xmm8; \
- vpand LCQWORD_reg, %xmm10, %xmm10; \
- vpor reg1, %xmm8, reg1; \
- vpor reg2, %xmm10, reg2
-# else
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm7; \
- movdqa UCHIGH_reg, %xmm8; \
- movdqa reg2, %xmm9; \
- movdqa UCHIGH_reg, %xmm10; \
- pcmpgtb UCLOW_reg, %xmm7; \
- pcmpgtb reg1, %xmm8; \
- pcmpgtb UCLOW_reg, %xmm9; \
- pcmpgtb reg2, %xmm10; \
- pand %xmm8, %xmm7; \
- pand %xmm10, %xmm9; \
- pand LCQWORD_reg, %xmm7; \
- pand LCQWORD_reg, %xmm9; \
- por %xmm7, reg1; \
- por %xmm9, reg2
-# endif
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm7; \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ paddb reg1, %xmm7; \
+ paddb reg2, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm7; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm7; \
+ pandn CASE_ADD_reg, %xmm8; \
+ paddb %xmm7, reg1; \
+ paddb %xmm8, reg2
+
TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif
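
The new TOLOWER above works by range shifting rather than a pair of range compares: adding 0x3f (lcase_min) moves exactly 'A'..'Z' (0x41..0x5a) into the signed-byte range 0x80..0x99, so a single signed `pcmpgtb` against 0x99 (lcase_max) flags everything that is not an upper-case letter, and `pandn`/`paddb` then add 0x20 (case_add) only to the upper-case lanes. A scalar C model of one lane, checked against the plain definition for all 256 byte values (the helper name is illustrative; two's-complement wrap on the signed-char cast is assumed, as on x86):

#include <assert.h>

/* Scalar model of the SSE4.2 TOLOWER lane computation:
   tmp   = c + 0x3f                                  (paddb with lcase_min)
   gt    = (signed char) tmp > (signed char) 0x99    (pcmpgtb with lcase_max)
   delta = gt ? 0 : 0x20                             (pandn with case_add)
   out   = c + delta                                 (paddb)  */
static unsigned char
tolower_lane (unsigned char c)
{
  signed char tmp = (signed char) (unsigned char) (c + 0x3f);
  unsigned char delta = (tmp > (signed char) 0x99) ? 0 : 0x20;
  return (unsigned char) (c + delta);
}

int
main (void)
{
  for (unsigned v = 0; v < 256; v++)
    {
      unsigned char expected = (v >= 'A' && v <= 'Z') ? v + 0x20 : v;
      assert (tolower_lane ((unsigned char) v) == expected);
    }
  return 0;
}
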
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
jnz LABEL(less16bytes)/* If not, find different value or null char */
xor %r8d, %r8d
and $0xf, %ecx /* offset of rsi */
and $0xf, %eax /* offset of rdi */
- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
cmp %eax, %ecx
je LABEL(ashr_0) /* rsi and rdi relative offset same */
ja LABEL(bigger)
sub %rcx, %r9
lea LABEL(unaligned_table)(%rip), %r10
movslq (%r10, %r9,4), %r9
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
lea (%r10, %r9), %r10
_CET_NOTRACK jmp *%r10 /* jump to corresponding case */
LABEL(ashr_0):
movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
#else
movdqa (%rdi), %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
#endif
- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
*/
.p2align 4
LABEL(ashr_1):
- pslldq $15, D(%xmm2) /* shift first string to align with second */
+ pslldq $15, %xmm2 /* shift first string to align with second */
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
pmovmskb %xmm2, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
shr %cl, %r9d /* adjust for 16-byte offset */
LABEL(nibble_ashr_1_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
+ palignr $1, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_1_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), D(%xmm0)
+ palignr $1, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_1_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $1, D(%xmm0)
+ psrldq $1, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_2):
- pslldq $14, D(%xmm2)
+ pslldq $14, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_2_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
+ palignr $2, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_2_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), D(%xmm0)
+ palignr $2, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_2_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $2, D(%xmm0)
+ psrldq $2, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_3):
- pslldq $13, D(%xmm2)
+ pslldq $13, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_3_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
+ palignr $3, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_3_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), D(%xmm0)
+ palignr $3, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_3_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $3, D(%xmm0)
+ psrldq $3, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_4):
- pslldq $12, D(%xmm2)
+ pslldq $12, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_4_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
+ palignr $4, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_4_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), D(%xmm0)
+ palignr $4, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_4_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $4, D(%xmm0)
+ psrldq $4, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_5):
- pslldq $11, D(%xmm2)
+ pslldq $11, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_5_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
+ palignr $5, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), D(%xmm0)
+ palignr $5, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_5_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $5, D(%xmm0)
+ psrldq $5, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_6):
- pslldq $10, D(%xmm2)
+ pslldq $10, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_6_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
+ palignr $6, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_6_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), D(%xmm0)
+ palignr $6, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_6_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $6, D(%xmm0)
+ psrldq $6, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_7):
- pslldq $9, D(%xmm2)
+ pslldq $9, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_7_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
+ palignr $7, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_7_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), D(%xmm0)
+ palignr $7, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_7_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $7, D(%xmm0)
+ psrldq $7, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_8):
- pslldq $8, D(%xmm2)
+ pslldq $8, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_8_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
+ palignr $8, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_8_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), D(%xmm0)
+ palignr $8, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_8_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $8, D(%xmm0)
+ psrldq $8, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_9):
- pslldq $7, D(%xmm2)
+ pslldq $7, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_9_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
+ palignr $9, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_9_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), D(%xmm0)
+ palignr $9, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_9_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $9, D(%xmm0)
+ psrldq $9, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_10):
- pslldq $6, D(%xmm2)
+ pslldq $6, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_10_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
+ palignr $10, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_10_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), D(%xmm0)
+ palignr $10, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_10_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $10, D(%xmm0)
+ psrldq $10, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_11):
- pslldq $5, D(%xmm2)
+ pslldq $5, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_11_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
+ palignr $11, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_11_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), D(%xmm0)
+ palignr $11, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_11_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $11, D(%xmm0)
+ psrldq $11, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_12):
- pslldq $4, D(%xmm2)
+ pslldq $4, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_12_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
+ palignr $12, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_12_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), D(%xmm0)
+ palignr $12, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_12_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $12, D(%xmm0)
+ psrldq $12, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_13):
- pslldq $3, D(%xmm2)
+ pslldq $3, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_13_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
+ palignr $13, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_13_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), D(%xmm0)
+ palignr $13, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_13_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $13, D(%xmm0)
+ psrldq $13, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_14):
- pslldq $2, D(%xmm2)
+ pslldq $2, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_14_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
+ palignr $14, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_14_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), D(%xmm0)
+ palignr $14, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_14_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $14, D(%xmm0)
+ psrldq $14, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
*/
.p2align 4
LABEL(ashr_15):
- pslldq $1, D(%xmm2)
+ pslldq $1, %xmm2
TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, D(%xmm2)
- psubb %xmm0, D(%xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
shr %cl, %edx
shr %cl, %r9d
LABEL(nibble_ashr_15_restart_use):
movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
+ palignr $15, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
jg LABEL(nibble_ashr_15_use)
movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), D(%xmm0)
+ palignr $15, -16(%rdi, %rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
LABEL(nibble_ashr_15_use):
sub $0x1000, %r10
movdqa -16(%rdi, %rdx), %xmm0
- psrldq $15, D(%xmm0)
+ psrldq $15, %xmm0
pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
cmp %r11, %rcx
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
return OPTIMIZE (avx2);
}
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+ return OPTIMIZE (sse42);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
RETURN (NULL, strlen (s));
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (unsigned int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return STRCSPN_SSE2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return STRCSPN_SSE2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
}
- offset = (int) ((size_t) s & 15);
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
value = __m128i_shift_right (value, offset);
- int length = _mm_cmpistri (mask, value, 0x2);
+ unsigned int length = _mm_cmpistri (mask, value, 0x2);
/* No need to check ZFlag since ZFlag is always 1. */
- int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
if (cflag)
RETURN ((char *) (s + length), length);
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
+ unsigned int index = _mm_cmpistri (value, value, 0x3a);
if (index < 16 - offset)
RETURN (NULL, index);
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x2);
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- int zflag = _mm_cmpistrz (mask, value, 0x2);
+ unsigned int index = _mm_cmpistri (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
if (cflag)
RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
if (zflag)
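
For context, the rewritten SSE4.2 path above keeps standard strcspn semantics: it returns the length of the initial segment of s that contains no byte from a. Below is a minimal scalar sketch of that contract; the helper name is illustrative only and not part of the patch.

#include <stddef.h>

/* Scalar model of strcspn: count leading bytes of S that do not occur
   in the reject set A.  */
static size_t
strcspn_scalar (const char *s, const char *a)
{
  size_t n = 0;
  for (; s[n] != '\0'; n++)
    for (const char *p = a; *p != '\0'; p++)
      if (s[n] == *p)
        return n;       /* First byte of S that also appears in A.  */
  return n;             /* Terminator reached without a match.  */
}
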
+++ /dev/null
-/* strcspn optimized with SSE2.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strcspn __strcspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcspn)
-#endif
-
-#include <sysdeps/x86_64/strcspn.S>
--- /dev/null
+/* strcspn.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRCSPN __strcspn_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRCSPN)
+#endif
+
+#include <string/strcspn.c>
--- /dev/null
+/* Placeholder function, not used by any processor at the moment.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+# define KMOV kmovq
+# define KORTEST kortestq
+# define RAX rax
+# define RCX rcx
+# define RDX rdx
+# define SHR shrq
+# define TEXTSUFFIX evex512
+# define VMM0 zmm16
+# define VMM1 zmm17
+# define VMM2 zmm18
+# define VMM3 zmm19
+# define VMM4 zmm20
+# define VMOVA vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused. */
+# define KMOV kmovd
+# define KORTEST kortestd
+# define RAX eax
+# define RCX ecx
+# define RDX edx
+# define SHR shrl
+# define TEXTSUFFIX evex256
+# define VMM0 ymm16
+# define VMM1 ymm17
+# define VMM2 ymm18
+# define VMM3 ymm19
+# define VMM4 ymm20
+# define VMOVA vmovdqa32
+# endif
+
+ .section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for one-vector-length strings.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(ret_max)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %eax
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %VMM0, %k0
+ KMOV %k0, %RAX
+ test %RAX, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+ .p2align 4,,3
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RDX
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ subq %rsi, %rdx
+ jae L(ret_max)
+ negq %rdx
+	/* At this point rdx contains the number of [w]chars left to check.
+	   From now on rdx keeps decrementing with each compare.  */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+ /* Save pointer before 4 x VEC_SIZE alignment. */
+ movq %rax, %rcx
+# endif
+
+ /* Align address to VEC_SIZE * 4 for loop. */
+ andq $-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RCX
+# endif
+	/* rcx contains the number of [w]chars that will be recompared due
+	   to the alignment fix.  rdx must be incremented by rcx to offset
+	   the alignment adjustment.  */
+ addq %rcx, %rdx
+ /* Need jump as we don't want to add/subtract rdx for first
+ iteration of 4 x VEC_SIZE aligned loop. */
+ jmp L(loop_entry)
+# endif
+
+ .p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+L(loop_entry):
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+ VMOVA (VEC_SIZE * 4)(%rax), %VMM1
+ VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+ VMOVA (VEC_SIZE * 6)(%rax), %VMM3
+ VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+ VPTESTN %VMM2, %VMM2, %k0
+ VPTESTN %VMM4, %VMM4, %k1
+
+ subq $-(VEC_SIZE * 4), %rax
+ KORTEST %k0, %k1
+ jz L(loop)
+
+ VPTESTN %VMM1, %VMM1, %k2
+ KMOV %k2, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+ KMOV %k0, %RCX
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+ VPTESTN %VMM3, %VMM3, %k3
+ KMOV %k3, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ KMOV %k1, %RCX
+
+	/* The fourth, third, and second vector terminations are much the
+	   same; they are implemented this way to avoid branching and to
+	   reuse code from the pre-loop exit condition.  */
+L(ret_vec_x4):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 3), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x3):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 2), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x2):
+ subq $-VEC_SIZE, %rax
+L(ret_vec_x1):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(page_cross):
+ movl %eax, %ecx
+# ifdef USE_AS_WCSLEN
+ andl $(VEC_SIZE - 1), %ecx
+ sarl $2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+ xorq %rdi, %rax
+ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+ KMOV %k0, %RAX
+	/* Ignore the characters skipped for the alignment adjustment.  */
+ SHR %cl, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+END (STRLEN)
+#endif
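
The structure above (handle a possibly page-crossing first vector, align, then scan unrolled blocks while decrementing the remaining count for strnlen) can be summarized by a byte-granularity scalar model. The sketch below is illustrative only and is not part of the patch.

#include <stddef.h>

/* Scalar model of the bounded scan: process the string in fixed-size
   blocks (stand-ins for vectors), stopping at the terminator or once
   MAXLEN characters have been examined.  */
static size_t
strnlen_block_sketch (const char *s, size_t maxlen, size_t block_size)
{
  size_t done = 0;
  while (done < maxlen)
    {
      size_t block = maxlen - done < block_size ? maxlen - done : block_size;
      for (size_t i = 0; i < block; i++)
        if (s[done + i] == '\0')
          return done + i;
      done += block;
    }
  return maxlen;
}
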
--- /dev/null
+#ifndef STRLEN
+# define STRLEN __strlen_evex512
+#endif
+
+#define VEC_SIZE 64
+
+#include "strlen-evex-base.S"
# define SHIFT_RETURN
#endif
+#ifndef SECTION
+# define SECTION(p) p
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
*/
-.text
+ .section SECTION(.text),"ax",@progbits
ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+++ /dev/null
-/* strncasecmp_l optimized with AVX.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#define STRCMP_SSE42 __strncasecmp_l_avx
-#define USE_AVX 1
-#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
--- /dev/null
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm
+
+#include "strncase_l-avx2.S"
--- /dev/null
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_l_avx2
+#endif
+#include "strcmp-avx2.S"
--- /dev/null
+/* strncasecmp_l optimized with EVEX.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP __strcasecmp_l_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
#define STRCMP __strncmp_avx2_rtm
#define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2_rtm
#include "strcmp-avx2-rtm.S"
#define STRCMP __strncmp_avx2
#define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2
#include "strcmp-avx2.S"
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP_SSE42 __strncmp_sse42
-#define USE_AS_STRNCMP
-#include "strcmp-sse42.S"
+#if IS_IN (libc)
+# define STRCMP_SSE42 __strncmp_sse42
+# define USE_AS_STRNCMP
+# include "strcmp-sse42.S"
+#endif
--- /dev/null
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
+++ /dev/null
-/* strpbrk optimized with SSE2.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strcspn __strpbrk_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strpbrk)
-#endif
-
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
--- /dev/null
+/* strpbrk.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRPBRK __strpbrk_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRPBRK)
+#endif
+
+#include <string/strpbrk.c>
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
+ before returning. */
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+	/* If there is no in-range search CHAR match in ymm3, then check
+ ymm1/ymm2 for an earlier match (we delay checking search
+ CHAR matches until needed). */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at a time.  */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+	/* This is the hot path.  We know CHAR is in bounds and that
+	   ymm3/ymm2 hold the latest match.  */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ /* Check if any search CHAR match in range. */
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* k0 has a 1 for each zero CHAR in YMM1. */
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+	/* Build mask up until first zero CHAR (used to mask off
+ potential search CHAR matches past the end of the string).
+ */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ /* Get last match (the `andl` removed any out of bounds
+ matches). */
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ /* eax non-zero if search CHAR in range. */
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+	/* fallthrough: no match in YMM2, so we need to check for earlier
+ matches (in YMM1). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+	/* K2 and K3 have 1 for any search CHAR match.  Test if there are
+	   any matches in either of them.  Otherwise check YMM1.  */
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+	/* Guaranteed that YMM2 and YMM3 are within range so merge the
+ two bitmasks then get last result. */
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check YMM2 for last match first. If no match try YMM1. */
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match.  This is the 'fast' path, so to
+	   speak.  */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in terms
+	   of the output dependency chain in using edx.  */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+	/* eax contains all the page offset bits of src (rdi).  `xor rdi,
+	   rax` sets the pointer with all page offset bits cleared, so an
+	   offset of (PAGE_SIZE - VEC_SIZE) will get the last aligned VEC
+	   before the page cross (guaranteed to be safe to read).  Doing
+	   this as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax`
+	   saves a bit of code size.  */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
+
+	/* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+	/* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrxl %SHIFT_REG, %eax, %eax
+
+ /* Check if any search CHAR match in range. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
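
Both the AVX2 and EVEX rewrites above implement the same contract, remembering the furthest match seen before the terminator rather than masking every vector eagerly. A minimal scalar sketch of that contract follows; it is illustrative only and not part of the patch.

#include <stddef.h>

/* Scalar model of strrchr: return the last occurrence of C in S, where
   the terminator itself matches when C is '\0'.  */
static char *
strrchr_scalar (const char *s, int c)
{
  const char *last = NULL;
  do
    {
      if (*s == (char) c)
        last = s;              /* Remember the furthest match so far.  */
    }
  while (*s++ != '\0');
  return (char *) last;
}
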
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
return 0;
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
}
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
- offset = (int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
+ __m128i adj_value = __m128i_shift_right (value, offset);
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
/* No need to check CFlag since it is always 1. */
if (length < 16 - offset)
return length;
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
+ maskz = _mm_cmpeq_epi8 (value, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
return length;
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
+ unsigned int index = _mm_cmpistri (mask, value, 0x12);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
if (cflag)
return (size_t) (aligned + index - s);
aligned += 16;
+++ /dev/null
-/* strspn optimized with SSE2.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-# define strspn __strspn_sse2
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strspn)
-#endif
-
-#include <sysdeps/x86_64/strspn.S>
--- /dev/null
+/* strspn.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define STRSPN __strspn_sse2
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(STRSPN)
+#endif
+
+#include <string/strspn.c>
--- /dev/null
+/* strstr optimized with 512-bit AVX-512 instructions
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+#define PAGESIZE 4096
+
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
+#define kshiftri_mask64(x, y) ((x) >> (y))
+#define kand_mask64(x, y) ((x) & (y))
+
+/*
+   Returns the index of the first edge within the needle; returns 0 if no edge
+   is found.  Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'.
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+ size_t ind = 0;
+ while (ned[ind + 1] != '\0')
+ {
+ if (ned[ind] != ned[ind + 1])
+ return ind;
+ else
+ ind = ind + 1;
+ }
+ return 0;
+}
+
+/*
+ Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+ size_t ind)
+{
+ while (ned[ind] != '\0')
+ {
+ if (ned[ind] != hay[hay_index + ind])
+ return false;
+ ind = ind + 1;
+ }
+ return true;
+}
+
+/*
+ Compare needle with haystack at specified location. The first 64 bytes are
+ compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+ const char *ned, const __mmask64 ned_mask,
+ const __m512i ned_zmm)
+{
+ /* check first 64 bytes using zmm and then scalar */
+ __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+ __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+ if (match != 0x0) // failed the first few chars
+ return false;
+ else if (ned_mask == FULL_MMASK64)
+ return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+ return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+ char first = ned[0];
+ if (first == '\0')
+ return (char *)haystack;
+ if (ned[1] == '\0')
+ return (char *)strchr (haystack, ned[0]);
+
+ size_t edge = find_edge_in_needle (ned);
+
+ /* ensure haystack is as long as the pos of edge in needle */
+ for (int ii = 0; ii < edge; ++ii)
+ {
+ if (haystack[ii] == '\0')
+ return NULL;
+ }
+
+ /*
+ Load 64 bytes of the needle and save it to a zmm register
+ Read one cache line at a time to avoid loading across a page boundary
+ */
+ __mmask64 ned_load_mask = _bzhi_u64 (
+ FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
+ __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+ __mmask64 ned_nullmask
+ = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
+
+ if (__glibc_unlikely (ned_nullmask == 0x0))
+ {
+ ned_zmm = _mm512_loadu_si512 (ned);
+ ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ if (ned_nullmask != 0x0)
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ else
+ {
+ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+ ned_load_mask = ned_load_mask >> 1;
+ }
+ const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+ const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+ /*
+ Read the bytes of haystack in the current cache line
+ */
+ size_t hay_index = edge;
+ __mmask64 loadmask = _bzhi_u64 (
+ FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+ /* First load is a partial cache line */
+ __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+ /* Search for NULL and compare only till null char */
+ uint64_t nullmask
+ = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+ uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ cmpmask = cmpmask & cvtmask64_u64 (loadmask);
+  /* Search for the 2 characters of the needle */
+ __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+ k1 = kshiftri_mask64 (k1, 1);
+ /* k2 masks tell us if both chars from needle match */
+ uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+ /* For every match, search for the entire needle for a full match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64 (k2);
+ k2 = _blsr_u64 (k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+ {
+ /*
+ * Use vector compare as long as you are not crossing a page
+ */
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ /* We haven't checked for potential match at the last char yet */
+ haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
+ hay_index = 0;
+
+ /*
+ Loop over one cache line at a time to prevent reading over page
+ boundary
+ */
+ __m512i hay1;
+ while (nullmask == 0)
+ {
+ hay0 = _mm512_loadu_si512 (haystack + hay_index);
+ hay1 = _mm512_load_si512 (haystack + hay_index
+ + 1); // Always 64 byte aligned
+ nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+ /* Compare only till null char */
+ cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+ k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+ k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+ /* k2 masks tell us if both chars from needle match */
+ k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+ /* For every match, compare full strings for potential match */
+ while (k2)
+ {
+ uint64_t bitcount = _tzcnt_u64 (k2);
+ k2 = _blsr_u64 (k2);
+ size_t match_pos = hay_index + bitcount - edge;
+ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+ {
+ /*
+ * Use vector compare as long as you are not crossing a page
+ */
+ if (verify_string_match_avx512 (haystack, match_pos, ned,
+ ned_load_mask, ned_zmm))
+ return (char *)haystack + match_pos;
+ }
+ else
+ {
+ /* Compare byte by byte */
+ if (verify_string_match (haystack, match_pos, ned, 0))
+ return (char *)haystack + match_pos;
+ }
+ }
+ hay_index += ZMM_SIZE_IN_BYTES;
+ }
+ return NULL;
+}
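
The candidate filtering above leans on two bit tricks: `nullmask ^ (nullmask - 1)` produces a mask covering every bit up to and including the lowest set bit (and all ones when nullmask is 0, i.e. no null byte was seen), while `_tzcnt_u64`/`_blsr_u64` walk the surviving candidate positions one set bit at a time. A minimal standalone C sketch of both tricks (illustration only, not part of the patch; build with -mbmi or equivalent):

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Mask of all bits up to and including the lowest set bit of M;
   all ones when M == 0 (no null byte found in the vector).  */
static uint64_t
mask_up_to_first_null (uint64_t m)
{
  return m ^ (m - 1);
}

int
main (void)
{
  /* Pretend bytes 3 and 9 matched both needle characters.  */
  uint64_t k2 = (1ULL << 3) | (1ULL << 9);
  while (k2)
    {
      uint64_t pos = _tzcnt_u64 (k2);   /* index of the next candidate */
      k2 = _blsr_u64 (k2);              /* clear that bit and continue */
      printf ("candidate at byte %llu\n", (unsigned long long) pos);
    }
  printf ("%016llx\n", (unsigned long long) mask_up_to_first_null (0));
  printf ("%016llx\n", (unsigned long long) mask_up_to_first_null (1ULL << 4));
  return 0;
}
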
extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
#include "init-arch.h"
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
- HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- ? __strstr_sse2_unaligned
- : __strstr_sse2)
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features *cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ return __strstr_avx512;
+
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+ return __strstr_sse2_unaligned;
+
+ return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
#undef strstr
strong_alias (__libc_strstr, strstr)
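
The new resolver only returns __strstr_avx512 when AVX512VL/BW/DQ and BMI2 are all usable and AVX-512 is not disfavoured for the CPU. For readers unfamiliar with the mechanism, a generic GNU ifunc sketch follows (hypothetical function names, with compiler builtins standing in for glibc's __get_cpu_features; not glibc code):

static int impl_scalar (int x) { return x + 1; }
static int impl_avx512 (int x) { return x + 1; }

static __typeof__ (impl_scalar) *
my_func_resolver (void)
{
  /* Runs at relocation time, before ordinary initialisation.  */
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx512bw"))
    return impl_avx512;
  return impl_scalar;
}

/* Calls to my_func are bound to whichever implementation the
   resolver picked.  */
int my_func (int x) __attribute__ ((ifunc ("my_func_resolver")));
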
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "varshift.h"
+#include <stdint.h>
-const int8_t ___m128i_shift_right[31] attribute_hidden =
+const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__((aligned(32))) =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
#include <stdint.h>
#include <tmmintrin.h>
-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
+extern const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__ ((aligned (32)));
static __inline__ __m128i
__m128i_shift_right (__m128i value, unsigned long int offset)
--- /dev/null
+/* Macro helpers for VEC_{type}({vec_num})
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _VEC_MACROS_H
+#define _VEC_MACROS_H 1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with the same
+   VEC(N) values.  */
+#define VEC_hi_xmm0 xmm16
+#define VEC_hi_xmm1 xmm17
+#define VEC_hi_xmm2 xmm18
+#define VEC_hi_xmm3 xmm19
+#define VEC_hi_xmm4 xmm20
+#define VEC_hi_xmm5 xmm21
+#define VEC_hi_xmm6 xmm22
+#define VEC_hi_xmm7 xmm23
+#define VEC_hi_xmm8 xmm24
+#define VEC_hi_xmm9 xmm25
+#define VEC_hi_xmm10 xmm26
+#define VEC_hi_xmm11 xmm27
+#define VEC_hi_xmm12 xmm28
+#define VEC_hi_xmm13 xmm29
+#define VEC_hi_xmm14 xmm30
+#define VEC_hi_xmm15 xmm31
+
+#define VEC_hi_ymm0 ymm16
+#define VEC_hi_ymm1 ymm17
+#define VEC_hi_ymm2 ymm18
+#define VEC_hi_ymm3 ymm19
+#define VEC_hi_ymm4 ymm20
+#define VEC_hi_ymm5 ymm21
+#define VEC_hi_ymm6 ymm22
+#define VEC_hi_ymm7 ymm23
+#define VEC_hi_ymm8 ymm24
+#define VEC_hi_ymm9 ymm25
+#define VEC_hi_ymm10 ymm26
+#define VEC_hi_ymm11 ymm27
+#define VEC_hi_ymm12 ymm28
+#define VEC_hi_ymm13 ymm29
+#define VEC_hi_ymm14 ymm30
+#define VEC_hi_ymm15 ymm31
+
+#define VEC_hi_zmm0 zmm16
+#define VEC_hi_zmm1 zmm17
+#define VEC_hi_zmm2 zmm18
+#define VEC_hi_zmm3 zmm19
+#define VEC_hi_zmm4 zmm20
+#define VEC_hi_zmm5 zmm21
+#define VEC_hi_zmm6 zmm22
+#define VEC_hi_zmm7 zmm23
+#define VEC_hi_zmm8 zmm24
+#define VEC_hi_zmm9 zmm25
+#define VEC_hi_zmm10 zmm26
+#define VEC_hi_zmm11 zmm27
+#define VEC_hi_zmm12 zmm28
+#define VEC_hi_zmm13 zmm29
+#define VEC_hi_zmm14 zmm30
+#define VEC_hi_zmm15 zmm31
+
+#define PRIMITIVE_VEC(vec, num) vec##num
+
+#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i)
+#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
+#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i)
+
+#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i)
+#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
+#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif
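
These names let an implementation select the upper, EVEX-only register bank (xmm16-zmm31) through the same VEC(N) spelling used by the SSE2/AVX2 variants. A hypothetical vector config built on this header might look like the following sketch (illustration only, not one of the actual config headers):

/* hypothetical-avx2-hi-vecs.h -- illustration only.  */
#define VEC_SIZE 32
#include "vec-macros.h"
/* VEC(0) now expands to ymm16.  The upper bank requires EVEX encoding
   (AVX-512VL), and code confined to ymm16-ymm31 does not dirty the
   legacy upper state, so such variants can typically skip vzeroupper.  */
#define VEC(i) VEC_hi_ymm (i)
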
--- /dev/null
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
#define AS_WCSLEN
#define strlen __wcslen_sse4_1
+#define SECTION(p) p##.sse4.1
#include "strlen-vec.S"
#define STRCMP __wcsncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP __wcscmp_avx2_rtm
#include "strcmp-avx2-rtm.S"
#define STRCMP __wcsncmp_avx2
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP __wcscmp_avx2
#include "strcmp-avx2.S"
--- /dev/null
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
#define AS_WCSLEN
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
+#define SECTION(p) p##.sse4.1
#include "strlen-vec.S"
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
+++ /dev/null
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
--- /dev/null
+/* wmemcmp optimized with SSE2.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse2
+#include "../memcmp.S"
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
END2 (__strcasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+ .p2align 4
END2 (__strncasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strncasecmp, strncasecmp)
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-.Lbelowupper:
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-.Ltopupper:
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+.Lcase_add:
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa .Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
- movdqa .Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
- movdqa .Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+ movdqa .Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+ movdqa .Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+ movdqa .Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
#endif
cmp $0x30, %ecx
ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
movhpd 8(%rdi), %xmm1
movhpd 8(%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm8; \
- movdqa UCHIGH_reg, %xmm9; \
- movdqa reg2, %xmm10; \
- movdqa UCHIGH_reg, %xmm11; \
- pcmpgtb UCLOW_reg, %xmm8; \
- pcmpgtb reg1, %xmm9; \
- pcmpgtb UCLOW_reg, %xmm10; \
- pcmpgtb reg2, %xmm11; \
- pand %xmm9, %xmm8; \
- pand %xmm11, %xmm10; \
- pand LCQWORD_reg, %xmm8; \
- pand LCQWORD_reg, %xmm10; \
- por %xmm8, reg1; \
- por %xmm10, reg2
- TOLOWER (%xmm1, %xmm2)
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ movdqa LCASE_MIN_reg, %xmm9; \
+ paddb reg1, %xmm8; \
+ paddb reg2, %xmm9; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
+ pandn CASE_ADD_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm9; \
+ paddb %xmm8, reg1; \
+ paddb %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif
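
The rewritten TOLOWER is branchless: paddb shifts each byte so that 'A'..'Z' land exactly in the signed range 0x80..0x99 (-128..-103), pcmpgtb flags every byte above that range, and pandn/paddb add 0x20 only to the unflagged bytes. A scalar C model of one byte, using the same constants (sketch, illustration only):

#include <stdint.h>

static unsigned char
tolower_branchless (unsigned char c)
{
  int8_t shifted = (int8_t) (uint8_t) (c + 0x3f);   /* paddb with .Llcase_min */
  int above = shifted > (int8_t) 0x99;              /* pcmpgtb with .Llcase_max */
  unsigned char add = above ? 0 : 0x20;             /* pandn with .Lcase_add */
  return (unsigned char) (c + add);                 /* paddb */
}
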
+++ /dev/null
-/* strcspn (str, ss) -- Return the length of the initial segment of STR
- which contains no characters from SS.
- For AMD x86-64.
- Copyright (C) 1994-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
- .text
-ENTRY (strcspn)
-
- movq %rdi, %rdx /* Save SRC. */
-
- /* First we create a table with flags for all possible characters.
- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
- supported by the C string functions we have 256 characters.
- Before inserting marks for the stop characters we clear the whole
- table. */
- movq %rdi, %r8 /* Save value. */
- subq $256, %rsp /* Make space for 256 bytes. */
- cfi_adjust_cfa_offset(256)
- movl $32, %ecx /* 32*8 bytes = 256 bytes. */
- movq %rsp, %rdi
- xorl %eax, %eax /* We store 0s. */
- cld
- rep
- stosq
-
- movq %rsi, %rax /* Setup skipset. */
-
-/* For understanding the following code remember that %rcx == 0 now.
- Although all the following instruction only modify %cl we always
- have a correct zero-extended 64-bit value in %rcx. */
-
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from skipset */
- testb %cl, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 1(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 2(%rax), %cl /* get byte from skipset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
-
- movb 3(%rax), %cl /* get byte from skipset */
- addq $4, %rax /* increment skipset pointer */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */
- testb $0xff, %cl /* is NUL char? */
- jnz L(2) /* no => process next dword from skipset */
-
-L(1): leaq -4(%rdx), %rax /* prepare loop */
-
- /* We use a neat trick for the following loop. Normally we would
- have to test for two termination conditions
- 1. a character in the skipset was found
- and
- 2. the end of the string was found
- But as a sign that the character is in the skipset we store its
- value in the table. But the value of NUL is NUL so the loop
- terminates for NUL in every case. */
-
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
-
- movb (%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- je L(4) /* yes => return */
-
- movb 1(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- je L(5) /* yes => return */
-
- movb 2(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(6) /* yes => return */
-
- movb 3(%rax), %cl /* get byte from string */
- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jne L(3) /* no => start loop again */
-
- incq %rax /* adjust pointer */
-L(6): incq %rax
-L(5): incq %rax
-
-L(4): addq $256, %rsp /* remove skipset */
- cfi_adjust_cfa_offset(-256)
-#ifdef USE_AS_STRPBRK
- xorl %edx,%edx
- orb %cl, %cl /* was last character NUL? */
- cmovzq %rdx, %rax /* Yes: return NULL */
-#else
- subq %rdx, %rax /* we have to return the number of valid
- characters, so compute distance to first
- non-valid character */
-#endif
- ret
-END (strcspn)
-libc_hidden_builtin_def (strcspn)
+++ /dev/null
-#define strcspn strpbrk
-#define USE_AS_STRPBRK
-#include <sysdeps/x86_64/strcspn.S>
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+	/* The return paths for the first vec x1/x2 have a hard-coded backward
+	   search for earlier matches.  */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero.  Note that if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through.  The
+	   branch leads to the null case, which generally seems hotter
+	   than finding CHAR in the first 3x VEC.  */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero.  Note that if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+	/* No null term, so it must be a new occurrence of CHAR.  */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
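
For reference, the vectorized code above implements the usual strrchr/wcsrchr contract: remember the most recent occurrence of the search character and stop at the terminator, which itself counts as a match when the search character is '\0'. A scalar sketch of that contract (illustration only, not the optimized code):

#include <stddef.h>

static char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  do
    {
      if (*s == (char) c)
	last = s;
    }
  while (*s++ != '\0');
  return (char *) last;
}
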
+++ /dev/null
-/* strspn (str, ss) -- Return the length of the initial segment of STR
- which contains only characters from SS.
- For AMD x86-64.
- Copyright (C) 1994-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
- .text
-ENTRY (strspn)
-
- movq %rdi, %rdx /* Save SRC. */
-
- /* First we create a table with flags for all possible characters.
- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
- supported by the C string functions we have 256 characters.
- Before inserting marks for the stop characters we clear the whole
- table. */
- movq %rdi, %r8 /* Save value. */
- subq $256, %rsp /* Make space for 256 bytes. */
- cfi_adjust_cfa_offset(256)
- movl $32, %ecx /* 32*8 bytes = 256 bytes. */
- movq %rsp, %rdi
- xorl %eax, %eax /* We store 0s. */
- cld
- rep
- stosq
-
- movq %rsi, %rax /* Setup stopset. */
-
-/* For understanding the following code remember that %rcx == 0 now.
- Although all the following instruction only modify %cl we always
- have a correct zero-extended 64-bit value in %rcx. */
-
- .p2align 4
-L(2): movb (%rax), %cl /* get byte from stopset */
- testb %cl, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 1(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 2(%rax), %cl /* get byte from stopset */
- testb $0xff, %cl /* is NUL char? */
- jz L(1) /* yes => start compare loop */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
-
- movb 3(%rax), %cl /* get byte from stopset */
- addq $4, %rax /* increment stopset pointer */
- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */
- testb $0xff, %cl /* is NUL char? */
- jnz L(2) /* no => process next dword from stopset */
-
-L(1): leaq -4(%rdx), %rax /* prepare loop */
-
- /* We use a neat trick for the following loop. Normally we would
- have to test for two termination conditions
- 1. a character in the stopset was found
- and
- 2. the end of the string was found
- But as a sign that the character is in the stopset we store its
- value in the table. But the value of NUL is NUL so the loop
- terminates for NUL in every case. */
-
- .p2align 4
-L(3): addq $4, %rax /* adjust pointer for full loop round */
-
- movb (%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(4) /* no => return */
-
- movb 1(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(5) /* no => return */
-
- movb 2(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jz L(6) /* no => return */
-
- movb 3(%rax), %cl /* get byte from string */
- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */
- jnz L(3) /* yes => start loop again */
-
- incq %rax /* adjust pointer */
-L(6): incq %rax
-L(5): incq %rax
-
-L(4): addq $256, %rsp /* remove stopset */
- cfi_adjust_cfa_offset(-256)
- subq %rdx, %rax /* we have to return the number of valid
- characters, so compute distance to first
- non-valid character */
- ret
-END (strspn)
-libc_hidden_builtin_def (strspn)
to avoid RTM abort triggered by VZEROUPPER inside transactionally. */
#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
xtest; \
- jz 1f; \
- vzeroall; \
+ jnz 1f; \
+ vzeroupper; \
ret; \
1: \
- vzeroupper; \
+ vzeroall; \
ret
+/* Can be used to replace vzeroupper that is not directly before a
+ return. This is useful when hoisting a vzeroupper from multiple
+ return paths to decrease the total number of vzerouppers and code
+ size. */
+#define COND_VZEROUPPER_XTEST \
+ xtest; \
+ jz 1f; \
+ vzeroall; \
+ jmp 2f; \
+1: \
+ vzeroupper; \
+2:
+
+/* In RTM define this as COND_VZEROUPPER_XTEST. */
+#ifndef COND_VZEROUPPER
+# define COND_VZEROUPPER vzeroupper
+#endif
+
/* Zero upper vector registers and return. */
#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)
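
The pointer bookkeeping above changes only how the running position is carried (in %rdi/%rax plus the final `shr $2`); the result is still the ordinary wcslen value, i.e. the distance to the terminator measured in wide characters. A scalar sketch of that contract, for reference (illustration only):

#include <stddef.h>
#include <wchar.h>

static size_t
wcslen_ref (const wchar_t *s)
{
  const wchar_t *p = s;
  while (*p != L'\0')
    ++p;
  return (size_t) (p - s);
}
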
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--- /dev/null
+/* wmemcmp optimized with SSE2.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMCMP __wmemcmp
+#define USE_AS_WMEMCMP 1
+#include "memcmp.S"
+
+weak_alias (__wmemcmp, wmemcmp)
include ../Makeconfig
-headers := wchar.h bits/wchar.h bits/wchar2.h bits/wchar-ldbl.h uchar.h \
- bits/types/__mbstate_t.h bits/types/mbstate_t.h bits/types/wint_t.h
+headers := wchar.h bits/wchar.h bits/wchar2.h bits/wchar2-decl.h \
+ bits/wchar-ldbl.h uchar.h bits/types/__mbstate_t.h \
+ bits/types/mbstate_t.h bits/types/wint_t.h
routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
wcsncmp wcsncpy wcspbrk wcsrchr wcsspn wcstok wcsstr wmemchr \
--- /dev/null
+/* Checking macros for wchar functions. Declarations only.
+ Copyright (C) 2004-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _BITS_WCHAR2_DECL_H
+#define _BITS_WCHAR2_DECL_H 1
+
+#ifndef _WCHAR_H
+# error "Never include <bits/wchar2-decl.h> directly; use <wchar.h> instead."
+#endif
+
+
+extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1,
+ const wchar_t *__restrict __s2, size_t __n,
+ size_t __ns1) __THROW;
+extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2,
+ size_t __n, size_t __ns1) __THROW;
+
+
+#ifdef __USE_GNU
+
+extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1,
+ const wchar_t *__restrict __s2, size_t __n,
+ size_t __ns1) __THROW;
+
+#endif
+
+
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+ size_t __ns) __THROW;
+extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src,
+ size_t __n) __THROW;
+extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src,
+ size_t __destlen) __THROW;
+extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src, size_t __n,
+ size_t __destlen) __THROW;
+extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src, size_t __n,
+ size_t __destlen) __THROW;
+extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src,
+ size_t __destlen) __THROW;
+extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest,
+ const wchar_t *__restrict __src,
+ size_t __n, size_t __destlen) __THROW;
+extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n,
+ int __flag, size_t __s_len,
+ const wchar_t *__restrict __format, ...)
+ __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */;
+extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n,
+ int __flag, size_t __s_len,
+ const wchar_t *__restrict __format,
+ __gnuc_va_list __arg)
+ __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */;
+
+#if __USE_FORTIFY_LEVEL > 1
+
+extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag,
+ const wchar_t *__restrict __format, ...);
+extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format,
+ ...);
+extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag,
+ const wchar_t *__restrict __format,
+ __gnuc_va_list __ap);
+extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format,
+ __gnuc_va_list __ap);
+
+#endif
+
+extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n,
+ __FILE *__restrict __stream) __wur;
+
+#ifdef __USE_GNU
+
+extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size,
+ int __n, __FILE *__restrict __stream)
+ __wur;
+
+#endif
+
+extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar,
+ mbstate_t *__restrict __p,
+ size_t __buflen) __THROW __wur;
+extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst,
+ const char **__restrict __src,
+ size_t __len, mbstate_t *__restrict __ps,
+ size_t __dstlen) __THROW;
+extern size_t __wcsrtombs_chk (char *__restrict __dst,
+ const wchar_t **__restrict __src,
+ size_t __len, mbstate_t *__restrict __ps,
+ size_t __dstlen) __THROW;
+
+#ifdef __USE_XOPEN2K8
+
+extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst,
+ const char **__restrict __src, size_t __nmc,
+ size_t __len, mbstate_t *__restrict __ps,
+ size_t __dstlen) __THROW;
+extern size_t __wcsnrtombs_chk (char *__restrict __dst,
+ const wchar_t **__restrict __src,
+ size_t __nwc, size_t __len,
+ mbstate_t *__restrict __ps, size_t __dstlen)
+ __THROW;
+
+#endif
+
+#endif /* bits/wchar2-decl.h. */
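
These declarations only make the checking entry points visible; with fortification enabled, the inline wrappers in bits/wchar2.h route ordinary calls to them, passing the destination size the compiler can see. A hedged sketch of the effect (the explicit __wmemcpy_chk call is for illustration only; programs normally just call wmemcpy and build with -D_FORTIFY_SOURCE=2 -O2):

#include <wchar.h>
#include <stddef.h>

/* Roughly what a fortified wmemcpy (buf, src, n) becomes when the
   compiler knows buf holds buflen wide characters: __wmemcpy_chk
   aborts at run time if n exceeds buflen.  */
static wchar_t *
copy_checked (wchar_t *buf, size_t buflen, const wchar_t *src, size_t n)
{
  return __wmemcpy_chk (buf, src, n, buflen);
}
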
#endif
-extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1,
- const wchar_t *__restrict __s2, size_t __n,
- size_t __ns1) __THROW;
extern wchar_t *__REDIRECT_NTH (__wmemcpy_alias,
(wchar_t *__restrict __s1,
const wchar_t *__restrict __s2, size_t __n),
}
-extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2,
- size_t __n, size_t __ns1) __THROW;
extern wchar_t *__REDIRECT_NTH (__wmemmove_alias, (wchar_t *__s1,
const wchar_t *__s2,
size_t __n), wmemmove);
#ifdef __USE_GNU
-extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1,
- const wchar_t *__restrict __s2, size_t __n,
- size_t __ns1) __THROW;
extern wchar_t *__REDIRECT_NTH (__wmempcpy_alias,
(wchar_t *__restrict __s1,
const wchar_t *__restrict __s2,
#endif
-extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
- size_t __ns) __THROW;
extern wchar_t *__REDIRECT_NTH (__wmemset_alias, (wchar_t *__s, wchar_t __c,
size_t __n), wmemset);
extern wchar_t *__REDIRECT_NTH (__wmemset_chk_warn,
}
-extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src,
- size_t __n) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcscpy_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src), wcscpy);
}
-extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src,
- size_t __destlen) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcpcpy_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src), wcpcpy);
}
-extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src, size_t __n,
- size_t __destlen) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcsncpy_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src,
}
-extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src, size_t __n,
- size_t __destlen) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcpncpy_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src,
}
-extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src,
- size_t __destlen) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcscat_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src), wcscat);
}
-extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest,
- const wchar_t *__restrict __src,
- size_t __n, size_t __destlen) __THROW;
extern wchar_t *__REDIRECT_NTH (__wcsncat_alias,
(wchar_t *__restrict __dest,
const wchar_t *__restrict __src,
}
-extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n,
- int __flag, size_t __s_len,
- const wchar_t *__restrict __format, ...)
- __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */;
extern int __REDIRECT_NTH_LDBL (__swprintf_alias,
(wchar_t *__restrict __s, size_t __n,
: swprintf (s, n, __VA_ARGS__))
#endif
-extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n,
- int __flag, size_t __s_len,
- const wchar_t *__restrict __format,
- __gnuc_va_list __arg)
- __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */;
extern int __REDIRECT_NTH_LDBL (__vswprintf_alias,
(wchar_t *__restrict __s, size_t __n,
#if __USE_FORTIFY_LEVEL > 1
-extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag,
- const wchar_t *__restrict __format, ...);
-extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format,
- ...);
-extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag,
- const wchar_t *__restrict __format,
- __gnuc_va_list __ap);
-extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format,
- __gnuc_va_list __ap);
-
# ifdef __va_arg_pack
__fortify_function int
wprintf (const wchar_t *__restrict __fmt, ...)
#endif
-extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n,
- __FILE *__restrict __stream) __wur;
extern wchar_t *__REDIRECT (__fgetws_alias,
(wchar_t *__restrict __s, int __n,
__FILE *__restrict __stream), fgetws) __wur;
}
#ifdef __USE_GNU
-extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size,
- int __n, __FILE *__restrict __stream)
- __wur;
extern wchar_t *__REDIRECT (__fgetws_unlocked_alias,
(wchar_t *__restrict __s, int __n,
__FILE *__restrict __stream), fgetws_unlocked)
#endif
-extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar,
- mbstate_t *__restrict __p,
- size_t __buflen) __THROW __wur;
extern size_t __REDIRECT_NTH (__wcrtomb_alias,
(char *__restrict __s, wchar_t __wchar,
mbstate_t *__restrict __ps), wcrtomb) __wur;
}
-extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst,
- const char **__restrict __src,
- size_t __len, mbstate_t *__restrict __ps,
- size_t __dstlen) __THROW;
extern size_t __REDIRECT_NTH (__mbsrtowcs_alias,
(wchar_t *__restrict __dst,
const char **__restrict __src,
}
-extern size_t __wcsrtombs_chk (char *__restrict __dst,
- const wchar_t **__restrict __src,
- size_t __len, mbstate_t *__restrict __ps,
- size_t __dstlen) __THROW;
extern size_t __REDIRECT_NTH (__wcsrtombs_alias,
(char *__restrict __dst,
const wchar_t **__restrict __src,
#ifdef __USE_XOPEN2K8
-extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst,
- const char **__restrict __src, size_t __nmc,
- size_t __len, mbstate_t *__restrict __ps,
- size_t __dstlen) __THROW;
extern size_t __REDIRECT_NTH (__mbsnrtowcs_alias,
(wchar_t *__restrict __dst,
const char **__restrict __src, size_t __nmc,
}
-extern size_t __wcsnrtombs_chk (char *__restrict __dst,
- const wchar_t **__restrict __src,
- size_t __nwc, size_t __len,
- mbstate_t *__restrict __ps, size_t __dstlen)
- __THROW;
extern size_t __REDIRECT_NTH (__wcsnrtombs_alias,
(char *__restrict __dst,
const wchar_t **__restrict __src,
/* Define some macros helping to catch buffer overflows. */
#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
-# include <bits/wchar2.h>
+/* Declare all functions from bits/wchar2-decl.h first. */
+# include <bits/wchar2-decl.h>
#endif
-#include <bits/floatn.h>
+/* The following headers provide asm redirections. These redirections must
+ appear before the first usage of these functions, e.g. in bits/wchar.h. */
#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1
# include <bits/wchar-ldbl.h>
#endif
+#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function
+/* Now include the function definitions and redirects too. */
+# include <bits/wchar2.h>
+#endif
+
__END_DECLS
#endif /* wchar.h */