From e00d24b1835a318e8ff79596d6bf0a07ff225273 Mon Sep 17 00:00:00 2001 From: GNU Libc Maintainers Date: Sun, 2 Oct 2022 18:46:25 +0100 Subject: [PATCH] git-updates GIT update of https://sourceware.org/git/glibc.git/release/2.35/master from glibc-2.35 GIT update of https://sourceware.org/git/glibc.git/release/2.35/master from glibc-2.35 Gbp-Pq: Name git-updates.diff --- INSTALL | 6 + NEWS | 60 +- bits/socket.h | 40 +- catgets/open_catalog.c | 4 +- configure | 79 +- configure.ac | 55 +- csu/libc-start.c | 3 - csu/libc-tls.c | 11 +- debug/tst-fortify.c | 5 + dlfcn/Makefile | 4 + dlfcn/dladdr.c | 2 +- dlfcn/dladdr1.c | 2 +- dlfcn/dlclose.c | 2 +- dlfcn/dlerror.c | 2 +- dlfcn/dlfcn.h | 7 +- dlfcn/dlinfo.c | 15 +- dlfcn/dlmopen.c | 2 +- dlfcn/dlopen.c | 4 +- dlfcn/dlopenold.c | 2 +- dlfcn/dlsym.c | 2 +- dlfcn/dlvsym.c | 2 +- dlfcn/tst-dlinfo-phdr.c | 125 ++ elf/Makefile | 186 +- elf/dl-audit.c | 3 +- elf/dl-deps.c | 2 + elf/dl-early_allocate.c | 30 + elf/dl-find_object.c | 5 +- elf/dl-hwcaps.c | 8 +- elf/dl-libc.c | 8 +- elf/dl-map-segments.h | 3 + elf/dl-open.c | 13 +- elf/dl-sort-maps.c | 48 +- elf/dl-support.c | 85 +- elf/dl-sysdep.c | 352 +-- elf/dso-sort-tests-1.def | 7 + elf/enbl-secure.c | 10 - elf/libtracemod1-1.c | 1 + elf/libtracemod2-1.c | 1 + elf/libtracemod3-1.c | 1 + elf/libtracemod4-1.c | 1 + elf/libtracemod5-1.c | 1 + elf/rtld.c | 80 +- elf/tst-audit26.c | 35 + elf/tst-auditmod24a.c | 4 +- elf/tst-auditmod24d.c | 4 +- elf/tst-auditmod25.c | 2 +- elf/tst-auditmod26.c | 104 + elf/tst-dlmopen-twice-mod1.c | 37 + elf/tst-dlmopen-twice-mod2.c | 50 + elf/tst-dlmopen-twice.c | 34 + elf/tst-glibc-hwcaps-cache.script | 6 + elf/tst-glibcelf.py | 260 +++ elf/tst-relro-symbols.py | 137 ++ .../tst-tls-allocation-failure-static.c | 21 +- elf/tst-trace1.exp | 4 + elf/tst-trace2.exp | 6 + elf/tst-trace3.exp | 6 + elf/tst-trace4.exp | 6 + elf/tst-trace5.exp | 6 + iconv/gconv_parseconfdir.h | 13 +- include/arpa/nameser.h | 106 + include/bits/stdio2-decl.h | 1 + 
include/bits/wchar2-decl.h | 1 + include/libc-internal.h | 3 - include/register-atfork.h | 26 +- include/resolv.h | 3 + include/unistd.h | 1 - inet/ruserpass.c | 4 +- io/Makefile | 8 +- io/tst-lchmod-time64.c | 2 + io/tst-lchmod.c | 22 +- io/tst-stat.c | 4 + libio/Makefile | 2 +- libio/bits/stdio2-decl.h | 111 + libio/bits/stdio2.h | 62 - libio/stdio.h | 17 +- locale/programs/ld-monetary.c | 182 +- locale/programs/locarchive.c | 2 +- localedata/Makefile | 4 +- localedata/gen-locale.sh | 10 +- malloc/malloc.c | 15 +- misc/daemon.c | 5 +- misc/getusershell.c | 4 +- misc/sys/cdefs.h | 12 +- nptl/allocatestack.c | 2 - nptl/cancellation.c | 50 +- nptl/cleanup_defer.c | 42 +- nptl/descr.h | 41 +- nptl/libc-cleanup.c | 40 +- nptl/pthread_cancel.c | 111 +- nptl/pthread_join_common.c | 7 +- nptl/pthread_setcancelstate.c | 26 +- nptl/pthread_setcanceltype.c | 31 +- nptl/pthread_testcancel.c | 9 +- nptl/unwind.c | 2 +- nscd/connections.c | 3 +- nss/Makefile | 26 +- nss/XXX-lookup.c | 5 + nss/nss_database.c | 39 +- nss/nss_module.c | 12 +- nss/nss_test_errno.c | 58 + nss/tst-nss-test_errno.c | 61 + posix/fork.c | 7 +- posix/glob.c | 70 +- posix/register-atfork.c | 140 +- posix/tst-spawn6.c | 62 +- resolv/Makefile | 41 +- resolv/README | 3 - resolv/mapv4v6addr.h | 69 - resolv/mapv4v6hostent.h | 84 - resolv/ns_name_length_uncompressed.c | 72 + resolv/ns_rr_cursor_init.c | 62 + resolv/ns_rr_cursor_next.c | 74 + resolv/ns_samebinaryname.c | 55 + resolv/nss_dns/dns-host.c | 1155 ++++------ resolv/res-name-checking.c | 14 +- resolv/tst-ns_name_length_uncompressed.c | 135 ++ resolv/tst-ns_rr_cursor.c | 227 ++ resolv/tst-ns_samebinaryname.c | 62 + resolv/tst-resolv-aliases.c | 254 +++ resolv/tst-resolv-byaddr.c | 326 +++ resolv/tst-resolv-invalid-cname.c | 406 ++++ resolv/tst-resolv-maybe_insert_sig.h | 32 + scripts/dso-ordering-test.py | 13 +- scripts/glibcelf.py | 1141 ++++++++++ scripts/tst-elf-edit.py | 34 +- scripts/tst-ld-trace.py | 108 + shlib-versions | 5 - socket/Makefile | 1 
+ socket/sys/socket.h | 2 +- socket/tst-cmsghdr-skeleton.c | 92 + socket/tst-cmsghdr.c | 56 + stdlib/Makefile | 3 + stdlib/bits/stdlib.h | 14 +- stdlib/testmb.c | 7 + string/bits/string_fortified.h | 2 +- string/test-rawmemchr.c | 57 +- string/test-strncmp.c | 23 + sysdeps/generic/ldsodefs.h | 11 +- sysdeps/generic/libc-lock-arch.h | 25 + sysdeps/generic/startup.h | 24 - sysdeps/hppa/dl-fptr.c | 15 +- sysdeps/hppa/dl-lookupcfg.h | 9 +- sysdeps/hppa/dl-machine.h | 60 +- sysdeps/hppa/dl-runtime.c | 4 +- sysdeps/hppa/dl-runtime.h | 3 + sysdeps/i386/fpu/libm-test-ulps | 2 +- .../i386/i686/fpu/multiarch/libm-test-ulps | 2 +- sysdeps/m68k/dl-machine.h | 12 +- sysdeps/mach/hurd/bits/socket.h | 40 +- sysdeps/mach/hurd/dl-sysdep.c | 30 +- sysdeps/mach/hurd/i386/init-first.c | 4 - sysdeps/nios2/dl-machine.h | 50 +- sysdeps/nptl/dl-tls_init_tp.c | 3 - sysdeps/nptl/libc-lock.h | 8 +- sysdeps/nptl/libc-lockP.h | 3 +- sysdeps/nptl/pthreadP.h | 2 +- sysdeps/posix/fpathconf.c | 4 +- sysdeps/posix/isfdtype.c | 4 +- sysdeps/posix/posix_fallocate.c | 4 +- sysdeps/posix/posix_fallocate64.c | 4 +- sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 4 +- sysdeps/pthread/Makefile | 40 +- sysdeps/pthread/tst-atfork3.c | 118 + sysdeps/pthread/tst-atfork3mod.c | 44 + sysdeps/pthread/tst-atfork4.c | 128 ++ sysdeps/pthread/tst-atfork4mod.c | 48 + sysdeps/pthread/tst-cancel29.c | 207 ++ sysdeps/pthread/tst-cancel30.c | 82 + sysdeps/riscv/rv64/rvd/libm-test-ulps | 2 +- sysdeps/s390/dl-procinfo.c | 5 +- sysdeps/s390/dl-procinfo.h | 2 +- sysdeps/s390/s390-64/Makefile | 25 +- sysdeps/s390/s390-64/configure | 122 ++ sysdeps/s390/s390-64/configure.ac | 92 + sysdeps/s390/s390-64/dl-hwcap-check.h | 6 +- sysdeps/s390/s390-64/dl-hwcaps-subdirs.c | 11 +- sysdeps/s390/s390-64/start.S | 28 + sysdeps/s390/s390-64/tst-glibc-hwcaps.c | 8 +- sysdeps/unix/sysv/linux/Makefile | 1 + .../unix/sysv/linux/aarch64/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h | 2 + 
sysdeps/unix/sysv/linux/alpha/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/alpha/brk_call.h | 27 + sysdeps/unix/sysv/linux/alpha/dl-auxv.h | 18 +- sysdeps/unix/sysv/linux/arc/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/arm/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/bits/socket.h | 42 +- sysdeps/unix/sysv/linux/brk.c | 3 +- sysdeps/unix/sysv/linux/brk_call.h | 25 + sysdeps/unix/sysv/linux/cmsg_nxthdr.c | 36 +- .../unix/sysv/linux/convert_scm_timestamps.c | 4 +- sysdeps/unix/sysv/linux/csky/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/dl-early_allocate.c | 82 + sysdeps/unix/sysv/linux/dl-parse_auxv.h | 61 + sysdeps/unix/sysv/linux/dl-sysdep.c | 240 +- sysdeps/unix/sysv/linux/faccessat.c | 4 +- sysdeps/unix/sysv/linux/fchmodat.c | 4 +- sysdeps/unix/sysv/linux/getsysstats.c | 96 +- sysdeps/unix/sysv/linux/glob64-time64.c | 1 + sysdeps/unix/sysv/linux/hppa/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/hppa/getcontext.S | 53 +- sysdeps/unix/sysv/linux/hppa/setcontext.S | 9 +- sysdeps/unix/sysv/linux/hppa/swapcontext.S | 72 + sysdeps/unix/sysv/linux/hppa/swapcontext.c | 41 - sysdeps/unix/sysv/linux/i386/Makefile | 2 +- sysdeps/unix/sysv/linux/i386/arch-syscall.h | 1 + .../sysv/linux/i386/libc-do-syscall-int80.S | 25 + .../unix/sysv/linux/i386/libc-do-syscall.S | 3 - sysdeps/unix/sysv/linux/i386/startup.h | 47 +- sysdeps/unix/sysv/linux/i386/sysdep.h | 13 +- sysdeps/unix/sysv/linux/ia64/Makefile | 6 + sysdeps/unix/sysv/linux/ia64/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/ia64/brk.c | 5 +- sysdeps/unix/sysv/linux/ia64/startup.h | 22 + sysdeps/unix/sysv/linux/ia64/sysdep.h | 23 +- sysdeps/unix/sysv/linux/ldsodefs.h | 12 - sysdeps/unix/sysv/linux/m68k/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h | 25 + sysdeps/unix/sysv/linux/m68k/sysdep.h | 4 +- .../unix/sysv/linux/microblaze/arch-syscall.h | 1 + .../unix/sysv/linux/mips/bits/struct_stat.h | 38 +- .../sysv/linux/mips/mips32/arch-syscall.h | 1 + .../sysv/linux/mips/mips64/n32/arch-syscall.h | 1 + 
.../sysv/linux/mips/mips64/n64/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/mmap_call.h | 22 + sysdeps/unix/sysv/linux/mmap_internal.h | 6 +- sysdeps/unix/sysv/linux/mq_timedreceive.c | 2 +- sysdeps/unix/sysv/linux/nios2/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/or1k/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/pathconf.c | 4 +- sysdeps/unix/sysv/linux/powerpc/dl-auxv.h | 14 +- sysdeps/unix/sysv/linux/powerpc/dl-support.c | 4 + .../linux/powerpc/powerpc32/arch-syscall.h | 1 + .../linux/powerpc/powerpc64/arch-syscall.h | 1 + .../unix/sysv/linux/riscv/rv32/arch-syscall.h | 2 + .../unix/sysv/linux/riscv/rv64/arch-syscall.h | 2 + .../s390/{mmap_internal.h => mmap_call.h} | 14 +- .../sysv/linux/s390/s390-32/arch-syscall.h | 1 + .../sysv/linux/s390/s390-64/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/sh/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/sparc/brk.c | 58 - .../linux/{alpha/brk.c => sparc/brk_call.h} | 35 +- .../sysv/linux/sparc/sparc32/arch-syscall.h | 1 + .../sysv/linux/sparc/sparc64/arch-syscall.h | 1 + sysdeps/unix/sysv/linux/spawni.c | 2 +- sysdeps/unix/sysv/linux/startup.h | 39 + sysdeps/unix/sysv/linux/syscall-names.list | 5 +- sysdeps/unix/sysv/linux/tst-getauxval.c | 74 + sysdeps/unix/sysv/linux/tst-mman-consts.py | 2 +- .../sysv/linux/tst-socket-timestamp-compat.c | 25 +- .../unix/sysv/linux/x86_64/64/arch-syscall.h | 1 + .../unix/sysv/linux/x86_64/x32/arch-syscall.h | 1 + sysdeps/x86/Makefile | 7 +- sysdeps/x86/dl-cacheinfo.h | 32 +- sysdeps/x86/isa-level.c | 3 +- sysdeps/x86/sysdep.h | 3 +- sysdeps/x86/tst-strncmp-rtm.c | 58 +- sysdeps/x86/tst-wcsncmp-rtm.c | 21 + sysdeps/x86_64/bzero.S | 1 - sysdeps/x86_64/dl-machine.h | 6 +- sysdeps/x86_64/memcmp.S | 884 +++++--- sysdeps/x86_64/memcmpeq.S | 2 +- sysdeps/x86_64/memrchr.S | 613 +++--- sysdeps/x86_64/memset.S | 16 +- sysdeps/x86_64/multiarch/Makefile | 300 ++- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 35 + sysdeps/x86_64/multiarch/avx-vecs.h | 47 + sysdeps/x86_64/multiarch/bcopy.S | 7 - 
sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 + sysdeps/x86_64/multiarch/evex256-vecs.h | 35 + sysdeps/x86_64/multiarch/evex512-vecs.h | 35 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 86 +- sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 19 +- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +- sysdeps/x86_64/multiarch/memchr-evex.S | 46 +- sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 +- sysdeps/x86_64/multiarch/memcmp-sse2.S | 4 +- sysdeps/x86_64/multiarch/memcmp-sse4.S | 803 ------- sysdeps/x86_64/multiarch/memcmpeq-sse2.S | 6 +- sysdeps/x86_64/multiarch/memmove-erms.S | 72 + .../multiarch/memmove-vec-unaligned-erms.S | 79 +- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 +++-- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 +++-- .../multiarch/memset-avx2-unaligned-erms.S | 18 +- .../multiarch/memset-avx512-unaligned-erms.S | 18 +- sysdeps/x86_64/multiarch/memset-erms.S | 44 + .../multiarch/memset-evex-unaligned-erms.S | 18 +- .../multiarch/memset-sse2-unaligned-erms.S | 4 +- .../multiarch/memset-vec-unaligned-erms.S | 223 +- sysdeps/x86_64/multiarch/sse2-vecs.h | 47 + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 + ...strcasecmp_l-avx.S => strcasecmp_l-avx2.S} | 9 +- .../{strncase_l-avx.S => strcasecmp_l-evex.S} | 11 +- sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +- sysdeps/x86_64/multiarch/strchr-evex.S | 146 +- sysdeps/x86_64/multiarch/strcmp-avx2.S | 1762 +++++++++------ sysdeps/x86_64/multiarch/strcmp-evex.S | 1945 ++++++++++------- sysdeps/x86_64/multiarch/strcmp-sse42.S | 307 ++- sysdeps/x86_64/multiarch/strcmp.c | 5 + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +- .../{strspn-sse2.S => strcspn-sse2.c} | 8 +- sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 +++ sysdeps/x86_64/multiarch/strlen-evex512.S | 7 + sysdeps/x86_64/multiarch/strlen-vec.S | 6 +- .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 + 
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 + sysdeps/x86_64/multiarch/strncmp-sse4_2.S | 8 +- sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 + .../{strcspn-sse2.S => strpbrk-sse2.c} | 8 +- sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 ++-- sysdeps/x86_64/multiarch/strrchr-evex.S | 471 ++-- sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- sysdeps/x86_64/multiarch/strspn-c.c | 86 +- .../{strpbrk-sse2.S => strspn-sse2.c} | 9 +- sysdeps/x86_64/multiarch/strstr-avx512.c | 218 ++ sysdeps/x86_64/multiarch/strstr.c | 24 +- sysdeps/x86_64/multiarch/varshift.c | 5 +- sysdeps/x86_64/multiarch/varshift.h | 3 +- sysdeps/x86_64/multiarch/vec-macros.h | 90 + sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 + sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 1 + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +- sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +- sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 + sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 1 + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- sysdeps/x86_64/multiarch/wmemcmp-c.c | 9 - sysdeps/x86_64/multiarch/wmemcmp-sse2.S | 21 + sysdeps/x86_64/strcmp.S | 64 +- sysdeps/x86_64/strcspn.S | 119 - sysdeps/x86_64/strpbrk.S | 3 - sysdeps/x86_64/strrchr.S | 510 +++-- sysdeps/x86_64/strspn.S | 112 - sysdeps/x86_64/sysdep.h | 24 +- sysdeps/x86_64/wcslen.S | 86 +- sysdeps/x86_64/wcsrchr.S | 268 +-- sysdeps/x86_64/wmemcmp.S | 23 + wcsmbs/Makefile | 5 +- wcsmbs/bits/wchar2-decl.h | 124 ++ wcsmbs/bits/wchar2.h | 72 - wcsmbs/wchar.h | 11 +- 348 files changed, 15336 insertions(+), 8053 deletions(-) create mode 100644 dlfcn/tst-dlinfo-phdr.c create mode 100644 elf/dl-early_allocate.c create mode 100644 elf/libtracemod1-1.c create mode 100644 elf/libtracemod2-1.c create mode 100644 elf/libtracemod3-1.c create mode 100644 elf/libtracemod4-1.c create mode 100644 elf/libtracemod5-1.c create mode 100644 
elf/tst-audit26.c create mode 100644 elf/tst-auditmod26.c create mode 100644 elf/tst-dlmopen-twice-mod1.c create mode 100644 elf/tst-dlmopen-twice-mod2.c create mode 100644 elf/tst-dlmopen-twice.c create mode 100644 elf/tst-glibcelf.py create mode 100644 elf/tst-relro-symbols.py rename sysdeps/mach/hurd/enbl-secure.c => elf/tst-tls-allocation-failure-static.c (57%) create mode 100644 elf/tst-trace1.exp create mode 100644 elf/tst-trace2.exp create mode 100644 elf/tst-trace3.exp create mode 100644 elf/tst-trace4.exp create mode 100644 elf/tst-trace5.exp create mode 100644 include/bits/stdio2-decl.h create mode 100644 include/bits/wchar2-decl.h create mode 100644 io/tst-lchmod-time64.c create mode 100644 libio/bits/stdio2-decl.h create mode 100644 nss/nss_test_errno.c create mode 100644 nss/tst-nss-test_errno.c delete mode 100644 resolv/mapv4v6addr.h delete mode 100644 resolv/mapv4v6hostent.h create mode 100644 resolv/ns_name_length_uncompressed.c create mode 100644 resolv/ns_rr_cursor_init.c create mode 100644 resolv/ns_rr_cursor_next.c create mode 100644 resolv/ns_samebinaryname.c create mode 100644 resolv/tst-ns_name_length_uncompressed.c create mode 100644 resolv/tst-ns_rr_cursor.c create mode 100644 resolv/tst-ns_samebinaryname.c create mode 100644 resolv/tst-resolv-aliases.c create mode 100644 resolv/tst-resolv-byaddr.c create mode 100644 resolv/tst-resolv-invalid-cname.c create mode 100644 resolv/tst-resolv-maybe_insert_sig.h create mode 100644 scripts/glibcelf.py create mode 100755 scripts/tst-ld-trace.py create mode 100644 socket/tst-cmsghdr-skeleton.c create mode 100644 socket/tst-cmsghdr.c create mode 100644 sysdeps/generic/libc-lock-arch.h create mode 100644 sysdeps/pthread/tst-atfork3.c create mode 100644 sysdeps/pthread/tst-atfork3mod.c create mode 100644 sysdeps/pthread/tst-atfork4.c create mode 100644 sysdeps/pthread/tst-atfork4mod.c create mode 100644 sysdeps/pthread/tst-cancel29.c create mode 100644 sysdeps/pthread/tst-cancel30.c create mode 100644 
sysdeps/s390/s390-64/configure create mode 100644 sysdeps/s390/s390-64/configure.ac create mode 100644 sysdeps/unix/sysv/linux/alpha/brk_call.h create mode 100644 sysdeps/unix/sysv/linux/brk_call.h create mode 100644 sysdeps/unix/sysv/linux/dl-early_allocate.c create mode 100644 sysdeps/unix/sysv/linux/dl-parse_auxv.h create mode 100644 sysdeps/unix/sysv/linux/hppa/swapcontext.S delete mode 100644 sysdeps/unix/sysv/linux/hppa/swapcontext.c create mode 100644 sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S create mode 100644 sysdeps/unix/sysv/linux/ia64/startup.h create mode 100644 sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h create mode 100644 sysdeps/unix/sysv/linux/mmap_call.h create mode 100644 sysdeps/unix/sysv/linux/powerpc/dl-support.c rename sysdeps/unix/sysv/linux/s390/{mmap_internal.h => mmap_call.h} (78%) delete mode 100644 sysdeps/unix/sysv/linux/sparc/brk.c rename sysdeps/unix/sysv/linux/{alpha/brk.c => sparc/brk_call.h} (61%) create mode 100644 sysdeps/unix/sysv/linux/startup.h create mode 100644 sysdeps/unix/sysv/linux/tst-getauxval.c create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c delete mode 100644 sysdeps/x86_64/bzero.S create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S create mode 100644 sysdeps/x86_64/multiarch/memmove-erms.S create mode 100644 sysdeps/x86_64/multiarch/memset-erms.S create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S rename sysdeps/x86_64/multiarch/{strcasecmp_l-avx.S => strcasecmp_l-avx2.S} (87%) rename sysdeps/x86_64/multiarch/{strncase_l-avx.S => strcasecmp_l-evex.S} (83%) rename 
sysdeps/x86_64/multiarch/{strspn-sse2.S => strcspn-sse2.c} (86%) create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strpbrk-sse2.c} (85%) rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strspn-sse2.c} (84%) create mode 100644 sysdeps/x86_64/multiarch/strstr-avx512.c create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S delete mode 100644 sysdeps/x86_64/strcspn.S delete mode 100644 sysdeps/x86_64/strpbrk.S delete mode 100644 sysdeps/x86_64/strspn.S create mode 100644 sysdeps/x86_64/wmemcmp.S create mode 100644 wcsmbs/bits/wchar2-decl.h diff --git a/INSTALL b/INSTALL index 63c022d6b..237a2f948 100644 --- a/INSTALL +++ b/INSTALL @@ -90,6 +90,12 @@ if 'CFLAGS' is specified it must enable optimization. For example: library will still be usable, but functionality may be lost--for example, you can't build a shared libc with old binutils. +'--with-default-link' + With '--with-default-link', the build system does not use a custom + linker script for linking shared objects. The default is + '--without-default-link', because the custom linker script is + needed for full RELRO protection. 
+ '--with-nonshared-cflags=CFLAGS' Use additional compiler flags CFLAGS to build the parts of the library which are always statically linked into applications and diff --git a/NEWS b/NEWS index faa7ec187..ff8201b0e 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,58 @@ See the end for copying conditions. Please send GNU C library bug reports via using `glibc' in the "product" field. + +Version 2.35.1 + +The following bugs are resolved with this release: + + [12154] Do not fail DNS resolution for CNAMEs which are not host names + [25812] Libio vtable protection is sometimes only partially enforced + [28838] FAIL: elf/tst-p_align3 + [28846] CMSG_NXTHDR may trigger -Wstrict-overflow warning + [28850] linux: __get_nprocs_sched reads uninitialized memory from the stack + [28853] libc: tst-spawn6 changes current foreground process group + (breaks test isolation) + [28857] libc: FAIL: elf/tst-audit24a + [28860] build: --enable-kernel=5.1.0 build fails because of missing + __convert_scm_timestamps + [28865] linux: _SC_NPROCESSORS_CONF and _SC_NPROCESSORS_ONLN are inaccurate + without /sys and /proc + [28868] dynamic-link: Dynamic loader DFS algorithm segfaults on + missing libraries + [28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm + variants when avoiding overflow + [28937] New DSO dependency sorter does not put new map first if in a cycle + [28953] nss: Protect against errno changes in function lookup + [29029] nptl: poll() spuriously returns EINTR during thread + cancellation and with cancellation disabled + [29062] elf: Fix memory leak in _dl_find_object_update + [29078] functions unusable during early auditing + [29097] time: fchmodat does not handle 64 bit time_t for + AT_SYMLINK_NOFOLLOW + [29109] libc: posix_spawn() always returns 1 (EPERM) on clone() + failure + [29165] libc: [Regression] broken argv adjustment + [29187] dynamic-link: [regression] broken argv adjustment for nios2 + [29203] libc: daemon is not y2038 aware + [29204] libc: getusershell is 
not 2038 aware + [29207] libc: posix_fallocate fallback implementation is not y2038 + [29208] libc: fpathconf(_PC_ASYNC_IO) is not y2038 aware + [29209] libc: isfdtype is not y2038 aware + [29210] network: ruserpass is not y2038 aware + [29211] libc: __open_catalog is not y2038 aware + [29213] libc: gconv_parseconfdir is not y2038 aware + [29214] nptl: pthread_setcanceltype fails to set type + [29225] network: Mistyped define statement in socket/sys/socket.h in + line 184 + [29305] Conserve NSS buffer space during DNS packet parsing + [29415] nscd: Fix netlink cache invalidation if epoll is used + [29446] _dlopen now ignores dl_caller argument in static mode + [29490] alpha: New __brk_call implementation is broken + [29528] elf: Call __libc_early_init for reused namespaces + [29537] libc: [2.34 regression]: Alignment issue on m68k when using + [29583] Use 64-bit interfaces in gconv_parseconfdir + Version 2.35 @@ -139,6 +191,9 @@ Major new features: fortification balanced against additional runtime cost (checking non-constant bounds). +* The audit libraries will avoid unnecessary slowdown if it is not required + PLT tracking (by not implementing the la_pltenter or la_pltexit callbacks). + Deprecated and removed features, and other changes affecting compatibility: * On x86-64, the LD_PREFER_MAP_32BIT_EXEC environment variable support @@ -335,6 +390,8 @@ The following bugs are resolved with this release: [28837] libc: FAIL: socket/tst-socket-timestamp-compat [28847] locale: Empty mon_decimal_point in LC_MONETARY results in non- empty mon_decimal_point_wc + [29069] libc: fstatat64_time64_statx wrapper broken on MIPS N32 with + -D_FILE_OFFSET_BITS=64 and -D_TIME_BITS=64 Version 2.34 @@ -431,9 +488,6 @@ Major new features: execute programs that do not have any dynamic dependency (that is, they are statically linked). This feature is Linux-specific. 
-* The audit libraries will avoid unnecessary slowdown if it is not required - PLT tracking (by not implementing the la_pltenter or la_pltexit callbacks). - Deprecated and removed features, and other changes affecting compatibility: * The function pthread_mutex_consistent_np has been deprecated; programs diff --git a/bits/socket.h b/bits/socket.h index 2b99dea33..aac8c49b0 100644 --- a/bits/socket.h +++ b/bits/socket.h @@ -245,6 +245,12 @@ struct cmsghdr + CMSG_ALIGN (sizeof (struct cmsghdr))) #define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len)) +/* Given a length, return the additional padding necessary such that + len + __CMSG_PADDING(len) == CMSG_ALIGN (len). */ +#define __CMSG_PADDING(len) ((sizeof (size_t) \ + - ((len) & (sizeof (size_t) - 1))) \ + & (sizeof (size_t) - 1)) + extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg) __THROW; #ifdef __USE_EXTERN_INLINES @@ -254,18 +260,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, _EXTERN_INLINE struct cmsghdr * __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg)) { + /* We may safely assume that __cmsg lies between __mhdr->msg_control and + __mhdr->msg_controllen because the user is required to obtain the first + cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs + via CMSG_NXTHDR, setting lengths along the way. However, we don't yet + trust the value of __cmsg->cmsg_len and therefore do not use it in any + pointer arithmetic until we check its value. */ + + unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control; + unsigned char * __cmsg_ptr = (unsigned char *) __cmsg; + + size_t __size_needed = sizeof (struct cmsghdr) + + __CMSG_PADDING (__cmsg->cmsg_len); + + /* The current header is malformed, too small to be a full header. */ if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr)) - /* The kernel header does this so there may be a reason. 
*/ return (struct cmsghdr *) 0; + /* There isn't enough space between __cmsg and the end of the buffer to + hold the current cmsg *and* the next one. */ + if (((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr) + < __size_needed) + || ((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr + - __size_needed) + < __cmsg->cmsg_len)) + + return (struct cmsghdr *) 0; + + /* Now, we trust cmsg_len and can use it to find the next header. */ __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)); - if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control - + __mhdr->msg_controllen) - || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len) - > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen))) - /* No more entries. */ - return (struct cmsghdr *) 0; return __cmsg; } #endif /* Use `extern inline'. */ diff --git a/catgets/open_catalog.c b/catgets/open_catalog.c index 48c2a4b98..cb1d123cd 100644 --- a/catgets/open_catalog.c +++ b/catgets/open_catalog.c @@ -39,7 +39,7 @@ __open_catalog (const char *cat_name, const char *nlspath, const char *env_var, __nl_catd catalog) { int fd = -1; - struct stat64 st; + struct __stat64_t64 st; int swapping; size_t cnt; size_t max_offset; @@ -193,7 +193,7 @@ __open_catalog (const char *cat_name, const char *nlspath, const char *env_var, return -1; } - if (__builtin_expect (__fstat64 (fd, &st), 0) < 0) + if (__glibc_unlikely (__fstat64_time64 (fd, &st) < 0)) goto close_unlock_return; if (__builtin_expect (!S_ISREG (st.st_mode), 0) diff --git a/configure b/configure index 00dc63838..443cf6ef9 100755 --- a/configure +++ b/configure @@ -730,7 +730,6 @@ infodir docdir oldincludedir includedir -runstatedir localstatedir sharedstatedir sysconfdir @@ -845,7 +844,6 @@ datadir='${datarootdir}' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' -runstatedir='${localstatedir}/run' includedir='${prefix}/include' 
oldincludedir='/usr/include' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' @@ -1098,15 +1096,6 @@ do | -silent | --silent | --silen | --sile | --sil) silent=yes ;; - -runstatedir | --runstatedir | --runstatedi | --runstated \ - | --runstate | --runstat | --runsta | --runst | --runs \ - | --run | --ru | --r) - ac_prev=runstatedir ;; - -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \ - | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \ - | --run=* | --ru=* | --r=*) - runstatedir=$ac_optarg ;; - -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ @@ -1244,7 +1233,7 @@ fi for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ datadir sysconfdir sharedstatedir localstatedir includedir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir runstatedir + libdir localedir mandir do eval ac_val=\$$ac_var # Remove trailing slashes. @@ -1397,7 +1386,6 @@ Fine tuning of the installation directories: --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] - --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] @@ -3388,7 +3376,7 @@ fi if test "${with_default_link+set}" = set; then : withval=$with_default_link; use_default_link=$withval else - use_default_link=default + use_default_link=no fi @@ -6235,69 +6223,6 @@ fi $as_echo "$libc_cv_hashstyle" >&6; } -# The linker's default -shared behavior is good enough if it -# does these things that our custom linker scripts ensure that -# all allocated NOTE sections come first. 
-if test "$use_default_link" = default; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sufficient default -shared layout" >&5 -$as_echo_n "checking for sufficient default -shared layout... " >&6; } -if ${libc_cv_use_default_link+:} false; then : - $as_echo_n "(cached) " >&6 -else - libc_cv_use_default_link=no - cat > conftest.s <<\EOF - .section .note.a,"a",%note - .balign 4 - .long 4,4,9 - .string "GNU" - .string "foo" - .section .note.b,"a",%note - .balign 4 - .long 4,4,9 - .string "GNU" - .string "bar" -EOF - if { ac_try=' ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&5' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } && - ac_try=`$READELF -S conftest.so | sed -n \ - '${x;p;} - s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ - t a - b - : a - H'` - then - libc_seen_a=no libc_seen_b=no - set -- $ac_try - while test $# -ge 2 -a "$1" = NOTE; do - case "$2" in - .note.a) libc_seen_a=yes ;; - .note.b) libc_seen_b=yes ;; - esac - shift 2 - done - case "$libc_seen_a$libc_seen_b" in - yesyes) - libc_cv_use_default_link=yes - ;; - *) - echo >&5 "\ -$libc_seen_a$libc_seen_b from: -$ac_try" - ;; - esac - fi - rm -f conftest* -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_use_default_link" >&5 -$as_echo "$libc_cv_use_default_link" >&6; } - use_default_link=$libc_cv_use_default_link -fi - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for GLOB_DAT reloc" >&5 $as_echo_n "checking for GLOB_DAT reloc... 
" >&6; } if ${libc_cv_has_glob_dat+:} false; then : diff --git a/configure.ac b/configure.ac index 87f67d25e..228261a49 100644 --- a/configure.ac +++ b/configure.ac @@ -153,7 +153,7 @@ AC_ARG_WITH([default-link], AS_HELP_STRING([--with-default-link], [do not use explicit linker scripts]), [use_default_link=$withval], - [use_default_link=default]) + [use_default_link=no]) dnl Additional build flags injection. AC_ARG_WITH([nonshared-cflags], @@ -1402,59 +1402,6 @@ fi rm -f conftest*]) AC_SUBST(libc_cv_hashstyle) -# The linker's default -shared behavior is good enough if it -# does these things that our custom linker scripts ensure that -# all allocated NOTE sections come first. -if test "$use_default_link" = default; then - AC_CACHE_CHECK([for sufficient default -shared layout], - libc_cv_use_default_link, [dnl - libc_cv_use_default_link=no - cat > conftest.s <<\EOF - .section .note.a,"a",%note - .balign 4 - .long 4,4,9 - .string "GNU" - .string "foo" - .section .note.b,"a",%note - .balign 4 - .long 4,4,9 - .string "GNU" - .string "bar" -EOF - if AC_TRY_COMMAND([dnl - ${CC-cc} $ASFLAGS -shared -o conftest.so conftest.s 1>&AS_MESSAGE_LOG_FD]) && - ac_try=`$READELF -S conftest.so | sed -n \ - ['${x;p;} - s/^ *\[ *[1-9][0-9]*\] *\([^ ][^ ]*\) *\([^ ][^ ]*\) .*$/\2 \1/ - t a - b - : a - H']` - then - libc_seen_a=no libc_seen_b=no - set -- $ac_try - while test $# -ge 2 -a "$1" = NOTE; do - case "$2" in - .note.a) libc_seen_a=yes ;; - .note.b) libc_seen_b=yes ;; - esac - shift 2 - done - case "$libc_seen_a$libc_seen_b" in - yesyes) - libc_cv_use_default_link=yes - ;; - *) - echo >&AS_MESSAGE_LOG_FD "\ -$libc_seen_a$libc_seen_b from: -$ac_try" - ;; - esac - fi - rm -f conftest*]) - use_default_link=$libc_cv_use_default_link -fi - AC_CACHE_CHECK(for GLOB_DAT reloc, libc_cv_has_glob_dat, [dnl cat > conftest.c <dladdr (address, info); #endif return _dl_addr (address, info, NULL, NULL); diff --git a/dlfcn/dladdr1.c b/dlfcn/dladdr1.c index 5dadfd122..e0c9526c9 100644 --- 
a/dlfcn/dladdr1.c +++ b/dlfcn/dladdr1.c @@ -24,7 +24,7 @@ int __dladdr1 (const void *address, Dl_info *info, void **extra, int flags) { #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dladdr1 (address, info, extra, flags); #endif diff --git a/dlfcn/dlclose.c b/dlfcn/dlclose.c index a9921c316..aab88c47f 100644 --- a/dlfcn/dlclose.c +++ b/dlfcn/dlclose.c @@ -24,7 +24,7 @@ int __dlclose (void *handle) { #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlclose (handle); #endif diff --git a/dlfcn/dlerror.c b/dlfcn/dlerror.c index 3bf6049e3..b899d252a 100644 --- a/dlfcn/dlerror.c +++ b/dlfcn/dlerror.c @@ -32,7 +32,7 @@ char * __dlerror (void) { # ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlerror (); # endif diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h index b5cd5c523..a3af6051d 100644 --- a/dlfcn/dlfcn.h +++ b/dlfcn/dlfcn.h @@ -164,7 +164,12 @@ enum segment, or if the calling thread has not allocated a block for it. */ RTLD_DI_TLS_DATA = 10, - RTLD_DI_MAX = 10 + /* Treat ARG as const ElfW(Phdr) **, and store the address of the + program header array at that location. The dlinfo call returns + the number of program headers in the array. */ + RTLD_DI_PHDR = 11, + + RTLD_DI_MAX = 11 }; diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c index fc63c0268..0fbe670d6 100644 --- a/dlfcn/dlinfo.c +++ b/dlfcn/dlinfo.c @@ -28,6 +28,10 @@ struct dlinfo_args void *handle; int request; void *arg; + + /* This is the value that is returned from dlinfo if no error is + signaled. 
*/ + int result; }; static void @@ -40,6 +44,7 @@ dlinfo_doit (void *argsblock) { case RTLD_DI_CONFIGADDR: default: + args->result = -1; _dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request")); break; @@ -75,6 +80,11 @@ dlinfo_doit (void *argsblock) *(void **) args->arg = data; break; } + + case RTLD_DI_PHDR: + *(const ElfW(Phdr) **) args->arg = l->l_phdr; + args->result = l->l_phnum; + break; } } @@ -82,14 +92,15 @@ static int dlinfo_implementation (void *handle, int request, void *arg) { struct dlinfo_args args = { handle, request, arg }; - return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0; + _dlerror_run (&dlinfo_doit, &args); + return args.result; } #ifdef SHARED int ___dlinfo (void *handle, int request, void *arg) { - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlinfo (handle, request, arg); else return dlinfo_implementation (handle, request, arg); diff --git a/dlfcn/dlmopen.c b/dlfcn/dlmopen.c index 2437f5ce2..b41778f16 100644 --- a/dlfcn/dlmopen.c +++ b/dlfcn/dlmopen.c @@ -80,7 +80,7 @@ dlmopen_implementation (Lmid_t nsid, const char *file, int mode, void * ___dlmopen (Lmid_t nsid, const char *file, int mode) { - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlmopen (nsid, file, mode, RETURN_ADDRESS (0)); else return dlmopen_implementation (nsid, file, mode, RETURN_ADDRESS (0)); diff --git a/dlfcn/dlopen.c b/dlfcn/dlopen.c index 846ca3833..9b07b4e13 100644 --- a/dlfcn/dlopen.c +++ b/dlfcn/dlopen.c @@ -75,7 +75,7 @@ dlopen_implementation (const char *file, int mode, void *dl_caller) void * ___dlopen (const char *file, int mode) { - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); else return dlopen_implementation (file, mode, RETURN_ADDRESS (0)); @@ -90,7 +90,7 @@ compat_symbol (libdl, ___dlopen, dlopen, GLIBC_2_1); void * __dlopen (const char *file, int mode, void *dl_caller) { - return 
dlopen_implementation (file, mode, RETURN_ADDRESS (0)); + return dlopen_implementation (file, mode, dl_caller); } void * diff --git a/dlfcn/dlopenold.c b/dlfcn/dlopenold.c index 67601434d..5c21a0049 100644 --- a/dlfcn/dlopenold.c +++ b/dlfcn/dlopenold.c @@ -70,7 +70,7 @@ __dlopen_nocheck (const char *file, int mode) mode |= RTLD_LAZY; args.mode = mode; - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlopen (file, mode, RETURN_ADDRESS (0)); return _dlerror_run (dlopen_doit, &args) ? NULL : args.new; diff --git a/dlfcn/dlsym.c b/dlfcn/dlsym.c index a71f8ae24..2e9ff98e7 100644 --- a/dlfcn/dlsym.c +++ b/dlfcn/dlsym.c @@ -62,7 +62,7 @@ dlsym_implementation (void *handle, const char *name, void *dl_caller) void * ___dlsym (void *handle, const char *name) { - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlsym (handle, name, RETURN_ADDRESS (0)); else return dlsym_implementation (handle, name, RETURN_ADDRESS (0)); diff --git a/dlfcn/dlvsym.c b/dlfcn/dlvsym.c index 72219d6da..caa46ba1e 100644 --- a/dlfcn/dlvsym.c +++ b/dlfcn/dlvsym.c @@ -65,7 +65,7 @@ dlvsym_implementation (void *handle, const char *name, const char *version, void * ___dlvsym (void *handle, const char *name, const char *version) { - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->dlvsym (handle, name, version, RETURN_ADDRESS (0)); else diff --git a/dlfcn/tst-dlinfo-phdr.c b/dlfcn/tst-dlinfo-phdr.c new file mode 100644 index 000000000..a15a7d48e --- /dev/null +++ b/dlfcn/tst-dlinfo-phdr.c @@ -0,0 +1,125 @@ +/* Test for dlinfo (RTLD_DI_PHDR). + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Used to verify that the program header array appears as expected + among the dl_iterate_phdr callback invocations. */ + +struct dlip_callback_args +{ + struct link_map *l; /* l->l_addr is used to find the object. */ + const ElfW(Phdr) *phdr; /* Expected program header pointed. */ + int phnum; /* Expected program header count. */ + bool found; /* True if l->l_addr has been found. */ +}; + +static int +dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure) +{ + TEST_COMPARE (sizeof (*dlpi), size); + struct dlip_callback_args *args = closure; + + if (dlpi->dlpi_addr == args->l->l_addr) + { + TEST_VERIFY (!args->found); + args->found = true; + TEST_VERIFY (args->phdr == dlpi->dlpi_phdr); + TEST_COMPARE (args->phnum, dlpi->dlpi_phnum); + } + + return 0; +} + +static int +do_test (void) +{ + /* Avoid a copy relocation. */ + struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug"); + struct link_map *l = (struct link_map *) debug->r_map; + TEST_VERIFY_EXIT (l != NULL); + + do + { + printf ("info: checking link map %p (%p) for \"%s\"\n", + l, l->l_phdr, l->l_name); + + /* Cause dlerror () to return an error message. */ + dlsym (RTLD_DEFAULT, "does-not-exist"); + + /* Use the extension that link maps are valid dlopen handles. */ + const ElfW(Phdr) *phdr; + int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr); + TEST_VERIFY (phnum >= 0); + /* Verify that the error message has been cleared. 
*/ + TEST_COMPARE_STRING (dlerror (), NULL); + + TEST_VERIFY (phdr == l->l_phdr); + TEST_COMPARE (phnum, l->l_phnum); + + /* Check that we can find PT_DYNAMIC among the array. */ + { + bool dynamic_found = false; + for (int i = 0; i < phnum; ++i) + if (phdr[i].p_type == PT_DYNAMIC) + { + dynamic_found = true; + TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr); + } + TEST_VERIFY (dynamic_found); + } + + /* Check that dl_iterate_phdr finds the link map with the same + program headers. */ + { + struct dlip_callback_args args = + { + .l = l, + .phdr = phdr, + .phnum = phnum, + .found = false, + }; + TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0); + TEST_VERIFY (args.found); + } + + if (l->l_prev == NULL) + { + /* This is the executable, so the information is also + available via getauxval. */ + TEST_COMPARE_STRING (l->l_name, ""); + TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR)); + TEST_COMPARE (phnum, getauxval (AT_PHNUM)); + } + + l = l->l_next; + } + while (l != NULL); + + return 0; +} + +#include diff --git a/elf/Makefile b/elf/Makefile index 5bdf0a383..bf2550472 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -33,6 +33,7 @@ routines = \ $(all-dl-routines) \ dl-addr \ dl-addr-obj \ + dl-early_allocate \ dl-error \ dl-iteratephdr \ dl-libc \ @@ -108,6 +109,7 @@ all-dl-routines = $(dl-routines) $(sysdep-dl-routines) # But they are absent from the shared libc, because that code is in ld.so. elide-routines.os = \ $(all-dl-routines) \ + dl-early_allocate \ dl-exception \ dl-origin \ dl-reloc-static-pie \ @@ -161,6 +163,11 @@ ifeq (yes,$(have-loop-to-function)) CFLAGS-rtld.c += -fno-tree-loop-distribute-patterns endif +ifeq (yes,$(have-loop-to-function)) +# Likewise, during static library startup, memset is not yet available. +CFLAGS-dl-support.c = -fno-tree-loop-distribute-patterns +endif + # Compile rtld itself without stack protection. 
# Also compile all routines in the static library that are elided from # the shared libc because they are in libc.a in the same way. @@ -272,6 +279,7 @@ tests-static-normal := \ tst-linkall-static \ tst-single_threaded-pthread-static \ tst-single_threaded-static \ + tst-tls-allocation-failure-static \ tst-tlsalign-extern-static \ tst-tlsalign-static \ # tests-static-normal @@ -366,6 +374,8 @@ tests += \ tst-align \ tst-align2 \ tst-align3 \ + tst-audit-tlsdesc \ + tst-audit-tlsdesc-dlopen \ tst-audit1 \ tst-audit2 \ tst-audit8 \ @@ -386,6 +396,7 @@ tests += \ tst-audit24d \ tst-audit25a \ tst-audit25b \ + tst-audit26 \ tst-auditmany \ tst-auxobj \ tst-auxobj-dlopen \ @@ -399,6 +410,7 @@ tests += \ tst-dlmopen4 \ tst-dlmopen-dlerror \ tst-dlmopen-gethostbyname \ + tst-dlmopen-twice \ tst-dlopenfail \ tst-dlopenfail-2 \ tst-dlopenrpath \ @@ -539,6 +551,39 @@ endif endif endif +tests-special += $(objpfx)tst-relro-ldso.out $(objpfx)tst-relro-libc.out +$(objpfx)tst-relro-ldso.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \ + $(objpfx)ld.so + $(PYTHON) tst-relro-symbols.py $(objpfx)ld.so \ + --required=_rtld_global_ro \ + > $@ 2>&1; $(evaluate-test) +# The optional symbols are present in libc only if the architecture has +# the GLIBC_2.0 symbol set in libc. 
+$(objpfx)tst-relro-libc.out: tst-relro-symbols.py $(..)/scripts/glibcelf.py \ + $(common-objpfx)libc.so + $(PYTHON) tst-relro-symbols.py $(common-objpfx)libc.so \ + --required=_IO_cookie_jumps \ + --required=_IO_file_jumps \ + --required=_IO_file_jumps_maybe_mmap \ + --required=_IO_file_jumps_mmap \ + --required=_IO_helper_jumps \ + --required=_IO_mem_jumps \ + --required=_IO_obstack_jumps \ + --required=_IO_proc_jumps \ + --required=_IO_str_chk_jumps \ + --required=_IO_str_jumps \ + --required=_IO_strn_jumps \ + --required=_IO_wfile_jumps \ + --required=_IO_wfile_jumps_maybe_mmap \ + --required=_IO_wfile_jumps_mmap \ + --required=_IO_wmem_jumps \ + --required=_IO_wstr_jumps \ + --required=_IO_wstrn_jumps \ + --optional=_IO_old_cookie_jumps \ + --optional=_IO_old_file_jumps \ + --optional=_IO_old_proc_jumps \ + > $@ 2>&1; $(evaluate-test) + ifeq ($(run-built-tests),yes) tests-special += $(objpfx)tst-valgrind-smoke.out endif @@ -613,6 +658,16 @@ modules-names = \ libmarkermod4-2 \ libmarkermod4-3 \ libmarkermod4-4 \ + libmarkermod5-1 \ + libmarkermod5-2 \ + libmarkermod5-3 \ + libmarkermod5-4 \ + libmarkermod5-5 \ + libtracemod1-1 \ + libtracemod2-1 \ + libtracemod3-1 \ + libtracemod4-1 \ + libtracemod5-1 \ ltglobmod1 \ ltglobmod2 \ neededobj1 \ @@ -674,6 +729,8 @@ modules-names = \ tst-alignmod3 \ tst-array2dep \ tst-array5dep \ + tst-audit-tlsdesc-mod1 \ + tst-audit-tlsdesc-mod2 \ tst-audit11mod1 \ tst-audit11mod2 \ tst-audit12mod1 \ @@ -707,6 +764,7 @@ modules-names = \ tst-auditmanymod7 \ tst-auditmanymod8 \ tst-auditmanymod9 \ + tst-auditmod-tlsdesc \ tst-auditmod1 \ tst-auditmod9a \ tst-auditmod9b \ @@ -725,6 +783,7 @@ modules-names = \ tst-auditmod24c \ tst-auditmod24d \ tst-auditmod25 \ + tst-auditmod26 \ tst-auxvalmod \ tst-big-note-lib \ tst-deep1mod1 \ @@ -742,6 +801,8 @@ modules-names = \ tst-dlmopen1mod \ tst-dlmopen-dlerror-mod \ tst-dlmopen-gethostbyname-mod \ + tst-dlmopen-twice-mod1 \ + tst-dlmopen-twice-mod2 \ tst-dlopenfaillinkmod \ 
tst-dlopenfailmod1 \ tst-dlopenfailmod2 \ @@ -895,23 +956,8 @@ modules-names += tst-gnu2-tls1mod $(objpfx)tst-gnu2-tls1: $(objpfx)tst-gnu2-tls1mod.so tst-gnu2-tls1mod.so-no-z-defs = yes CFLAGS-tst-gnu2-tls1mod.c += -mtls-dialect=gnu2 +endif # $(have-mtls-dialect-gnu2) -tests += tst-audit-tlsdesc tst-audit-tlsdesc-dlopen -modules-names += tst-audit-tlsdesc-mod1 tst-audit-tlsdesc-mod2 tst-auditmod-tlsdesc -$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \ - $(objpfx)tst-audit-tlsdesc-mod2.so \ - $(shared-thread-library) -CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2 -CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2 -$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library) -$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \ - $(objpfx)tst-audit-tlsdesc-mod2.so -$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so -$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so -tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so -$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so -tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so -endif ifeq (yes,$(have-protected-data)) modules-names += tst-protected1moda tst-protected1modb tests += tst-protected1a tst-protected1b @@ -943,7 +989,7 @@ extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) # filtmod1.so, tst-big-note-lib.so, tst-ro-dynamic-mod.so have special # rules. 
modules-names-nobuild := filtmod1 tst-big-note-lib tst-ro-dynamic-mod \ - tst-audit24bmod1 tst-audit24bmod2.so + tst-audit24bmod1 tst-audit24bmod2 tests += $(tests-static) @@ -1072,6 +1118,11 @@ tests-special += \ $(objpfx)tst-initorder2-cmp.out \ $(objpfx)tst-unused-dep-cmp.out \ $(objpfx)tst-unused-dep.out \ + $(objpfx)tst-trace1.out \ + $(objpfx)tst-trace2.out \ + $(objpfx)tst-trace3.out \ + $(objpfx)tst-trace4.out \ + $(objpfx)tst-trace5.out \ # tests-special endif @@ -1111,6 +1162,17 @@ CFLAGS-tst-prelink.c += -fno-pie tst-prelink-no-pie = yes endif +tests-special += $(objpfx)tst-glibcelf.out +$(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \ + $(..)/scripts/glibcextract.py + PYTHONPATH=$(..)scripts $(PYTHON) tst-glibcelf.py \ + --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \ + < /dev/null > $@ 2>&1; $(evaluate-test) + +ifeq ($(run-built-tests),yes) +tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out +endif + # The test requires shared _and_ PIE because the executable # unit test driver must be able to link with the shared object # that is going to eventually go into an installed DSO. 
@@ -1234,8 +1296,7 @@ $(objpfx)ld.so: $(objpfx)librtld.os $(ld-map) $(LINK.o) -nostdlib -nostartfiles -shared -o $@.new \ $(LDFLAGS-rtld) -Wl,-z,defs $(z-now-$(bind-now)) \ $(filter-out $(map-file),$^) $(load-map-file) \ - -Wl,-soname=$(rtld-installed-name) \ - -Wl,-defsym=_begin=0 + -Wl,-soname=$(rtld-installed-name) $(call after-link,$@.new) $(READELF) -s $@.new \ | $(AWK) '($$7 ~ /^UND(|EF)$$/ && $$1 != "0:" && $$4 != "REGISTER") { print; p=1 } END { exit p != 0 }' @@ -2210,7 +2271,7 @@ $(objpfx)tst-audit24c.out: $(objpfx)tst-auditmod24c.so $(objpfx)tst-audit24c: $(objpfx)tst-audit24amod1.so \ $(objpfx)tst-audit24amod2.so tst-audit24c-ENV = LD_BIND_NOW=1 LD_AUDIT=$(objpfx)tst-auditmod24c.so -LDFLAGS-tst-audit24b = -Wl,-z,lazy +LDFLAGS-tst-audit24c = -Wl,-z,lazy $(objpfx)tst-audit24d.out: $(objpfx)tst-auditmod24d.so $(objpfx)tst-audit24d: $(objpfx)tst-audit24dmod1.so \ @@ -2242,6 +2303,10 @@ $(objpfx)tst-audit25b: $(objpfx)tst-audit25mod1.so \ LDFLAGS-tst-audit25b = -Wl,-z,now tst-audit25b-ARGS = -- $(host-test-program-cmd) +$(objpfx)tst-audit26.out: $(objpfx)tst-auditmod26.so +$(objpfx)tst-auditmod26.so: $(libsupport) +tst-audit26-ENV = LD_AUDIT=$(objpfx)tst-auditmod26.so + # tst-sonamemove links against an older implementation of the library. 
LDFLAGS-tst-sonamemove-linkmod1.so = \ -Wl,--version-script=tst-sonamemove-linkmod1.map \ @@ -2509,6 +2574,7 @@ LDFLAGS-libmarkermod1-1.so += -Wl,-soname,libmarkermod1.so LDFLAGS-libmarkermod2-1.so += -Wl,-soname,libmarkermod2.so LDFLAGS-libmarkermod3-1.so += -Wl,-soname,libmarkermod3.so LDFLAGS-libmarkermod4-1.so += -Wl,-soname,libmarkermod4.so +LDFLAGS-libmarkermod5-1.so += -Wl,-soname,libmarkermod5.so $(objpfx)libmarkermod%.os : markermodMARKER-VALUE.c $(compile-command.c) \ -DMARKER=marker$(firstword $(subst -, ,$*)) \ @@ -2521,6 +2587,8 @@ $(objpfx)libmarkermod3.so: $(objpfx)libmarkermod3-1.so cp $< $@ $(objpfx)libmarkermod4.so: $(objpfx)libmarkermod4-1.so cp $< $@ +$(objpfx)libmarkermod5.so: $(objpfx)libmarkermod5-1.so + cp $< $@ # tst-glibc-hwcaps-prepend checks that --glibc-hwcaps-prepend is # preferred over auto-detected subdirectories. @@ -2733,3 +2801,81 @@ $(objpfx)tst-p_align3: $(objpfx)tst-p_alignmod3.so $(objpfx)tst-p_align3.out: tst-p_align3.sh $(objpfx)tst-p_align3 $(SHELL) $< $(common-objpfx) '$(test-program-prefix)'; \ $(evaluate-test) + +LDFLAGS-libtracemod1-1.so += -Wl,-soname,libtracemod1.so +LDFLAGS-libtracemod2-1.so += -Wl,-soname,libtracemod2.so +LDFLAGS-libtracemod3-1.so += -Wl,-soname,libtracemod3.so +LDFLAGS-libtracemod4-1.so += -Wl,-soname,libtracemod4.so +LDFLAGS-libtracemod5-1.so += -Wl,-soname,libtracemod5.so + +$(objpfx)libtracemod1-1.so: $(objpfx)libtracemod2-1.so \ + $(objpfx)libtracemod3-1.so +$(objpfx)libtracemod2-1.so: $(objpfx)libtracemod4-1.so \ + $(objpfx)libtracemod5-1.so + +define libtracemod-x +$(objpfx)libtracemod$(1)/libtracemod$(1).so: $(objpfx)libtracemod$(1)-1.so + $$(make-target-directory) + cp $$< $$@ +endef +libtracemod-suffixes = 1 2 3 4 5 +$(foreach i,$(libtracemod-suffixes), $(eval $(call libtracemod-x,$(i)))) + +define tst-trace-skeleton +$(objpfx)tst-trace$(1).out: $(objpfx)libtracemod1/libtracemod1.so \ + $(objpfx)libtracemod2/libtracemod2.so \ + $(objpfx)libtracemod3/libtracemod3.so \ + 
$(objpfx)libtracemod4/libtracemod4.so \ + $(objpfx)libtracemod5/libtracemod5.so \ + $(..)scripts/tst-ld-trace.py \ + tst-trace$(1).exp + ${ $(PYTHON) $(..)scripts/tst-ld-trace.py \ + "$(test-wrapper-env) $(elf-objpfx)$(rtld-installed-name) \ + --library-path $(common-objpfx):$(strip $(2)) \ + $(objpfx)libtracemod1/libtracemod1.so" tst-trace$(1).exp \ + } > $$@; $$(evaluate-test) +endef + +$(eval $(call tst-trace-skeleton,1,)) +$(eval $(call tst-trace-skeleton,2,\ + $(objpfx)libtracemod2)) +$(eval $(call tst-trace-skeleton,3,\ + $(objpfx)libtracemod2:$(objpfx)libtracemod3)) +$(eval $(call tst-trace-skeleton,4,\ + $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4)) +$(eval $(call tst-trace-skeleton,5,\ + $(objpfx)libtracemod2:$(objpfx)libtracemod3:$(objpfx)libtracemod4:$(objpfx)libtracemod5)) + +$(objpfx)tst-tls-allocation-failure-static-patched: \ + $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py + cp $< $@ + $(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@ + +$(objpfx)tst-tls-allocation-failure-static-patched.out: \ + $(objpfx)tst-tls-allocation-failure-static-patched + $< > $@ 2>&1; echo "status: $$?" >> $@ + grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \ + && grep -q '^status: 127$$' $@; \ + $(evaluate-test) + +$(objpfx)tst-audit-tlsdesc: $(objpfx)tst-audit-tlsdesc-mod1.so \ + $(objpfx)tst-audit-tlsdesc-mod2.so \ + $(shared-thread-library) +ifeq (yes,$(have-mtls-dialect-gnu2)) +# The test is valid for all TLS types, but we want to exercise GNU2 +# TLS if possible. 
+CFLAGS-tst-audit-tlsdesc-mod1.c += -mtls-dialect=gnu2 +CFLAGS-tst-audit-tlsdesc-mod2.c += -mtls-dialect=gnu2 +endif +$(objpfx)tst-audit-tlsdesc-dlopen: $(shared-thread-library) +$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-audit-tlsdesc-mod1.so \ + $(objpfx)tst-audit-tlsdesc-mod2.so +$(objpfx)tst-audit-tlsdesc-mod1.so: $(objpfx)tst-audit-tlsdesc-mod2.so +$(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so +tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so +$(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so +tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so + +$(objpfx)tst-dlmopen-twice.out: \ + $(objpfx)tst-dlmopen-twice-mod1.so \ + $(objpfx)tst-dlmopen-twice-mod2.so diff --git a/elf/dl-audit.c b/elf/dl-audit.c index 794bfd45c..efc049247 100644 --- a/elf/dl-audit.c +++ b/elf/dl-audit.c @@ -257,7 +257,8 @@ _dl_audit_symbind (struct link_map *l, struct reloc_result *reloc_result, reloc_result->flags = flags; } - DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value); + if (flags & LA_SYMB_ALTVALUE) + DL_FIXUP_BINDNOW_RELOC (value, new_value, sym.st_value); } void diff --git a/elf/dl-deps.c b/elf/dl-deps.c index c8bab5cad..cfe7f0743 100644 --- a/elf/dl-deps.c +++ b/elf/dl-deps.c @@ -489,6 +489,8 @@ _dl_map_object_deps (struct link_map *map, for (nlist = 0, runp = known; runp; runp = runp->next) { + /* _dl_sort_maps ignores l_faked object, so it is safe to not consider + them for nlist. */ if (__builtin_expect (trace_mode, 0) && runp->map->l_faked) /* This can happen when we trace the loading. */ --map->l_searchlist.r_nlist; diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c new file mode 100644 index 000000000..61677aaa0 --- /dev/null +++ b/elf/dl-early_allocate.c @@ -0,0 +1,30 @@ +/* Early memory allocation for the dynamic loader. Generic version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +void * +_dl_early_allocate (size_t size) +{ + void *result = __sbrk (size); + if (result == (void *) -1) + result = NULL; + return result; +} diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c index 2b8df2fd6..4d5831b6f 100644 --- a/elf/dl-find_object.c +++ b/elf/dl-find_object.c @@ -788,6 +788,9 @@ _dl_find_object_update (struct link_map *new_map) for (struct link_map *l = new_map; l != NULL; l = l->l_next) /* Skip proxy maps and already-processed maps. */ count += l == l->l_real && !l->l_find_object_processed; + if (count == 0) + return true; + struct link_map **map_array = malloc (count * sizeof (*map_array)); if (map_array == NULL) return false; @@ -797,8 +800,6 @@ _dl_find_object_update (struct link_map *new_map) if (l == l->l_real && !l->l_find_object_processed) map_array[i++] = l; } - if (count == 0) - return true; _dl_find_object_link_map_sort (map_array, count); bool ok = _dl_find_object_update_1 (map_array, count); diff --git a/elf/dl-hwcaps.c b/elf/dl-hwcaps.c index 6f161f6ad..92eb53790 100644 --- a/elf/dl-hwcaps.c +++ b/elf/dl-hwcaps.c @@ -193,7 +193,7 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend, /* Each hwcaps subdirectory has a GLIBC_HWCAPS_PREFIX string prefix and a "/" suffix once stored in the result. 
*/ hwcaps_counts.maximum_length += strlen (GLIBC_HWCAPS_PREFIX) + 1; - size_t total = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1) + size_t hwcaps_sz = (hwcaps_counts.count * (strlen (GLIBC_HWCAPS_PREFIX) + 1) + hwcaps_counts.total_length); /* Count the number of bits set in the masked value. */ @@ -229,11 +229,12 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend, assert (m == cnt); /* Determine the total size of all strings together. */ + size_t total; if (cnt == 1) - total += temp[0].len + 1; + total = temp[0].len + 1; else { - total += temp[0].len + temp[cnt - 1].len + 2; + total = temp[0].len + temp[cnt - 1].len + 2; if (cnt > 2) { total <<= 1; @@ -255,6 +256,7 @@ _dl_important_hwcaps (const char *glibc_hwcaps_prepend, /* This is the overall result, including both glibc-hwcaps subdirectories and the legacy hwcaps subdirectories using the power set construction. */ + total += hwcaps_sz; struct r_strlenpair *overall_result = malloc (*sz * sizeof (*result) + total); if (overall_result == NULL) diff --git a/elf/dl-libc.c b/elf/dl-libc.c index a7180d0af..266e068da 100644 --- a/elf/dl-libc.c +++ b/elf/dl-libc.c @@ -156,7 +156,7 @@ __libc_dlopen_mode (const char *name, int mode) args.caller_dlopen = RETURN_ADDRESS (0); #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->libc_dlopen_mode (name, mode); #endif return dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map; @@ -184,7 +184,7 @@ __libc_dlsym (void *map, const char *name) args.name = name; #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->libc_dlsym (map, name); #endif return (dlerror_run (do_dlsym, &args) ? 
NULL @@ -198,7 +198,7 @@ void * __libc_dlvsym (void *map, const char *name, const char *version) { #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->libc_dlvsym (map, name, version); #endif @@ -221,7 +221,7 @@ int __libc_dlclose (void *map) { #ifdef SHARED - if (!rtld_active ()) + if (GLRO (dl_dlfcn_hook) != NULL) return GLRO (dl_dlfcn_hook)->libc_dlclose (map); #endif return dlerror_run (do_dlclose, map); diff --git a/elf/dl-map-segments.h b/elf/dl-map-segments.h index 172692b12..fd24cf5d0 100644 --- a/elf/dl-map-segments.h +++ b/elf/dl-map-segments.h @@ -113,6 +113,9 @@ _dl_map_segments (struct link_map *l, int fd, unallocated. Then jump into the normal segment-mapping loop to handle the portion of the segment past the end of the file mapping. */ + if (__glibc_unlikely (loadcmds[nloadcmds - 1].mapstart < + c->mapend)) + return N_("ELF load command address/offset not page-aligned"); if (__glibc_unlikely (__mprotect ((caddr_t) (l->l_addr + c->mapend), loadcmds[nloadcmds - 1].mapstart - c->mapend, diff --git a/elf/dl-open.c b/elf/dl-open.c index a23e65926..46e8066fd 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -844,11 +844,14 @@ _dl_open (const char *file, int mode, const void *caller_dlopen, Lmid_t nsid, _dl_signal_error (EINVAL, file, NULL, N_("\ no more namespaces available for dlmopen()")); } - else if (nsid == GL(dl_nns)) - { - __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock); - ++GL(dl_nns); - } + + if (nsid == GL(dl_nns)) + ++GL(dl_nns); + + /* Initialize the new namespace. Most members are + zero-initialized, only the lock needs special treatment. 
*/ + memset (&GL(dl_ns)[nsid], 0, sizeof (GL(dl_ns)[nsid])); + __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock); _dl_debug_update (nsid)->r_state = RT_CONSISTENT; } diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c index 9e9d53ec4..3e2a6a584 100644 --- a/elf/dl-sort-maps.c +++ b/elf/dl-sort-maps.c @@ -27,12 +27,12 @@ If FOR_FINI is true, this is called for finishing an object. */ static void _dl_sort_maps_original (struct link_map **maps, unsigned int nmaps, - unsigned int skip, bool for_fini) + bool force_first, bool for_fini) { /* Allows caller to do the common optimization of skipping the first map, usually the main binary. */ - maps += skip; - nmaps -= skip; + maps += force_first; + nmaps -= force_first; /* A list of one element need not be sorted. */ if (nmaps <= 1) @@ -140,7 +140,9 @@ static void dfs_traversal (struct link_map ***rpo, struct link_map *map, bool *do_reldeps) { - if (map->l_visited) + /* _dl_map_object_deps ignores l_faked objects when calculating the + number of maps before calling _dl_sort_maps, ignore them as well. */ + if (map->l_visited || map->l_faked) return; map->l_visited = 1; @@ -180,8 +182,9 @@ dfs_traversal (struct link_map ***rpo, struct link_map *map, static void _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps, - unsigned int skip __attribute__ ((unused)), bool for_fini) + bool force_first, bool for_fini) { + struct link_map *first_map = maps[0]; for (int i = nmaps - 1; i >= 0; i--) maps[i]->l_visited = 0; @@ -206,14 +209,6 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps, Adjusting the order so that maps[0] is last traversed naturally avoids this problem. - Further, the old "optimization" of skipping the main object at maps[0] - from the call-site (i.e. _dl_sort_maps(maps+1,nmaps-1)) is in general - no longer valid, since traversing along object dependency-links - may "find" the main object even when it is not included in the initial - order (e.g. 
a dlopen()'ed shared object can have circular dependencies - linked back to itself). In such a case, traversing N-1 objects will - create a N-object result, and raise problems. - To summarize, just passing in the full list, and iterating from back to front makes things much more straightforward. */ @@ -272,6 +267,27 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps, } memcpy (maps, rpo, sizeof (struct link_map *) * nmaps); + + /* Skipping the first object at maps[0] is not valid in general, + since traversing along object dependency-links may "find" that + first object even when it is not included in the initial order + (e.g., a dlopen'ed shared object can have circular dependencies + linked back to itself). In such a case, traversing N-1 objects + will create a N-object result, and raise problems. Instead, + force the object back into first place after sorting. This naive + approach may introduce further dependency ordering violations + compared to rotating the cycle until the first map is again in + the first position, but as there is a cycle, at least one + violation is already present. */ + if (force_first && maps[0] != first_map) + { + int i; + for (i = 0; maps[i] != first_map; ++i) + ; + assert (i < nmaps); + memmove (&maps[1], maps, i * sizeof (maps[0])); + maps[0] = first_map; + } } void @@ -284,7 +300,7 @@ _dl_sort_maps_init (void) void _dl_sort_maps (struct link_map **maps, unsigned int nmaps, - unsigned int skip, bool for_fini) + bool force_first, bool for_fini) { /* It can be tempting to use a static function pointer to store and call the current selected sorting algorithm routine, but experimentation @@ -294,9 +310,9 @@ _dl_sort_maps (struct link_map **maps, unsigned int nmaps, input cases. A simple if-case with direct function calls appears to be the fastest. 
*/ if (__glibc_likely (GLRO(dl_dso_sort_algo) == dso_sort_algorithm_original)) - _dl_sort_maps_original (maps, nmaps, skip, for_fini); + _dl_sort_maps_original (maps, nmaps, force_first, for_fini); else - _dl_sort_maps_dfs (maps, nmaps, skip, for_fini); + _dl_sort_maps_dfs (maps, nmaps, force_first, for_fini); } #endif /* HAVE_TUNABLES. */ diff --git a/elf/dl-support.c b/elf/dl-support.c index fb6476553..09079c124 100644 --- a/elf/dl-support.c +++ b/elf/dl-support.c @@ -44,6 +44,7 @@ #include #include #include +#include extern char *__progname; char **_dl_argv = &__progname; /* This is checked for some error messages. */ @@ -241,93 +242,25 @@ __rtld_lock_define_initialized_recursive (, _dl_load_tls_lock) #ifdef HAVE_AUX_VECTOR +#include + int _dl_clktck; void _dl_aux_init (ElfW(auxv_t) *av) { - int seen = 0; - uid_t uid = 0; - gid_t gid = 0; - #ifdef NEED_DL_SYSINFO /* NB: Avoid RELATIVE relocation in static PIE. */ GL(dl_sysinfo) = DL_SYSINFO_DEFAULT; #endif _dl_auxv = av; - for (; av->a_type != AT_NULL; ++av) - switch (av->a_type) - { - case AT_PAGESZ: - if (av->a_un.a_val != 0) - GLRO(dl_pagesize) = av->a_un.a_val; - break; - case AT_CLKTCK: - GLRO(dl_clktck) = av->a_un.a_val; - break; - case AT_PHDR: - GL(dl_phdr) = (const void *) av->a_un.a_val; - break; - case AT_PHNUM: - GL(dl_phnum) = av->a_un.a_val; - break; - case AT_PLATFORM: - GLRO(dl_platform) = (void *) av->a_un.a_val; - break; - case AT_HWCAP: - GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; - break; - case AT_HWCAP2: - GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; - break; - case AT_FPUCW: - GLRO(dl_fpu_control) = av->a_un.a_val; - break; -#ifdef NEED_DL_SYSINFO - case AT_SYSINFO: - GL(dl_sysinfo) = av->a_un.a_val; - break; -#endif -#ifdef NEED_DL_SYSINFO_DSO - case AT_SYSINFO_EHDR: - GL(dl_sysinfo_dso) = (void *) av->a_un.a_val; - break; -#endif - case AT_UID: - uid ^= av->a_un.a_val; - seen |= 1; - break; - case AT_EUID: - uid ^= av->a_un.a_val; - seen |= 2; - break; - case AT_GID: - 
gid ^= av->a_un.a_val; - seen |= 4; - break; - case AT_EGID: - gid ^= av->a_un.a_val; - seen |= 8; - break; - case AT_SECURE: - seen = -1; - __libc_enable_secure = av->a_un.a_val; - __libc_enable_secure_decided = 1; - break; - case AT_RANDOM: - _dl_random = (void *) av->a_un.a_val; - break; - case AT_MINSIGSTKSZ: - _dl_minsigstacksize = av->a_un.a_val; - break; - DL_PLATFORM_AUXV - } - if (seen == 0xf) - { - __libc_enable_secure = uid != 0 || gid != 0; - __libc_enable_secure_decided = 1; - } + dl_parse_auxv_t auxv_values; + /* Use an explicit initialization loop here because memset may not + be available yet. */ + for (int i = 0; i < array_length (auxv_values); ++i) + auxv_values[i] = 0; + _dl_parse_auxv (av, auxv_values); } #endif diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c index f1dba8ef2..7aa90ad6e 100644 --- a/elf/dl-sysdep.c +++ b/elf/dl-sysdep.c @@ -1,4 +1,4 @@ -/* Operating system support for run-time dynamic linker. Generic Unix version. +/* Operating system support for run-time dynamic linker. Stub version. Copyright (C) 1995-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,352 +16,4 @@ License along with the GNU C Library; if not, see . */ -/* We conditionalize the whole of this file rather than simply eliding it - from the static build, because other sysdeps/ versions of this file - might define things needed by a static build. */ - -#ifdef SHARED - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <_itoa.h> -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -extern char **_environ attribute_hidden; -extern char _end[] attribute_hidden; - -/* Protect SUID program against misuse of file descriptors. 
*/ -extern void __libc_check_standard_fds (void); - -int __libc_enable_secure attribute_relro = 0; -rtld_hidden_data_def (__libc_enable_secure) -/* This variable contains the lowest stack address ever used. */ -void *__libc_stack_end attribute_relro = NULL; -rtld_hidden_data_def(__libc_stack_end) -void *_dl_random attribute_relro = NULL; - -#ifndef DL_FIND_ARG_COMPONENTS -# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp) \ - do { \ - void **_tmp; \ - (argc) = *(long int *) cookie; \ - (argv) = (char **) ((long int *) cookie + 1); \ - (envp) = (argv) + (argc) + 1; \ - for (_tmp = (void **) (envp); *_tmp; ++_tmp) \ - continue; \ - (auxp) = (void *) ++_tmp; \ - } while (0) -#endif - -#ifndef DL_STACK_END -# define DL_STACK_END(cookie) ((void *) (cookie)) -#endif - -ElfW(Addr) -_dl_sysdep_start (void **start_argptr, - void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, - ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) -{ - const ElfW(Phdr) *phdr = NULL; - ElfW(Word) phnum = 0; - ElfW(Addr) user_entry; - ElfW(auxv_t) *av; -#ifdef HAVE_AUX_SECURE -# define set_seen(tag) (tag) /* Evaluate for the side effects. */ -# define set_seen_secure() ((void) 0) -#else - uid_t uid = 0; - gid_t gid = 0; - unsigned int seen = 0; -# define set_seen_secure() (seen = -1) -# ifdef HAVE_AUX_XID -# define set_seen(tag) (tag) /* Evaluate for the side effects. */ -# else -# define M(type) (1 << (type)) -# define set_seen(tag) seen |= M ((tag)->a_type) -# endif -#endif -#ifdef NEED_DL_SYSINFO - uintptr_t new_sysinfo = 0; -#endif - - __libc_stack_end = DL_STACK_END (start_argptr); - DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ, - GLRO(dl_auxv)); - - user_entry = (ElfW(Addr)) ENTRY_POINT; - GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ - - /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. 
*/ - _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), - "CONSTANT_MINSIGSTKSZ is constant"); - GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; - - for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++)) - switch (av->a_type) - { - case AT_PHDR: - phdr = (void *) av->a_un.a_val; - break; - case AT_PHNUM: - phnum = av->a_un.a_val; - break; - case AT_PAGESZ: - GLRO(dl_pagesize) = av->a_un.a_val; - break; - case AT_ENTRY: - user_entry = av->a_un.a_val; - break; -#ifndef HAVE_AUX_SECURE - case AT_UID: - case AT_EUID: - uid ^= av->a_un.a_val; - break; - case AT_GID: - case AT_EGID: - gid ^= av->a_un.a_val; - break; -#endif - case AT_SECURE: -#ifndef HAVE_AUX_SECURE - seen = -1; -#endif - __libc_enable_secure = av->a_un.a_val; - break; - case AT_PLATFORM: - GLRO(dl_platform) = (void *) av->a_un.a_val; - break; - case AT_HWCAP: - GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; - break; - case AT_HWCAP2: - GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; - break; - case AT_CLKTCK: - GLRO(dl_clktck) = av->a_un.a_val; - break; - case AT_FPUCW: - GLRO(dl_fpu_control) = av->a_un.a_val; - break; -#ifdef NEED_DL_SYSINFO - case AT_SYSINFO: - new_sysinfo = av->a_un.a_val; - break; -#endif -#ifdef NEED_DL_SYSINFO_DSO - case AT_SYSINFO_EHDR: - GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; - break; -#endif - case AT_RANDOM: - _dl_random = (void *) av->a_un.a_val; - break; - case AT_MINSIGSTKSZ: - GLRO(dl_minsigstacksize) = av->a_un.a_val; - break; - DL_PLATFORM_AUXV - } - - dl_hwcap_check (); - -#ifndef HAVE_AUX_SECURE - if (seen != -1) - { - /* Fill in the values we have not gotten from the kernel through the - auxiliary vector. */ -# ifndef HAVE_AUX_XID -# define SEE(UID, var, uid) \ - if ((seen & M (AT_##UID)) == 0) var ^= __get##uid () - SEE (UID, uid, uid); - SEE (EUID, uid, euid); - SEE (GID, gid, gid); - SEE (EGID, gid, egid); -# endif - - /* If one of the two pairs of IDs does not match this is a setuid - or setgid run. 
*/ - __libc_enable_secure = uid | gid; - } -#endif - -#ifndef HAVE_AUX_PAGESIZE - if (GLRO(dl_pagesize) == 0) - GLRO(dl_pagesize) = __getpagesize (); -#endif - -#ifdef NEED_DL_SYSINFO - if (new_sysinfo != 0) - { -# ifdef NEED_DL_SYSINFO_DSO - /* Only set the sysinfo value if we also have the vsyscall DSO. */ - if (GLRO(dl_sysinfo_dso) != 0) -# endif - GLRO(dl_sysinfo) = new_sysinfo; - } -#endif - - __tunables_init (_environ); - - /* Initialize DSO sorting algorithm after tunables. */ - _dl_sort_maps_init (); - -#ifdef DL_SYSDEP_INIT - DL_SYSDEP_INIT; -#endif - -#ifdef DL_PLATFORM_INIT - DL_PLATFORM_INIT; -#endif - - /* Determine the length of the platform name. */ - if (GLRO(dl_platform) != NULL) - GLRO(dl_platformlen) = strlen (GLRO(dl_platform)); - - if (__sbrk (0) == _end) - /* The dynamic linker was run as a program, and so the initial break - starts just after our bss, at &_end. The malloc in dl-minimal.c - will consume the rest of this page, so tell the kernel to move the - break up that far. When the user program examines its break, it - will see this new value and not clobber our data. */ - __sbrk (GLRO(dl_pagesize) - - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1))); - - /* If this is a SUID program we make sure that FDs 0, 1, and 2 are - allocated. If necessary we are doing it ourself. If it is not - possible we stop the program. */ - if (__builtin_expect (__libc_enable_secure, 0)) - __libc_check_standard_fds (); - - (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); - return user_entry; -} - -void -_dl_sysdep_start_cleanup (void) -{ -} - -void -_dl_show_auxv (void) -{ - char buf[64]; - ElfW(auxv_t) *av; - - /* Terminate string. */ - buf[63] = '\0'; - - /* The following code assumes that the AT_* values are encoded - starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values - close by (otherwise the array will be too large). 
In case we have - to support a platform where these requirements are not fulfilled - some alternative implementation has to be used. */ - for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av) - { - static const struct - { - const char label[22]; - enum { unknown = 0, dec, hex, str, ignore } form : 8; - } auxvars[] = - { - [AT_EXECFD - 2] = { "EXECFD: ", dec }, - [AT_EXECFN - 2] = { "EXECFN: ", str }, - [AT_PHDR - 2] = { "PHDR: 0x", hex }, - [AT_PHENT - 2] = { "PHENT: ", dec }, - [AT_PHNUM - 2] = { "PHNUM: ", dec }, - [AT_PAGESZ - 2] = { "PAGESZ: ", dec }, - [AT_BASE - 2] = { "BASE: 0x", hex }, - [AT_FLAGS - 2] = { "FLAGS: 0x", hex }, - [AT_ENTRY - 2] = { "ENTRY: 0x", hex }, - [AT_NOTELF - 2] = { "NOTELF: ", hex }, - [AT_UID - 2] = { "UID: ", dec }, - [AT_EUID - 2] = { "EUID: ", dec }, - [AT_GID - 2] = { "GID: ", dec }, - [AT_EGID - 2] = { "EGID: ", dec }, - [AT_PLATFORM - 2] = { "PLATFORM: ", str }, - [AT_HWCAP - 2] = { "HWCAP: ", hex }, - [AT_CLKTCK - 2] = { "CLKTCK: ", dec }, - [AT_FPUCW - 2] = { "FPUCW: ", hex }, - [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex }, - [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex }, - [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex }, - [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore }, - [AT_SECURE - 2] = { "SECURE: ", dec }, - [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str }, - [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex }, - [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex }, - [AT_RANDOM - 2] = { "RANDOM: 0x", hex }, - [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex }, - [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec }, - [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec }, - [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex }, - [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec }, - [AT_L1D_CACHEGEOMETRY - 2] = { "L1D_CACHEGEOMETRY: 0x", hex }, - [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec }, - [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex }, - [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec }, - 
[AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex }, - }; - unsigned int idx = (unsigned int) (av->a_type - 2); - - if ((unsigned int) av->a_type < 2u - || (idx < sizeof (auxvars) / sizeof (auxvars[0]) - && auxvars[idx].form == ignore)) - continue; - - assert (AT_NULL == 0); - assert (AT_IGNORE == 1); - - /* Some entries are handled in a special way per platform. */ - if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0) - continue; - - if (idx < sizeof (auxvars) / sizeof (auxvars[0]) - && auxvars[idx].form != unknown) - { - const char *val = (char *) av->a_un.a_val; - - if (__builtin_expect (auxvars[idx].form, dec) == dec) - val = _itoa ((unsigned long int) av->a_un.a_val, - buf + sizeof buf - 1, 10, 0); - else if (__builtin_expect (auxvars[idx].form, hex) == hex) - val = _itoa ((unsigned long int) av->a_un.a_val, - buf + sizeof buf - 1, 16, 0); - - _dl_printf ("AT_%s%s\n", auxvars[idx].label, val); - - continue; - } - - /* Unknown value: print a generic line. */ - char buf2[17]; - buf2[sizeof (buf2) - 1] = '\0'; - const char *val2 = _itoa ((unsigned long int) av->a_un.a_val, - buf2 + sizeof buf2 - 1, 16, 0); - const char *val = _itoa ((unsigned long int) av->a_type, - buf + sizeof buf - 1, 16, 0); - _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2); - } -} - -#endif +#error dl-sysdep support missing. diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def index 5f7f18ef2..4bf9052db 100644 --- a/elf/dso-sort-tests-1.def +++ b/elf/dso-sort-tests-1.def @@ -64,3 +64,10 @@ output: b>a>{}b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[a1;a->a2;a2->a;b->b1;c->a1;c=>a1 +output(glibc.rtld.dynamic_sort=1): {+a[a2>a1>a>];+b[b1>b>];-b[];%c(a1());}a1>a>];+b[b1>b>];-b[];%c(a1());} #include -/* If nonzero __libc_enable_secure is already set. 
*/ -int __libc_enable_secure_decided; /* Safest assumption, if somehow the initializer isn't run. */ int __libc_enable_secure = 1; - -void -__libc_init_secure (void) -{ - if (__libc_enable_secure_decided == 0) - __libc_enable_secure = (startup_geteuid () != startup_getuid () - || startup_getegid () != startup_getgid ()); -} diff --git a/elf/libtracemod1-1.c b/elf/libtracemod1-1.c new file mode 100644 index 000000000..7c89c9a5a --- /dev/null +++ b/elf/libtracemod1-1.c @@ -0,0 +1 @@ +/* Empty */ diff --git a/elf/libtracemod2-1.c b/elf/libtracemod2-1.c new file mode 100644 index 000000000..7c89c9a5a --- /dev/null +++ b/elf/libtracemod2-1.c @@ -0,0 +1 @@ +/* Empty */ diff --git a/elf/libtracemod3-1.c b/elf/libtracemod3-1.c new file mode 100644 index 000000000..7c89c9a5a --- /dev/null +++ b/elf/libtracemod3-1.c @@ -0,0 +1 @@ +/* Empty */ diff --git a/elf/libtracemod4-1.c b/elf/libtracemod4-1.c new file mode 100644 index 000000000..7c89c9a5a --- /dev/null +++ b/elf/libtracemod4-1.c @@ -0,0 +1 @@ +/* Empty */ diff --git a/elf/libtracemod5-1.c b/elf/libtracemod5-1.c new file mode 100644 index 000000000..7c89c9a5a --- /dev/null +++ b/elf/libtracemod5-1.c @@ -0,0 +1 @@ +/* Empty */ diff --git a/elf/rtld.c b/elf/rtld.c index 8dafaf61f..86354fb0c 100644 --- a/elf/rtld.c +++ b/elf/rtld.c @@ -441,8 +441,8 @@ static ElfW(Addr) _dl_start_final (void *arg, struct dl_start_final_info *info); #endif -/* These defined magically in the linker script. */ -extern char _begin[] attribute_hidden; +/* These are defined magically by the linker. 
*/ +extern const ElfW(Ehdr) __ehdr_start attribute_hidden; extern char _etext[] attribute_hidden; extern char _end[] attribute_hidden; @@ -487,7 +487,7 @@ _dl_start_final (void *arg, struct dl_start_final_info *info) #endif _dl_setup_hash (&GL(dl_rtld_map)); GL(dl_rtld_map).l_real = &GL(dl_rtld_map); - GL(dl_rtld_map).l_map_start = (ElfW(Addr)) _begin; + GL(dl_rtld_map).l_map_start = (ElfW(Addr)) &__ehdr_start; GL(dl_rtld_map).l_map_end = (ElfW(Addr)) _end; GL(dl_rtld_map).l_text_end = (ElfW(Addr)) _etext; /* Copy the TLS related data if necessary. */ @@ -1312,6 +1312,62 @@ rtld_setup_main_map (struct link_map *main_map) return has_interp; } +/* Adjusts the contents of the stack and related globals for the user + entry point. The ld.so processed skip_args arguments and bumped + _dl_argv and _dl_argc accordingly. Those arguments are removed from + argv here. */ +static void +_dl_start_args_adjust (int skip_args) +{ + void **sp = (void **) (_dl_argv - skip_args - 1); + void **p = sp + skip_args; + + if (skip_args == 0) + return; + + /* Sanity check. */ + intptr_t argc = (intptr_t) sp[0] - skip_args; + assert (argc == _dl_argc); + + /* Adjust argc on stack. */ + sp[0] = (void *) (intptr_t) _dl_argc; + + /* Update globals in rtld. */ + _dl_argv -= skip_args; + _environ -= skip_args; + + /* Shuffle argv down. */ + do + *++sp = *++p; + while (*p != NULL); + + assert (_environ == (char **) (sp + 1)); + + /* Shuffle envp down. */ + do + *++sp = *++p; + while (*p != NULL); + +#ifdef HAVE_AUX_VECTOR + void **auxv = (void **) GLRO(dl_auxv) - skip_args; + GLRO(dl_auxv) = (ElfW(auxv_t) *) auxv; /* Aliasing violation. */ + assert (auxv == sp + 1); + + /* Shuffle auxv down. 
*/ + ElfW(auxv_t) ax; + char *oldp = (char *) (p + 1); + char *newp = (char *) (sp + 1); + do + { + memcpy (&ax, oldp, sizeof (ax)); + memcpy (newp, &ax, sizeof (ax)); + oldp += sizeof (ax); + newp += sizeof (ax); + } + while (ax.a_type != AT_NULL); +#endif +} + static void dl_main (const ElfW(Phdr) *phdr, ElfW(Word) phnum, @@ -1366,6 +1422,7 @@ dl_main (const ElfW(Phdr) *phdr, rtld_is_main = true; char *argv0 = NULL; + char **orig_argv = _dl_argv; /* Note the place where the dynamic linker actually came from. */ GL(dl_rtld_map).l_name = rtld_progname; @@ -1380,7 +1437,6 @@ dl_main (const ElfW(Phdr) *phdr, GLRO(dl_lazy) = -1; } - ++_dl_skip_args; --_dl_argc; ++_dl_argv; } @@ -1389,14 +1445,12 @@ dl_main (const ElfW(Phdr) *phdr, if (state.mode != rtld_mode_help) state.mode = rtld_mode_verify; - ++_dl_skip_args; --_dl_argc; ++_dl_argv; } else if (! strcmp (_dl_argv[1], "--inhibit-cache")) { GLRO(dl_inhibit_cache) = 1; - ++_dl_skip_args; --_dl_argc; ++_dl_argv; } @@ -1406,7 +1460,6 @@ dl_main (const ElfW(Phdr) *phdr, state.library_path = _dl_argv[2]; state.library_path_source = "--library-path"; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1415,7 +1468,6 @@ dl_main (const ElfW(Phdr) *phdr, { GLRO(dl_inhibit_rpath) = _dl_argv[2]; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1423,14 +1475,12 @@ dl_main (const ElfW(Phdr) *phdr, { audit_list_add_string (&state.audit_list, _dl_argv[2]); - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } else if (! 
strcmp (_dl_argv[1], "--preload") && _dl_argc > 2) { state.preloadarg = _dl_argv[2]; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1438,7 +1488,6 @@ dl_main (const ElfW(Phdr) *phdr, { argv0 = _dl_argv[2]; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1446,7 +1495,6 @@ dl_main (const ElfW(Phdr) *phdr, && _dl_argc > 2) { state.glibc_hwcaps_prepend = _dl_argv[2]; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1454,7 +1502,6 @@ dl_main (const ElfW(Phdr) *phdr, && _dl_argc > 2) { state.glibc_hwcaps_mask = _dl_argv[2]; - _dl_skip_args += 2; _dl_argc -= 2; _dl_argv += 2; } @@ -1463,7 +1510,6 @@ dl_main (const ElfW(Phdr) *phdr, { state.mode = rtld_mode_list_tunables; - ++_dl_skip_args; --_dl_argc; ++_dl_argv; } @@ -1472,7 +1518,6 @@ dl_main (const ElfW(Phdr) *phdr, { state.mode = rtld_mode_list_diagnostics; - ++_dl_skip_args; --_dl_argc; ++_dl_argv; } @@ -1518,7 +1563,6 @@ dl_main (const ElfW(Phdr) *phdr, _dl_usage (ld_so_name, NULL); } - ++_dl_skip_args; --_dl_argc; ++_dl_argv; @@ -1617,6 +1661,9 @@ dl_main (const ElfW(Phdr) *phdr, /* Set the argv[0] string now that we've processed the executable. */ if (argv0 != NULL) _dl_argv[0] = argv0; + + /* Adjust arguments for the application entry point. */ + _dl_start_args_adjust (_dl_argv - orig_argv); } else { @@ -1754,7 +1801,6 @@ dl_main (const ElfW(Phdr) *phdr, segment that also includes the phdrs. If that's not available, we use the old method that assumes the beginning of the file is part of the lowest-addressed PT_LOAD segment. */ - extern const ElfW(Ehdr) __ehdr_start __attribute__ ((visibility ("hidden"))); /* Set up the program header information for the dynamic linker itself. It is needed in the dl_iterate_phdr callbacks. */ diff --git a/elf/tst-audit26.c b/elf/tst-audit26.c new file mode 100644 index 000000000..3f920e83b --- /dev/null +++ b/elf/tst-audit26.c @@ -0,0 +1,35 @@ +/* Check the usability of functions in audit modules. + Copyright (C) 2022 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#include +#include + +static int +do_test (void) +{ + /* Check that the audit module has been loaded. */ + void *handle = xdlopen ("mapped to libc", RTLD_LOCAL | RTLD_NOW); + TEST_VERIFY (handle + == xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD)); + + return 0; +} + +#include diff --git a/elf/tst-auditmod24a.c b/elf/tst-auditmod24a.c index d8e88f398..3075dfae2 100644 --- a/elf/tst-auditmod24a.c +++ b/elf/tst-auditmod24a.c @@ -110,5 +110,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx, return sym->st_value; } - abort (); + if (symname[0] != '\0') + abort (); + return sym->st_value; } diff --git a/elf/tst-auditmod24d.c b/elf/tst-auditmod24d.c index 8c803ecc0..badc6be45 100644 --- a/elf/tst-auditmod24d.c +++ b/elf/tst-auditmod24d.c @@ -116,5 +116,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx, } } - abort (); + if (symname[0] != '\0') + abort (); + return sym->st_value; } diff --git a/elf/tst-auditmod25.c b/elf/tst-auditmod25.c index 526f5c54b..20640a8da 100644 --- a/elf/tst-auditmod25.c +++ b/elf/tst-auditmod25.c @@ -72,7 +72,7 @@ la_symbind32 (Elf32_Sym *sym, unsigned int ndx, unsigned int *flags, const char *symname) #endif { - if (*refcook != -1 && *defcook != -1) + if (*refcook != -1 && *defcook != -1 && symname[0] != '\0') fprintf (stderr, "la_symbind: 
%s %u\n", symname, *flags & (LA_SYMB_NOPLTENTER | LA_SYMB_NOPLTEXIT) ? 1 : 0); return sym->st_value; diff --git a/elf/tst-auditmod26.c b/elf/tst-auditmod26.c new file mode 100644 index 000000000..db7ba95ab --- /dev/null +++ b/elf/tst-auditmod26.c @@ -0,0 +1,104 @@ +/* Check the usability of functions in audit modules. Audit module. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +unsigned int +la_version (unsigned int current) +{ + /* Exercise various functions. */ + + /* Check dlopen, dlsym, dlclose. */ + void *handle = xdlopen (LIBM_SO, RTLD_LOCAL | RTLD_NOW); + void *ptr = xdlsym (handle, "sincos"); + TEST_VERIFY (ptr != NULL); + ptr = dlsym (handle, "SINCOS"); + TEST_VERIFY (ptr == NULL); + const char *message = dlerror (); + TEST_VERIFY (strstr (message, ": undefined symbol: SINCOS") != NULL); + ptr = dlsym (handle, "SINCOS"); + TEST_VERIFY (ptr == NULL); + xdlclose (handle); + TEST_COMPARE_STRING (dlerror (), NULL); + + handle = xdlopen (LIBC_SO, RTLD_LOCAL | RTLD_NOW | RTLD_NOLOAD); + + /* Check dlvsym. _exit is unlikely to gain another symbol + version. */ + TEST_VERIFY (xdlsym (handle, "_exit") + == xdlvsym (handle, "_exit", FIRST_VERSION_libc__exit_STRING)); + + /* Check dlinfo. 
*/ + { + void *handle2 = NULL; + TEST_COMPARE (dlinfo (handle, RTLD_DI_LINKMAP, &handle2), 0); + TEST_VERIFY (handle2 == handle); + } + + /* Check dladdr and dladdr1. */ + Dl_info info = { }; + TEST_VERIFY (dladdr (&_exit, &info) != 0); + if (strcmp (info.dli_sname, "_Exit") != 0) /* _Exit is an alias. */ + TEST_COMPARE_STRING (info.dli_sname, "_exit"); + TEST_VERIFY (info.dli_saddr == &_exit); + TEST_VERIFY (strstr (info.dli_fname, LIBC_SO)); + void *extra_info; + memset (&info, 0, sizeof (info)); + TEST_VERIFY (dladdr1 (&_exit, &info, &extra_info, RTLD_DL_LINKMAP) != 0); + TEST_VERIFY (extra_info == handle); + + /* Verify that dlmopen creates a new namespace. */ + void *dlmopen_handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW); + TEST_VERIFY (dlmopen_handle != handle); + memset (&info, 0, sizeof (info)); + extra_info = NULL; + ptr = xdlsym (dlmopen_handle, "_exit"); + TEST_VERIFY (dladdr1 (ptr, &info, &extra_info, RTLD_DL_LINKMAP) != 0); + TEST_VERIFY (extra_info == dlmopen_handle); + xdlclose (dlmopen_handle); + + /* Terminate the process with an error state. This does not happen + automatically because the audit module state is not shared with + the main program. */ + if (support_record_failure_is_failed ()) + { + fflush (stdout); + fflush (stderr); + _exit (1); + } + + return LAV_CURRENT; +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + if (strcmp (name, "mapped to libc") == 0) + return (char *) LIBC_SO; + else + return (char *) name; +} diff --git a/elf/tst-dlmopen-twice-mod1.c b/elf/tst-dlmopen-twice-mod1.c new file mode 100644 index 000000000..0eaf04948 --- /dev/null +++ b/elf/tst-dlmopen-twice-mod1.c @@ -0,0 +1,37 @@ +/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Module 1. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +static void __attribute__ ((constructor)) +init (void) +{ + puts ("info: tst-dlmopen-twice-mod1.so loaded"); + fflush (stdout); +} + +static void __attribute__ ((destructor)) +fini (void) +{ + puts ("info: tst-dlmopen-twice-mod1.so about to be unloaded"); + fflush (stdout); +} + +/* Large allocation. The second module does not have this, so it + should load libc at a different address. */ +char large_allocate[16 * 1024 * 1024]; diff --git a/elf/tst-dlmopen-twice-mod2.c b/elf/tst-dlmopen-twice-mod2.c new file mode 100644 index 000000000..40c6c01f9 --- /dev/null +++ b/elf/tst-dlmopen-twice-mod2.c @@ -0,0 +1,50 @@ +/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Module 2. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +static void __attribute__ ((constructor)) +init (void) +{ + puts ("info: tst-dlmopen-twice-mod2.so loaded"); + fflush (stdout); +} + +static void __attribute__ ((destructor)) +fini (void) +{ + puts ("info: tst-dlmopen-twice-mod2.so about to be unloaded"); + fflush (stdout); +} + +int +run_check (void) +{ + puts ("info: about to call isalpha"); + fflush (stdout); + + volatile char ch = 'a'; + if (!isalpha (ch)) + { + puts ("error: isalpha ('a') is not true"); + fflush (stdout); + return 1; + } + return 0; +} diff --git a/elf/tst-dlmopen-twice.c b/elf/tst-dlmopen-twice.c new file mode 100644 index 000000000..449f3c8fa --- /dev/null +++ b/elf/tst-dlmopen-twice.c @@ -0,0 +1,34 @@ +/* Initialization of libc after dlmopen/dlclose/dlmopen (bug 29528). Main. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include +#include + +static int +do_test (void) +{ + void *handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod1.so", RTLD_NOW); + xdlclose (handle); + handle = xdlmopen (LM_ID_NEWLM, "tst-dlmopen-twice-mod2.so", RTLD_NOW); + int (*run_check) (void) = xdlsym (handle, "run_check"); + TEST_COMPARE (run_check (), 0); + xdlclose (handle); + return 0; +} + +#include diff --git a/elf/tst-glibc-hwcaps-cache.script b/elf/tst-glibc-hwcaps-cache.script index c3271f61f..d58fc8c5d 100644 --- a/elf/tst-glibc-hwcaps-cache.script +++ b/elf/tst-glibc-hwcaps-cache.script @@ -4,6 +4,7 @@ cp $B/elf/libmarkermod2-1.so $L/libmarkermod2.so cp $B/elf/libmarkermod3-1.so $L/libmarkermod3.so cp $B/elf/libmarkermod4-1.so $L/libmarkermod4.so +cp $B/elf/libmarkermod5-1.so $L/libmarkermod5.so mkdirp 0770 $L/glibc-hwcaps/power9 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/power9/libmarkermod2.so @@ -20,6 +21,11 @@ mkdirp 0770 $L/glibc-hwcaps/z15 cp $B/elf/libmarkermod4-2.so $L/glibc-hwcaps/z13/libmarkermod4.so cp $B/elf/libmarkermod4-3.so $L/glibc-hwcaps/z14/libmarkermod4.so cp $B/elf/libmarkermod4-4.so $L/glibc-hwcaps/z15/libmarkermod4.so +mkdirp 0770 $L/glibc-hwcaps/z16 +cp $B/elf/libmarkermod5-2.so $L/glibc-hwcaps/z13/libmarkermod5.so +cp $B/elf/libmarkermod5-3.so $L/glibc-hwcaps/z14/libmarkermod5.so +cp $B/elf/libmarkermod5-4.so $L/glibc-hwcaps/z15/libmarkermod5.so +cp $B/elf/libmarkermod5-5.so $L/glibc-hwcaps/z16/libmarkermod5.so mkdirp 0770 $L/glibc-hwcaps/x86-64-v2 cp $B/elf/libmarkermod2-2.so $L/glibc-hwcaps/x86-64-v2/libmarkermod2.so diff --git a/elf/tst-glibcelf.py b/elf/tst-glibcelf.py new file mode 100644 index 000000000..bf15a3bad --- /dev/null +++ b/elf/tst-glibcelf.py @@ -0,0 +1,260 @@ +#!/usr/bin/python3 +# Verify scripts/glibcelf.py contents against elf/elf.h. +# Copyright (C) 2022 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +import argparse +import enum +import sys + +import glibcelf +import glibcextract + +errors_encountered = 0 + +def error(message): + global errors_encountered + sys.stdout.write('error: {}\n'.format(message)) + errors_encountered += 1 + +# The enum constants in glibcelf are expected to have exactly these +# prefixes. +expected_constant_prefixes = tuple( + 'ELFCLASS ELFDATA EM_ ET_ DT_ PF_ PT_ SHF_ SHN_ SHT_ STB_ STT_'.split()) + +def find_constant_prefix(name): + """Returns a matching prefix from expected_constant_prefixes or None.""" + for prefix in expected_constant_prefixes: + if name.startswith(prefix): + return prefix + return None + +def find_enum_types(): + """A generator for OpenIntEnum and IntFlag classes in glibcelf.""" + for obj in vars(glibcelf).values(): + if isinstance(obj, type) and obj.__bases__[0] in ( + glibcelf._OpenIntEnum, enum.Enum, enum.IntFlag): + yield obj + +def check_duplicates(): + """Verifies that enum types do not have duplicate values. + + Different types must have different member names, too. 
+ + """ + global_seen = {} + for typ in find_enum_types(): + seen = {} + last = None + for (name, e) in typ.__members__.items(): + if e.value in seen: + error('{} has {}={} and {}={}'.format( + typ, seen[e.value], e.value, name, e.value)) + last = e + else: + seen[e.value] = name + if last is not None and last.value > e.value: + error('{} has {}={} after {}={}'.format( + typ, name, e.value, last.name, last.value)) + if name in global_seen: + error('{} used in {} and {}'.format( + name, global_seen[name], typ)) + else: + global_seen[name] = typ + +def check_constant_prefixes(): + """Check that the constant prefixes match expected_constant_prefixes.""" + seen = set() + for typ in find_enum_types(): + typ_prefix = None + for val in typ: + prefix = find_constant_prefix(val.name) + if prefix is None: + error('constant {!r} for {} has unknown prefix'.format( + val, typ)) + break + elif typ_prefix is None: + typ_prefix = prefix + seen.add(typ_prefix) + elif prefix != typ_prefix: + error('prefix {!r} for constant {!r}, expected {!r}'.format( + prefix, val, typ_prefix)) + if typ_prefix is None: + error('empty enum type {}'.format(typ)) + + for prefix in sorted(set(expected_constant_prefixes) - seen): + error('missing constant prefix {!r}'.format(prefix)) + # Reverse difference is already covered inside the loop. + +def find_elf_h_constants(cc): + """Returns a dictionary of relevant constants from .""" + return glibcextract.compute_macro_consts( + source_text='#include ', + cc=cc, + macro_re='|'.join( + prefix + '.*' for prefix in expected_constant_prefixes)) + +# The first part of the pair is a name of an constant that is +# dropped from glibcelf. The second part is the constant as it is +# used in . +glibcelf_skipped_aliases = ( + ('EM_ARC_A5', 'EM_ARC_COMPACT'), + ('PF_PARISC_SBP', 'PF_HP_SBP') +) + +# Constants that provide little value and are not included in +# glibcelf: *LO*/*HI* range constants, *NUM constants counting the +# number of constants. 
Also includes the alias names from +# glibcelf_skipped_aliases. +glibcelf_skipped_constants = frozenset( + [e[0] for e in glibcelf_skipped_aliases]) | frozenset(""" +DT_AARCH64_NUM +DT_ADDRNUM +DT_ADDRRNGHI +DT_ADDRRNGLO +DT_ALPHA_NUM +DT_ENCODING +DT_EXTRANUM +DT_HIOS +DT_HIPROC +DT_IA_64_NUM +DT_LOOS +DT_LOPROC +DT_MIPS_NUM +DT_NUM +DT_PPC64_NUM +DT_PPC_NUM +DT_PROCNUM +DT_SPARC_NUM +DT_VALNUM +DT_VALRNGHI +DT_VALRNGLO +DT_VERSIONTAGNUM +ELFCLASSNUM +ELFDATANUM +ET_HIOS +ET_HIPROC +ET_LOOS +ET_LOPROC +ET_NUM +PF_MASKOS +PF_MASKPROC +PT_HIOS +PT_HIPROC +PT_HISUNW +PT_LOOS +PT_LOPROC +PT_LOSUNW +SHF_MASKOS +SHF_MASKPROC +SHN_HIOS +SHN_HIPROC +SHN_HIRESERVE +SHN_LOOS +SHN_LOPROC +SHN_LORESERVE +SHT_HIOS +SHT_HIPROC +SHT_HIPROC +SHT_HISUNW +SHT_HIUSER +SHT_LOOS +SHT_LOPROC +SHT_LOSUNW +SHT_LOUSER +SHT_NUM +STB_HIOS +STB_HIPROC +STB_LOOS +STB_LOPROC +STB_NUM +STT_HIOS +STT_HIPROC +STT_LOOS +STT_LOPROC +STT_NUM +""".strip().split()) + +def check_constant_values(cc): + """Checks the values of constants against glibcelf.""" + + glibcelf_constants = { + e.name: e for typ in find_enum_types() for e in typ} + elf_h_constants = find_elf_h_constants(cc=cc) + + missing_in_glibcelf = (set(elf_h_constants) - set(glibcelf_constants) + - glibcelf_skipped_constants) + for name in sorted(missing_in_glibcelf): + error('constant {} is missing from glibcelf'.format(name)) + + unexpected_in_glibcelf = \ + set(glibcelf_constants) & glibcelf_skipped_constants + for name in sorted(unexpected_in_glibcelf): + error('constant {} is supposed to be filtered from glibcelf'.format( + name)) + + missing_in_elf_h = set(glibcelf_constants) - set(elf_h_constants) + for name in sorted(missing_in_elf_h): + error('constant {} is missing from <elf.h>'.format(name)) + + expected_in_elf_h = glibcelf_skipped_constants - set(elf_h_constants) + for name in sorted(expected_in_elf_h): + error('filtered constant {} is missing from <elf.h>'.format(name)) + + for alias_name, name_in_glibcelf in glibcelf_skipped_aliases: + if 
name_in_glibcelf not in glibcelf_constants: + error('alias value {} for {} not in glibcelf'.format( + name_in_glibcelf, alias_name)) + elif (int(elf_h_constants[alias_name]) + != glibcelf_constants[name_in_glibcelf].value): + error('<elf.h> has {}={}, glibcelf has {}={}'.format( + alias_name, elf_h_constants[alias_name], + name_in_glibcelf, glibcelf_constants[name_in_glibcelf])) + + # Check for value mismatches: + for name in sorted(set(glibcelf_constants) & set(elf_h_constants)): + glibcelf_value = glibcelf_constants[name].value + elf_h_value = int(elf_h_constants[name]) + # On 32-bit architectures <elf.h> has some constants that are + # parsed as signed, while they are unsigned in glibcelf. So + # far, this only affects some flag constants, so special-case + # them here. + if (glibcelf_value != elf_h_value + and not (isinstance(glibcelf_constants[name], enum.IntFlag) + and glibcelf_value == 1 << 31 + and elf_h_value == -(1 << 31))): + error('{}: glibcelf has {!r}, <elf.h> has {!r}'.format( + name, glibcelf_value, elf_h_value)) + +def main(): + """The main entry point.""" + parser = argparse.ArgumentParser( + description="Check glibcelf.py and elf.h against each other.") + parser.add_argument('--cc', metavar='CC', + help='C compiler (including options) to use') + args = parser.parse_args() + + check_duplicates() + check_constant_prefixes() + check_constant_values(cc=args.cc) + + if errors_encountered > 0: + print("note: errors encountered:", errors_encountered) + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/elf/tst-relro-symbols.py b/elf/tst-relro-symbols.py new file mode 100644 index 000000000..368ea3349 --- /dev/null +++ b/elf/tst-relro-symbols.py @@ -0,0 +1,137 @@ +#!/usr/bin/python3 +# Verify that certain symbols are covered by RELRO. +# Copyright (C) 2022 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <https://www.gnu.org/licenses/>. + +"""Analyze a (shared) object to verify that certain symbols are +present and covered by the PT_GNU_RELRO segment. + +""" + +import argparse +import os.path +import sys + +# Make available glibc Python modules. +sys.path.append(os.path.join( + os.path.dirname(os.path.realpath(__file__)), os.path.pardir, 'scripts')) + +import glibcelf + +def find_relro(path: str, img: glibcelf.Image) -> (int, int): + """Discover the address range of the PT_GNU_RELRO segment.""" + for phdr in img.phdrs(): + if phdr.p_type == glibcelf.Pt.PT_GNU_RELRO: + # The computation is not entirely accurate because + # _dl_protect_relro in elf/dl-reloc.c rounds both the + # start and the end downwards using the run-time page size. 
+ return phdr.p_vaddr, phdr.p_vaddr + phdr.p_memsz + sys.stdout.write('{}: error: no PT_GNU_RELRO segment\n'.format(path)) + sys.exit(1) + +def check_in_relro(kind, relro_begin, relro_end, name, start, size, error): + """Report via error() if [start, start+size) is not inside the RELRO range.""" + end = start + size - 1 + if not (relro_begin <= start <= end < relro_end): + error( + '{} {!r} of size {} at 0x{:x} is not in RELRO range [0x{:x}, 0x{:x})'.format( + kind, name.decode('UTF-8'), size, start, + relro_begin, relro_end)) + +def get_parser(): + """Return an argument parser for this script.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('object', help='path to object file to check') + parser.add_argument('--required', metavar='NAME', default=(), + help='required symbol names', nargs='*') + parser.add_argument('--optional', metavar='NAME', default=(), + help='optional symbol names', nargs='*') + return parser + +def main(argv): + """The main entry point.""" + parser = get_parser() + opts = parser.parse_args(argv) + img = glibcelf.Image.readfile(opts.object) + + required_symbols = frozenset([sym.encode('UTF-8') + for sym in opts.required]) + optional_symbols = frozenset([sym.encode('UTF-8') + for sym in opts.optional]) + check_symbols = required_symbols | optional_symbols + + # Tracks the symbols in check_symbols that have been found. + symbols_found = set() + + # Discover the extent of the RELRO segment. + relro_begin, relro_end = find_relro(opts.object, img) + symbol_table_found = False + + errors = False + def error(msg: str) -> None: + """Record an error condition and write a message to standard output.""" + nonlocal errors + errors = True + sys.stdout.write('{}: error: {}\n'.format(opts.object, msg)) + + # Iterate over section headers to find the symbol table. 
+ for shdr in img.shdrs(): + if shdr.sh_type == glibcelf.Sht.SHT_SYMTAB: + symbol_table_found = True + for sym in img.syms(shdr): + if sym.st_name in check_symbols: + symbols_found.add(sym.st_name) + + # Validate symbol type, section, and size. + if sym.st_info.type != glibcelf.Stt.STT_OBJECT: + error('symbol {!r} has wrong type {}'.format( + sym.st_name.decode('UTF-8'), sym.st_info.type)) + if sym.st_shndx in glibcelf.Shn: + error('symbol {!r} has reserved section {}'.format( + sym.st_name.decode('UTF-8'), sym.st_shndx)) + continue + if sym.st_size == 0: + error('symbol {!r} has size zero'.format( + sym.st_name.decode('UTF-8'))) + continue + + check_in_relro('symbol', relro_begin, relro_end, + sym.st_name, sym.st_value, sym.st_size, + error) + continue # SHT_SYMTAB + if shdr.sh_name == b'.data.rel.ro' \ + or shdr.sh_name.startswith(b'.data.rel.ro.'): + check_in_relro('section', relro_begin, relro_end, + shdr.sh_name, shdr.sh_addr, shdr.sh_size, + error) + continue + + if required_symbols - symbols_found: + for sym in sorted(required_symbols - symbols_found): + error('symbol {!r} not found'.format(sym.decode('UTF-8'))) + + if errors: + sys.exit(1) + + if not symbol_table_found: + sys.stdout.write( + '{}: warning: no symbol table found (stripped object)\n'.format( + opts.object)) + sys.exit(77) + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/sysdeps/mach/hurd/enbl-secure.c b/elf/tst-tls-allocation-failure-static.c similarity index 57% rename from sysdeps/mach/hurd/enbl-secure.c rename to elf/tst-tls-allocation-failure-static.c index 8c02789ec..8de831b24 100644 --- a/sysdeps/mach/hurd/enbl-secure.c +++ b/elf/tst-tls-allocation-failure-static.c @@ -1,5 +1,5 @@ -/* Define and initialize the `__libc_enable_secure' flag. Hurd version. - Copyright (C) 1998-2022 Free Software Foundation, Inc. +/* Base for test program with impossibly large PT_TLS segment. + Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -16,15 +16,16 @@ License along with the GNU C Library; if not, see . */ -/* There is no need for this file in the Hurd; it is just a placeholder - to prevent inclusion of the sysdeps/generic version. - In the shared library, the `__libc_enable_secure' variable is defined - by the dynamic linker in dl-sysdep.c and set there. - In the static library, it is defined in init-first.c and set there. */ +/* The actual test binary is patched using scripts/tst-elf-edit.py + --maximize-tls-size, and this introduces the expected test + allocation failure due to an excessive PT_TLS p_memsz value. -#include + Patching the binary is required because on some 64-bit targets, TLS + relocations can only cover a 32-bit range, and glibc-internal TLS + variables such as errno end up outside that range. */ -void -__libc_init_secure (void) +int +main (void) { + return 0; } diff --git a/elf/tst-trace1.exp b/elf/tst-trace1.exp new file mode 100644 index 000000000..4a6f5211a --- /dev/null +++ b/elf/tst-trace1.exp @@ -0,0 +1,4 @@ +ld 1 +libc 1 +libtracemod2.so 0 +libtracemod3.so 0 diff --git a/elf/tst-trace2.exp b/elf/tst-trace2.exp new file mode 100644 index 000000000..e13506e2e --- /dev/null +++ b/elf/tst-trace2.exp @@ -0,0 +1,6 @@ +ld 1 +libc 1 +libtracemod2.so 1 +libtracemod3.so 0 +libtracemod4.so 0 +libtracemod5.so 0 diff --git a/elf/tst-trace3.exp b/elf/tst-trace3.exp new file mode 100644 index 000000000..e574549d1 --- /dev/null +++ b/elf/tst-trace3.exp @@ -0,0 +1,6 @@ +ld 1 +libc 1 +libtracemod2.so 1 +libtracemod3.so 1 +libtracemod4.so 0 +libtracemod5.so 0 diff --git a/elf/tst-trace4.exp b/elf/tst-trace4.exp new file mode 100644 index 000000000..31ca97b35 --- /dev/null +++ b/elf/tst-trace4.exp @@ -0,0 +1,6 @@ +ld 1 +libc 1 +libtracemod2.so 1 +libtracemod3.so 1 +libtracemod4.so 1 +libtracemod5.so 0 diff --git a/elf/tst-trace5.exp b/elf/tst-trace5.exp new file mode 100644 index 000000000..5d7d95372 --- /dev/null +++ 
b/elf/tst-trace5.exp @@ -0,0 +1,6 @@ +ld 1 +libc 1 +libtracemod2.so 1 +libtracemod3.so 1 +libtracemod4.so 1 +libtracemod5.so 1 diff --git a/iconv/gconv_parseconfdir.h b/iconv/gconv_parseconfdir.h index c0de54883..b72933b52 100644 --- a/iconv/gconv_parseconfdir.h +++ b/iconv/gconv_parseconfdir.h @@ -29,11 +29,14 @@ # define isspace(__c) __isspace_l ((__c), _nl_C_locobj_ptr) # define asprintf __asprintf # define opendir __opendir -# define readdir __readdir +# define readdir64 __readdir64 # define closedir __closedir # define mempcpy __mempcpy -# define lstat64 __lstat64 +# define struct_stat64 struct __stat64_t64 +# define lstat64 __lstat64_time64 # define feof_unlocked __feof_unlocked +#else +# define struct_stat64 struct stat64 #endif /* Name of the file containing the module information in the directories @@ -145,8 +148,8 @@ gconv_parseconfdir (const char *prefix, const char *dir, size_t dir_len) DIR *confdir = opendir (buf); if (confdir != NULL) { - struct dirent *ent; - while ((ent = readdir (confdir)) != NULL) + struct dirent64 *ent; + while ((ent = readdir64 (confdir)) != NULL) { if (ent->d_type != DT_REG && ent->d_type != DT_UNKNOWN) continue; @@ -158,7 +161,7 @@ gconv_parseconfdir (const char *prefix, const char *dir, size_t dir_len) && strcmp (ent->d_name + len - strlen (suffix), suffix) == 0) { char *conf; - struct stat64 st; + struct_stat64 st; if (asprintf (&conf, "%s/%s", buf, ent->d_name) < 0) continue; diff --git a/include/arpa/nameser.h b/include/arpa/nameser.h index 53f1dbc7c..c27e7886b 100644 --- a/include/arpa/nameser.h +++ b/include/arpa/nameser.h @@ -55,6 +55,12 @@ int __ns_name_ntop (const unsigned char *, char *, size_t) __THROW; int __ns_name_unpack (const unsigned char *, const unsigned char *, const unsigned char *, unsigned char *, size_t) __THROW; +/* Like ns_samename, but for uncompressed binary names. Return true + if the two arguments compare are equal as case-insensitive domain + names. 
*/ +_Bool __ns_samebinaryname (const unsigned char *, const unsigned char *) + attribute_hidden; + #define ns_msg_getflag(handle, flag) \ (((handle)._flags & _ns_flagdata[flag].mask) >> _ns_flagdata[flag].shift) @@ -89,5 +95,105 @@ libc_hidden_proto (__ns_name_unpack) extern __typeof (ns_samename) __libc_ns_samename; libc_hidden_proto (__libc_ns_samename) +/* Packet parser helper functions. */ + +/* Verify that P points to an uncompressed domain name in wire format. + On success, return the length of the encoded name, including the + terminating null byte. On failure, return -1 and set errno. EOM + must point one past the last byte in the packet. */ +int __ns_name_length_uncompressed (const unsigned char *p, + const unsigned char *eom) attribute_hidden; + +/* Iterator over the resource records in a DNS packet. */ +struct ns_rr_cursor +{ + /* These members are not changed after initialization. */ + const unsigned char *begin; /* First byte of packet. */ + const unsigned char *end; /* One past the last byte of the packet. */ + const unsigned char *first_rr; /* First resource record (or packet end). */ + + /* Advanced towards the end while reading the packet. */ + const unsigned char *current; +}; + +/* Returns the RCODE field from the DNS header. */ +static inline int +ns_rr_cursor_rcode (const struct ns_rr_cursor *c) +{ + return c->begin[3] & 0x0f; /* Lower 4 bits at offset 3. */ +} + +/* Returns the length of the answer section according to the DNS header. */ +static inline int +ns_rr_cursor_ancount (const struct ns_rr_cursor *c) +{ + return c->begin[6] * 256 + c->begin[7]; /* 16 bits at offset 6. */ +} + +/* Returns the length of the authority (name server) section according + to the DNS header. */ +static inline int +ns_rr_cursor_nscount (const struct ns_rr_cursor *c) +{ + return c->begin[8] * 256 + c->begin[9]; /* 16 bits at offset 8. */ +} + +/* Returns the length of the additional data section according to the + DNS header. 
*/ +static inline int +ns_rr_cursor_adcount (const struct ns_rr_cursor *c) +{ + return c->begin[10] * 256 + c->begin[11]; /* 16 bits at offset 10. */ +} + +/* Returns a pointer to the uncompressed question name in wire + format. */ +static inline const unsigned char * +ns_rr_cursor_qname (const struct ns_rr_cursor *c) +{ + return c->begin + 12; /* QNAME starts right after the header. */ +} + +/* Returns the question type of the first and only question. */ +static inline int +ns_rr_cursor_qtype (const struct ns_rr_cursor *c) +{ + /* 16 bits 4 bytes back from the first RR header start. */ + return c->first_rr[-4] * 256 + c->first_rr[-3]; +} + +/* Returns the class of the first and only question (usually C_IN). */ +static inline int +ns_rr_cursor_qclass (const struct ns_rr_cursor *c) +{ + /* 16 bits 2 bytes back from the first RR header start. */ + return c->first_rr[-2] * 256 + c->first_rr[-1]; +} + +/* Initializes *C to cover the packet [BUF, BUF+LEN). Returns false + if LEN is less than sizeof (*HD), if the packet does not contain a + full (uncompressed) question, or if the question count is not 1. */ +_Bool __ns_rr_cursor_init (struct ns_rr_cursor *c, + const unsigned char *buf, size_t len) + attribute_hidden; + +/* Like ns_rr, but the record owner name is not decoded into text format. */ +struct ns_rr_wire +{ + unsigned char rname[NS_MAXCDNAME]; /* Owner name of the record. */ + uint16_t rtype; /* Resource record type (T_*). */ + uint16_t rclass; /* Resource record class (C_*). */ + uint32_t ttl; /* Time-to-live field. */ + const unsigned char *rdata; /* Start of resource record data. */ + uint16_t rdlength; /* Length of the data at rdata, in bytes. */ +}; + +/* Attempts to parse the record at C into *RR. On success, return + true, and C is advanced past the record, and RR->rdata points to + the record data. On failure, errno is set to EMSGSIZE, and false + is returned. 
*/ +_Bool __ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr) + attribute_hidden; + # endif /* !_ISOMAC */ #endif diff --git a/include/bits/stdio2-decl.h b/include/bits/stdio2-decl.h new file mode 100644 index 000000000..bbb052f19 --- /dev/null +++ b/include/bits/stdio2-decl.h @@ -0,0 +1 @@ +#include diff --git a/include/bits/wchar2-decl.h b/include/bits/wchar2-decl.h new file mode 100644 index 000000000..00b1b9334 --- /dev/null +++ b/include/bits/wchar2-decl.h @@ -0,0 +1 @@ +#include diff --git a/include/libc-internal.h b/include/libc-internal.h index 15920d2bd..c052bccb2 100644 --- a/include/libc-internal.h +++ b/include/libc-internal.h @@ -21,9 +21,6 @@ #include -/* Initialize the `__libc_enable_secure' flag. */ -extern void __libc_init_secure (void); - /* Discover the tick frequency of the machine if something goes wrong, we return 0, an impossible hertz. */ extern int __profile_frequency (void); diff --git a/include/register-atfork.h b/include/register-atfork.h index be631137b..5ebe5a0b3 100644 --- a/include/register-atfork.h +++ b/include/register-atfork.h @@ -26,6 +26,7 @@ struct fork_handler void (*parent_handler) (void); void (*child_handler) (void); void *dso_handle; + uint64_t id; }; /* Function to call to unregister fork handlers. */ @@ -39,19 +40,18 @@ enum __run_fork_handler_type atfork_run_parent }; -/* Run the atfork handlers and lock/unlock the internal lock depending - of the WHO argument: - - - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of - insertion and locks the internal lock. - - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal - lock. - - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal - lock. - - Perform locking only if DO_LOCKING. */ -extern void __run_fork_handlers (enum __run_fork_handler_type who, - _Bool do_locking) attribute_hidden; +/* Run the atfork prepare handlers in the reverse order of registration and + return the ID of the last registered handler. 
If DO_LOCKING is true, the + internal lock is held locked upon return. */ +extern uint64_t __run_prefork_handlers (_Bool do_locking) attribute_hidden; + +/* Given a handler type (parent or child), run all the atfork handlers in + the order of registration up to and including the handler with id equal + to LASTRUN. If DO_LOCKING is true, the internal lock is unlocked prior + to return. */ +extern void __run_postfork_handlers (enum __run_fork_handler_type who, + _Bool do_locking, + uint64_t lastrun) attribute_hidden; /* C library side function to register new fork handlers. */ extern int __register_atfork (void (*__prepare) (void), diff --git a/include/resolv.h b/include/resolv.h index 3590b6f49..4dbbac380 100644 --- a/include/resolv.h +++ b/include/resolv.h @@ -70,5 +70,8 @@ libc_hidden_proto (__libc_res_nameinquery) extern __typeof (__res_queriesmatch) __libc_res_queriesmatch; libc_hidden_proto (__libc_res_queriesmatch) +/* Variant of res_hnok which operates on binary (but uncompressed) names. */ +bool __res_binary_hnok (const unsigned char *dn) attribute_hidden; + # endif /* _RESOLV_H_ && !_ISOMAC */ #endif diff --git a/include/unistd.h b/include/unistd.h index 709016960..af795a37c 100644 --- a/include/unistd.h +++ b/include/unistd.h @@ -192,7 +192,6 @@ libc_hidden_proto (__tcsetpgrp) and some functions contained in the C library ignore various environment variables that normally affect them. 
*/ extern int __libc_enable_secure attribute_relro; -extern int __libc_enable_secure_decided; rtld_hidden_proto (__libc_enable_secure) diff --git a/inet/ruserpass.c b/inet/ruserpass.c index d61a72877..75e2a0655 100644 --- a/inet/ruserpass.c +++ b/inet/ruserpass.c @@ -95,7 +95,7 @@ ruserpass (const char *host, const char **aname, const char **apass) char *hdir, *buf, *tmp; char myname[1024], *mydomain; int t, usedefault = 0; - struct stat64 stb; + struct __stat64_t64 stb; hdir = __libc_secure_getenv("HOME"); if (hdir == NULL) { @@ -174,7 +174,7 @@ next: break; case PASSWD: if (strcmp(*aname, "anonymous") && - __fstat64(fileno(cfile), &stb) >= 0 && + __fstat64_time64(fileno(cfile), &stb) >= 0 && (stb.st_mode & 077) != 0) { warnx(_("Error: .netrc file is readable by others.")); warnx(_("Remove 'password' line or make file unreadable by others.")); diff --git a/io/Makefile b/io/Makefile index cf265dc9b..b1710407d 100644 --- a/io/Makefile +++ b/io/Makefile @@ -83,16 +83,17 @@ tests := test-utime test-stat test-stat2 test-lfs tst-getcwd \ tst-ftw-bz28126 tests-time64 := \ + tst-fcntl-time64 \ + tst-fts-time64 \ tst-futimens-time64 \ tst-futimes-time64\ - tst-fts-time64 \ + tst-futimesat-time64 \ + tst-lchmod-time64 \ tst-lutimes-time64 \ tst-stat-time64 \ - tst-futimesat-time64 \ tst-utime-time64 \ tst-utimensat-time64 \ tst-utimes-time64 \ - tst-fcntl-time64 \ # tests-time64 # Likewise for statx, but we do not need static linking here. @@ -136,6 +137,7 @@ CFLAGS-close.c += -fexceptions -fasynchronous-unwind-tables CFLAGS-test-stat.c += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE CFLAGS-test-lfs.c += -D_LARGEFILE64_SOURCE +CFLAGS-tst-lchmod.c += -D_FILE_OFFSET_BITS=64 test-stat2-ARGS = Makefile . 
$(objpfx)test-stat2 diff --git a/io/tst-lchmod-time64.c b/io/tst-lchmod-time64.c new file mode 100644 index 000000000..f2b7cc9d3 --- /dev/null +++ b/io/tst-lchmod-time64.c @@ -0,0 +1,2 @@ +#define CHECK_TIME64 +#include "tst-lchmod.c" diff --git a/io/tst-lchmod.c b/io/tst-lchmod.c index c644f565f..c1c41bda8 100644 --- a/io/tst-lchmod.c +++ b/io/tst-lchmod.c @@ -66,10 +66,27 @@ select_path (bool do_relative_path, const char *full_path, const char *relative_ return full_path; } +static void +update_file_time_to_y2038 (const char *fname, int flags) +{ +#ifdef CHECK_TIME64 + /* Y2038 threshold plus 1 second. */ + const struct timespec ts[] = { { 0x80000001LL, 0}, { 0x80000001LL } }; + TEST_VERIFY_EXIT (utimensat (AT_FDCWD, fname, ts, flags) == 0); +#endif +} + static void test_1 (bool do_relative_path, int (*chmod_func) (int fd, const char *, mode_t, int)) { char *tempdir = support_create_temp_directory ("tst-lchmod-"); +#ifdef CHECK_TIME64 + if (!support_path_support_time64 (tempdir)) + { + puts ("info: test skipped, filesystem does not support 64 bit time_t"); + return; + } +#endif char *path_dangling = xasprintf ("%s/dangling", tempdir); char *path_file = xasprintf ("%s/file", tempdir); @@ -93,9 +110,12 @@ test_1 (bool do_relative_path, int (*chmod_func) (int fd, const char *, mode_t, xsymlink ("loop", path_loop); xsymlink ("target-does-not-exist", path_dangling); + update_file_time_to_y2038 (path_file, 0); + update_file_time_to_y2038 (path_to_file, AT_SYMLINK_NOFOLLOW); + /* Check that the modes do not collide with what we will use in the test. 
*/ - struct stat64 st; + struct stat st; xstat (path_file, &st); TEST_VERIFY ((st.st_mode & 0777) != 1); xlstat (path_to_file, &st); diff --git a/io/tst-stat.c b/io/tst-stat.c index 2b7975e16..237988203 100644 --- a/io/tst-stat.c +++ b/io/tst-stat.c @@ -69,6 +69,10 @@ do_test (void) TEST_VERIFY_EXIT (fd >= 0); support_write_file_string (path, "abc"); + /* This should help to prevent delayed allocation, which may result + in a spurious stx_blocks/st_blocks difference. */ + fsync (fd); + bool check_ns = support_stat_nanoseconds (path); if (!check_ns) printf ("warning: timestamp with nanoseconds not supported\n"); diff --git a/libio/Makefile b/libio/Makefile index 0e5f348be..31831aea8 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -23,7 +23,7 @@ subdir := libio include ../Makeconfig headers := stdio.h \ - bits/stdio.h bits/stdio2.h bits/stdio-ldbl.h \ + bits/stdio.h bits/stdio2.h bits/stdio2-decl.h bits/stdio-ldbl.h \ bits/types/FILE.h bits/types/__FILE.h bits/types/struct_FILE.h \ bits/types/__fpos_t.h bits/types/__fpos64_t.h \ bits/types/cookie_io_functions_t.h diff --git a/libio/bits/stdio2-decl.h b/libio/bits/stdio2-decl.h new file mode 100644 index 000000000..e398f7182 --- /dev/null +++ b/libio/bits/stdio2-decl.h @@ -0,0 +1,111 @@ +/* Checking macros for stdio functions. Declarations only. + Copyright (C) 2004-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _BITS_STDIO2_DEC_H +#define _BITS_STDIO2_DEC_H 1 + +#ifndef _STDIO_H +# error "Never include directly; use instead." +#endif + +extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen, + const char *__restrict __format, ...) __THROW + __attr_access ((__write_only__, 1, 3)); +extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen, + const char *__restrict __format, + __gnuc_va_list __ap) __THROW + __attr_access ((__write_only__, 1, 3)); + +#if defined __USE_ISOC99 || defined __USE_UNIX98 + +extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag, + size_t __slen, const char *__restrict __format, + ...) __THROW + __attr_access ((__write_only__, 1, 2)); +extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag, + size_t __slen, const char *__restrict __format, + __gnuc_va_list __ap) __THROW + __attr_access ((__write_only__, 1, 2)); + +#endif + +#if __USE_FORTIFY_LEVEL > 1 + +extern int __fprintf_chk (FILE *__restrict __stream, int __flag, + const char *__restrict __format, ...); +extern int __printf_chk (int __flag, const char *__restrict __format, ...); +extern int __vfprintf_chk (FILE *__restrict __stream, int __flag, + const char *__restrict __format, __gnuc_va_list __ap); +extern int __vprintf_chk (int __flag, const char *__restrict __format, + __gnuc_va_list __ap); + +# ifdef __USE_XOPEN2K8 +extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt, + ...) __attribute__ ((__format__ (__printf__, 3, 4))); +extern int __vdprintf_chk (int __fd, int __flag, + const char *__restrict __fmt, __gnuc_va_list __arg) + __attribute__ ((__format__ (__printf__, 3, 0))); +# endif + +# ifdef __USE_GNU + +extern int __asprintf_chk (char **__restrict __ptr, int __flag, + const char *__restrict __fmt, ...) 
+ __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur; +extern int __vasprintf_chk (char **__restrict __ptr, int __flag, + const char *__restrict __fmt, __gnuc_va_list __arg) + __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur; +extern int __obstack_printf_chk (struct obstack *__restrict __obstack, + int __flag, const char *__restrict __format, + ...) + __THROW __attribute__ ((__format__ (__printf__, 3, 4))); +extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack, + int __flag, + const char *__restrict __format, + __gnuc_va_list __args) + __THROW __attribute__ ((__format__ (__printf__, 3, 0))); + +# endif +#endif + +#if __GLIBC_USE (DEPRECATED_GETS) +extern char *__gets_chk (char *__str, size_t) __wur; +#endif + +extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n, + FILE *__restrict __stream) + __wur __attr_access ((__write_only__, 1, 3)); + +extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen, + size_t __size, size_t __n, + FILE *__restrict __stream) __wur; + +#ifdef __USE_GNU +extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size, + int __n, FILE *__restrict __stream) + __wur __attr_access ((__write_only__, 1, 3)); +#endif + +#ifdef __USE_MISC +# undef fread_unlocked +extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen, + size_t __size, size_t __n, + FILE *__restrict __stream) __wur; +#endif + +#endif /* bits/stdio2-decl.h. */ diff --git a/libio/bits/stdio2.h b/libio/bits/stdio2.h index b0b655ee7..b1e200e71 100644 --- a/libio/bits/stdio2.h +++ b/libio/bits/stdio2.h @@ -23,14 +23,6 @@ # error "Never include directly; use instead." #endif -extern int __sprintf_chk (char *__restrict __s, int __flag, size_t __slen, - const char *__restrict __format, ...) 
__THROW - __attr_access ((__write_only__, 1, 3)); -extern int __vsprintf_chk (char *__restrict __s, int __flag, size_t __slen, - const char *__restrict __format, - __gnuc_va_list __ap) __THROW - __attr_access ((__write_only__, 1, 3)); - #ifdef __va_arg_pack __fortify_function int __NTH (sprintf (char *__restrict __s, const char *__restrict __fmt, ...)) @@ -54,15 +46,6 @@ __NTH (vsprintf (char *__restrict __s, const char *__restrict __fmt, } #if defined __USE_ISOC99 || defined __USE_UNIX98 - -extern int __snprintf_chk (char *__restrict __s, size_t __n, int __flag, - size_t __slen, const char *__restrict __format, - ...) __THROW - __attr_access ((__write_only__, 1, 2)); -extern int __vsnprintf_chk (char *__restrict __s, size_t __n, int __flag, - size_t __slen, const char *__restrict __format, - __gnuc_va_list __ap) __THROW; - # ifdef __va_arg_pack __fortify_function int __NTH (snprintf (char *__restrict __s, size_t __n, @@ -89,15 +72,6 @@ __NTH (vsnprintf (char *__restrict __s, size_t __n, #endif #if __USE_FORTIFY_LEVEL > 1 - -extern int __fprintf_chk (FILE *__restrict __stream, int __flag, - const char *__restrict __format, ...); -extern int __printf_chk (int __flag, const char *__restrict __format, ...); -extern int __vfprintf_chk (FILE *__restrict __stream, int __flag, - const char *__restrict __format, __gnuc_va_list __ap); -extern int __vprintf_chk (int __flag, const char *__restrict __format, - __gnuc_va_list __ap); - # ifdef __va_arg_pack __fortify_function int fprintf (FILE *__restrict __stream, const char *__restrict __fmt, ...) @@ -136,12 +110,6 @@ vfprintf (FILE *__restrict __stream, } # ifdef __USE_XOPEN2K8 -extern int __dprintf_chk (int __fd, int __flag, const char *__restrict __fmt, - ...) 
__attribute__ ((__format__ (__printf__, 3, 4))); -extern int __vdprintf_chk (int __fd, int __flag, - const char *__restrict __fmt, __gnuc_va_list __arg) - __attribute__ ((__format__ (__printf__, 3, 0))); - # ifdef __va_arg_pack __fortify_function int dprintf (int __fd, const char *__restrict __fmt, ...) @@ -162,23 +130,6 @@ vdprintf (int __fd, const char *__restrict __fmt, __gnuc_va_list __ap) # endif # ifdef __USE_GNU - -extern int __asprintf_chk (char **__restrict __ptr, int __flag, - const char *__restrict __fmt, ...) - __THROW __attribute__ ((__format__ (__printf__, 3, 4))) __wur; -extern int __vasprintf_chk (char **__restrict __ptr, int __flag, - const char *__restrict __fmt, __gnuc_va_list __arg) - __THROW __attribute__ ((__format__ (__printf__, 3, 0))) __wur; -extern int __obstack_printf_chk (struct obstack *__restrict __obstack, - int __flag, const char *__restrict __format, - ...) - __THROW __attribute__ ((__format__ (__printf__, 3, 4))); -extern int __obstack_vprintf_chk (struct obstack *__restrict __obstack, - int __flag, - const char *__restrict __format, - __gnuc_va_list __args) - __THROW __attribute__ ((__format__ (__printf__, 3, 0))); - # ifdef __va_arg_pack __fortify_function int __NTH (asprintf (char **__restrict __ptr, const char *__restrict __fmt, ...)) @@ -231,7 +182,6 @@ __NTH (obstack_vprintf (struct obstack *__restrict __obstack, #endif #if __GLIBC_USE (DEPRECATED_GETS) -extern char *__gets_chk (char *__str, size_t) __wur; extern char *__REDIRECT (__gets_warn, (char *__str), gets) __wur __warnattr ("please use fgets or getline instead, gets can't " "specify buffer size"); @@ -245,9 +195,6 @@ gets (char *__str) } #endif -extern char *__fgets_chk (char *__restrict __s, size_t __size, int __n, - FILE *__restrict __stream) - __wur __attr_access ((__write_only__, 1, 3)); extern char *__REDIRECT (__fgets_alias, (char *__restrict __s, int __n, FILE *__restrict __stream), fgets) @@ -269,9 +216,6 @@ fgets (char *__restrict __s, int __n, FILE 
*__restrict __stream) return __fgets_chk (__s, sz, __n, __stream); } -extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen, - size_t __size, size_t __n, - FILE *__restrict __stream) __wur; extern size_t __REDIRECT (__fread_alias, (void *__restrict __ptr, size_t __size, size_t __n, FILE *__restrict __stream), @@ -297,9 +241,6 @@ fread (void *__restrict __ptr, size_t __size, size_t __n, } #ifdef __USE_GNU -extern char *__fgets_unlocked_chk (char *__restrict __s, size_t __size, - int __n, FILE *__restrict __stream) - __wur __attr_access ((__write_only__, 1, 3)); extern char *__REDIRECT (__fgets_unlocked_alias, (char *__restrict __s, int __n, FILE *__restrict __stream), fgets_unlocked) @@ -324,9 +265,6 @@ fgets_unlocked (char *__restrict __s, int __n, FILE *__restrict __stream) #ifdef __USE_MISC # undef fread_unlocked -extern size_t __fread_unlocked_chk (void *__restrict __ptr, size_t __ptrlen, - size_t __size, size_t __n, - FILE *__restrict __stream) __wur; extern size_t __REDIRECT (__fread_unlocked_alias, (void *__restrict __ptr, size_t __size, size_t __n, FILE *__restrict __stream), diff --git a/libio/stdio.h b/libio/stdio.h index e6425341c..0e0f16b46 100644 --- a/libio/stdio.h +++ b/libio/stdio.h @@ -885,20 +885,27 @@ extern void funlockfile (FILE *__stream) __THROW; extern int __uflow (FILE *); extern int __overflow (FILE *, int); +#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function +/* Declare all functions from bits/stdio2-decl.h first. */ +# include <bits/stdio2-decl.h> +#endif + +/* The following headers provide asm redirections. These redirections must + appear before the first usage of these functions, e.g. in bits/stdio.h. */ +#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1 +# include <bits/stdio-ldbl.h> +#endif + /* If we are compiling with optimizing read this file. It contains several optimizing inline functions and macros.
*/ #ifdef __USE_EXTERN_INLINES # include <bits/stdio.h> #endif #if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function +/* Now include the function definitions and redirects too. */ # include <bits/stdio2.h> #endif -#include <bits/floatn.h> -#if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1 -# include <bits/stdio-ldbl.h> -#endif - __END_DECLS #endif /* <stdio.h> included. */ diff --git a/locale/programs/ld-monetary.c b/locale/programs/ld-monetary.c index 3b0412b40..18698bbe9 100644 --- a/locale/programs/ld-monetary.c +++ b/locale/programs/ld-monetary.c @@ -196,21 +196,105 @@ No definition for %s category found"), "LC_MONETARY"); } } + /* Generally speaking there are 3 standards that define the default, + warning, and error behaviour of LC_MONETARY. They are ISO/IEC TR 30112, + ISO/IEC 9899:2018 (ISO C17), and POSIX.1-2017. Within 30112 we have the + definition of a standard i18n FDCC-set, which for LC_MONETARY has the + following default values: + int_curr_symbol "" + currency_symbol "" + mon_decimal_point "<U002C>" i.e. "," + mon_thousands_sep "" + mon_grouping "\177" i.e. CHAR_MAX + positive_sign "" + negative_sign "<U002E>" i.e. "." + int_frac_digits -1 + frac_digits -1 + p_cs_precedes -1 + p_sep_by_space -1 + n_cs_precedes -1 + n_sep_by_space -1 + p_sign_posn -1 + n_sign_posn -1 + Under 30112 a keyword that is not provided implies an empty string "" + for string values or a -1 for integer values, and indicates the value + is unspecified with no default implied. No errors are considered. + The exception is mon_grouping which is a string with a terminating + CHAR_MAX. + For POSIX Issue 7 we have: + https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html + and again values not provided default to "" or -1, and indicate the value + is not available to the locale. The exception is mon_grouping which is + a string with a terminating CHAR_MAX. For the POSIX locale the values of + LC_MONETARY should be: + int_curr_symbol "" + currency_symbol "" + mon_decimal_point "" + mon_thousands_sep "" + mon_grouping "\177" i.e.
CHAR_MAX + positive_sign "" + negative_sign "" + int_frac_digits -1 + frac_digits -1 + p_cs_precedes -1 + p_sep_by_space -1 + n_cs_precedes -1 + n_sep_by_space -1 + p_sign_posn -1 + n_sign_posn -1 + int_p_cs_precedes -1 + int_p_sep_by_space -1 + int_n_cs_precedes -1 + int_n_sep_by_space -1 + int_p_sign_posn -1 + int_n_sign_posn -1 + Like with 30112, POSIX also considers no error if the keywords are + missing, only that if the category as a whole is missing the referencing + of the category results in unspecified behaviour. + For ISO C17 there is no default value provided, but the localeconv + specification in 7.11.2.1 admits that members of char * type may point + to "" to indicate a value is not available or is of length zero. + The exception is decimal_point (not mon_decimal_point) which must be a + defined non-empty string. The values of char, which are generally + mapped to integer values in 30112 and POSIX, must be non-negative + numbers that map to CHAR_MAX when a value is not available in the + locale. + In ISO C17 for the "C" locale all values are empty strings "", or + CHAR_MAX, with the exception of decimal_point which is "." (defined + in LC_NUMERIC). ISO C17 makes no exception for mon_grouping like + 30112 and POSIX, but a value of "" is functionally equivalent to + "\177" since neither defines a grouping (though the latter terminates + the grouping). + + Lastly, we must consider the legacy C/POSIX locale that is implemented + as a builtin in glibc and whether a default value mapping to the + C/POSIX locale may benefit the user from a compatibility perspective. + + Thus given 30112, POSIX, ISO C, and the builtin C/POSIX locale we + need to pick appropriate defaults below. */ + + /* The members of LC_MONETARY are handled in the order of their definition + in locale/categories.def. Please keep them in that order. */ + + /* The purpose of TEST_ELEM is to define a default value for the fields + in the category if the field was not defined in the category.
If the + category was present but we didn't see a definition for the field then + we also issue a warning, otherwise the only warning you get is the one + earlier when a default category is created (completely missing category). + This missing field warning is glibc-specific since no standard requires + this warning, but we consider it valuable to print a warning for all + missing fields in the category. */ #define TEST_ELEM(cat, initval) \ if (monetary->cat == NULL) \ { \ if (! nothing) \ - record_error (0, 0, _("%s: field `%s' not defined"), \ - "LC_MONETARY", #cat); \ + record_warning (_("%s: field `%s' not defined"), \ + "LC_MONETARY", #cat); \ monetary->cat = initval; \ } + /* Keyword: int_curr_symbol. */ TEST_ELEM (int_curr_symbol, ""); - TEST_ELEM (currency_symbol, ""); - TEST_ELEM (mon_thousands_sep, ""); - TEST_ELEM (positive_sign, ""); - TEST_ELEM (negative_sign, ""); - /* The international currency symbol must come from ISO 4217. */ if (monetary->int_curr_symbol != NULL) { @@ -247,41 +331,63 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"), } } - /* The decimal point must not be empty. This is not said explicitly - in POSIX but ANSI C (ISO/IEC 9899) says in 4.4.2.1 it has to be - != "". */ + /* Keyword: currency_symbol */ + TEST_ELEM (currency_symbol, ""); + + /* Keyword: mon_decimal_point */ + /* ISO C17 7.11.2.1.3 explicitly allows mon_decimal_point to be the + empty string e.g. "". This indicates the value is not available in the + current locale or is of zero length. However, if the value was never + defined then we issue a warning and use a glibc-specific default. ISO + 30112 in the i18n FDCC-Set uses ",", and POSIX Issue 7 in the + POSIX locale uses "". It is specific to glibc that the default is + "."; we retain this existing behaviour for backwards compatibility. */ if (monetary->mon_decimal_point == NULL) { if (! 
nothing) - record_error (0, 0, _("%s: field `%s' not defined"), - "LC_MONETARY", "mon_decimal_point"); + record_warning (_("%s: field `%s' not defined, using defaults"), + "LC_MONETARY", "mon_decimal_point"); monetary->mon_decimal_point = "."; monetary->mon_decimal_point_wc = L'.'; } - else if (monetary->mon_decimal_point[0] == '\0' && ! be_quiet && ! nothing) + + /* Keyword: mon_thousands_sep */ + if (monetary->mon_thousands_sep == NULL) { - record_error (0, 0, _("\ -%s: value for field `%s' must not be an empty string"), - "LC_MONETARY", "mon_decimal_point"); + if (! nothing) + record_warning (_("%s: field `%s' not defined, using defaults"), + "LC_MONETARY", "mon_thousands_sep"); + monetary->mon_thousands_sep = ""; + monetary->mon_thousands_sep_wc = L'\0'; } + /* Keyword: mon_grouping */ if (monetary->mon_grouping_len == 0) { if (! nothing) - record_error (0, 0, _("%s: field `%s' not defined"), - "LC_MONETARY", "mon_grouping"); - + record_warning (_("%s: field `%s' not defined"), + "LC_MONETARY", "mon_grouping"); + /* Missing entries are given 1 element in their bytearray with + a value of CHAR_MAX which indicates that "No further grouping + is to be performed" (functionally equivalent to ISO C's "C" + locale default of ""). */ monetary->mon_grouping = (char *) "\177"; monetary->mon_grouping_len = 1; } + /* Keyword: positive_sign */ + TEST_ELEM (positive_sign, ""); + + /* Keyword: negative_sign */ + TEST_ELEM (negative_sign, ""); + #undef TEST_ELEM #define TEST_ELEM(cat, min, max, initval) \ if (monetary->cat == -2) \ { \ if (! 
nothing) \ - record_error (0, 0, _("%s: field `%s' not defined"), \ - "LC_MONETARY", #cat); \ + record_warning (_("%s: field `%s' not defined"), \ + "LC_MONETARY", #cat); \ monetary->cat = initval; \ } \ else if ((monetary->cat < min || monetary->cat > max) \ @@ -300,16 +406,11 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"), TEST_ELEM (p_sign_posn, -1, 4, -1); TEST_ELEM (n_sign_posn, -1, 4, -1); - /* The non-POSIX.2 extensions are optional. */ - if (monetary->duo_int_curr_symbol == NULL) - monetary->duo_int_curr_symbol = monetary->int_curr_symbol; - if (monetary->duo_currency_symbol == NULL) - monetary->duo_currency_symbol = monetary->currency_symbol; - - if (monetary->duo_int_frac_digits == -2) - monetary->duo_int_frac_digits = monetary->int_frac_digits; - if (monetary->duo_frac_digits == -2) - monetary->duo_frac_digits = monetary->frac_digits; + /* Keyword: crncystr */ + monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol) + + 2); + monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+'; + strcpy (&monetary->crncystr[1], monetary->currency_symbol); #undef TEST_ELEM #define TEST_ELEM(cat, alt, min, max) \ @@ -327,6 +428,17 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"), TEST_ELEM (int_p_sign_posn, p_sign_posn, -1, 4); TEST_ELEM (int_n_sign_posn, n_sign_posn, -1, 4); + /* The non-POSIX.2 extensions are optional. 
*/ + if (monetary->duo_int_curr_symbol == NULL) + monetary->duo_int_curr_symbol = monetary->int_curr_symbol; + if (monetary->duo_currency_symbol == NULL) + monetary->duo_currency_symbol = monetary->currency_symbol; + + if (monetary->duo_int_frac_digits == -2) + monetary->duo_int_frac_digits = monetary->int_frac_digits; + if (monetary->duo_frac_digits == -2) + monetary->duo_frac_digits = monetary->frac_digits; + TEST_ELEM (duo_p_cs_precedes, p_cs_precedes, -1, 1); TEST_ELEM (duo_p_sep_by_space, p_sep_by_space, -1, 2); TEST_ELEM (duo_n_cs_precedes, n_cs_precedes, -1, 1); @@ -349,17 +461,15 @@ not correspond to a valid name in ISO 4217 [--no-warnings=intcurrsym]"), if (monetary->duo_valid_to == 0) monetary->duo_valid_to = 99991231; + /* Keyword: conversion_rate */ if (monetary->conversion_rate[0] == 0) { monetary->conversion_rate[0] = 1; monetary->conversion_rate[1] = 1; } - /* Create the crncystr entry. */ - monetary->crncystr = (char *) xmalloc (strlen (monetary->currency_symbol) - + 2); - monetary->crncystr[0] = monetary->p_cs_precedes ? '-' : '+'; - strcpy (&monetary->crncystr[1], monetary->currency_symbol); + /* A value for monetary-decimal-point-wc was set when + monetary_decimal_point was set, likewise for monetary-thousands-sep-wc. 
*/ } diff --git a/locale/programs/locarchive.c b/locale/programs/locarchive.c index 45408c26c..eeb2fa6ff 100644 --- a/locale/programs/locarchive.c +++ b/locale/programs/locarchive.c @@ -1397,7 +1397,7 @@ add_locales_to_archive (size_t nlist, char *list[], bool replace) { char fullname[fnamelen + 2 * strlen (d->d_name) + 7]; - if (d_type == DT_UNKNOWN) + if (d_type == DT_UNKNOWN || d_type == DT_LNK) { strcpy (stpcpy (stpcpy (fullname, fname), "/"), d->d_name); diff --git a/localedata/Makefile b/localedata/Makefile index 9ae2e5c16..7741ac3b5 100644 --- a/localedata/Makefile +++ b/localedata/Makefile @@ -468,11 +468,11 @@ define build-one-locale endef $(INSTALL-SUPPORTED-LOCALE-ARCHIVE): install-locales-dir - @flags="-c"; \ + @flags=""; \ $(build-one-locale) $(INSTALL-SUPPORTED-LOCALE-FILES): install-locales-dir - @flags="-c --no-archive --no-hard-links"; \ + @flags="--no-archive --no-hard-links"; \ $(build-one-locale) tst-setlocale-ENV = LC_ALL=ja_JP.EUC-JP diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh index 7fce35f21..8053c816a 100644 --- a/localedata/gen-locale.sh +++ b/localedata/gen-locale.sh @@ -54,8 +54,14 @@ modifier=`echo $locfile|sed 's|[^.]*[.]\([^@ ]*\)\(@[^ ]*\)\?/LC_CTYPE|\2|'` echo "Generating locale $locale.$charmap: this might take a while..." -# Run quietly and force output. -flags="--quiet -c" +# Do not force output with '-c', all locales should compile without +# warnings or errors. There is likewise no need to run quietly with +# '--quiet' since all locales should compile without additional +# diagnostics. If there are messages printed then we want to see +# them, fix them, and the associated error or warning. During +# development it may be beneficial to put '--quiet -c' here to allow +# you to develop in-progress locales. +flags="" # For SJIS the charmap is SHIFT_JIS.
We just want the locale to have # a slightly nicer name instead of using "*.SHIFT_SJIS", but that diff --git a/malloc/malloc.c b/malloc/malloc.c index 1a1ac1d8f..fe9cb9b80 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -292,19 +292,14 @@ # define __assert_fail(assertion, file, line, function) \ __malloc_assert(assertion, file, line, function) -extern const char *__progname; - -static void +_Noreturn static void __malloc_assert (const char *assertion, const char *file, unsigned int line, const char *function) { - (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n", - __progname, __progname[0] ? ": " : "", - file, line, - function ? function : "", function ? ": " : "", - assertion); - fflush (stderr); - abort (); + __libc_message (do_abort, "\ +Fatal glibc error: malloc assertion failure in %s: %s\n", + function, assertion); + __builtin_unreachable (); } #endif #endif diff --git a/misc/daemon.c b/misc/daemon.c index 0e688f4d7..3c73ac2ab 100644 --- a/misc/daemon.c +++ b/misc/daemon.c @@ -61,11 +61,10 @@ daemon (int nochdir, int noclose) (void)__chdir("/"); if (!noclose) { - struct stat64 st; + struct __stat64_t64 st; if ((fd = __open_nocancel(_PATH_DEVNULL, O_RDWR, 0)) != -1 - && (__builtin_expect (__fstat64 (fd, &st), 0) - == 0)) { + && __glibc_likely (__fstat64_time64 (fd, &st) == 0)) { if (__builtin_expect (S_ISCHR (st.st_mode), 1) != 0 #if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR && (st.st_rdev diff --git a/misc/getusershell.c b/misc/getusershell.c index 11f5aa83f..4221095dc 100644 --- a/misc/getusershell.c +++ b/misc/getusershell.c @@ -97,7 +97,7 @@ initshells (void) { char **sp, *cp; FILE *fp; - struct stat64 statb; + struct __stat64_t64 statb; size_t flen; free(shells); @@ -106,7 +106,7 @@ initshells (void) strings = NULL; if ((fp = fopen(_PATH_SHELLS, "rce")) == NULL) goto init_okshells_noclose; - if (__fstat64(fileno(fp), &statb) == -1) { + if (__fstat64_time64(fileno(fp), &statb) == -1) { init_okshells: (void)fclose(fp); 
init_okshells_noclose: diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h index 44d3826bc..1c2b044a0 100644 --- a/misc/sys/cdefs.h +++ b/misc/sys/cdefs.h @@ -162,13 +162,13 @@ || (__builtin_constant_p (__l) && (__l) > 0)) /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ - condition can be folded to a constant and if it is true. The -1 check is - redundant because since it implies that __glibc_safe_len_cond is true. */ + condition can be folded to a constant and if it is true, or unknown (-1) */ #define __glibc_safe_or_unknown_len(__l, __s, __osz) \ - (__glibc_unsigned_or_positive (__l) \ - && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \ - __s, __osz)) \ - && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), __s, __osz)) + ((__builtin_constant_p (__osz) && (__osz) == (__SIZE_TYPE__) -1) \ + || (__glibc_unsigned_or_positive (__l) \ + && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \ + (__s), (__osz))) \ + && __glibc_safe_len_cond ((__SIZE_TYPE__) (__l), (__s), (__osz)))) /* Conversely, we know at compile time that the length is unsafe if the __L * __S <= __OBJSZ condition can be folded to a constant and if it is diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c index 34a33164f..01a282f3f 100644 --- a/nptl/allocatestack.c +++ b/nptl/allocatestack.c @@ -119,8 +119,6 @@ get_cached_stack (size_t *sizep, void **memp) /* Cancellation handling is back to the default. 
*/ result->cancelhandling = 0; - result->cancelstate = PTHREAD_CANCEL_ENABLE; - result->canceltype = PTHREAD_CANCEL_DEFERRED; result->cleanup = NULL; result->setup_failed = 0; diff --git a/nptl/cancellation.c b/nptl/cancellation.c index 8d54a6add..f4b08902b 100644 --- a/nptl/cancellation.c +++ b/nptl/cancellation.c @@ -30,19 +30,26 @@ int __pthread_enable_asynccancel (void) { struct pthread *self = THREAD_SELF; + int oldval = atomic_load_relaxed (&self->cancelhandling); - int oldval = THREAD_GETMEM (self, canceltype); - THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_ASYNCHRONOUS); + while (1) + { + int newval = oldval | CANCELTYPE_BITMASK; - int ch = THREAD_GETMEM (self, cancelhandling); + if (newval == oldval) + break; - if (self->cancelstate == PTHREAD_CANCEL_ENABLE - && (ch & CANCELED_BITMASK) - && !(ch & EXITING_BITMASK) - && !(ch & TERMINATED_BITMASK)) - { - THREAD_SETMEM (self, result, PTHREAD_CANCELED); - __do_cancel (); + if (atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &oldval, newval)) + { + if (cancel_enabled_and_canceled_and_async (newval)) + { + self->result = PTHREAD_CANCELED; + __do_cancel (); + } + + break; + } } return oldval; @@ -56,10 +63,29 @@ __pthread_disable_asynccancel (int oldtype) { /* If asynchronous cancellation was enabled before we do not have anything to do. */ - if (oldtype == PTHREAD_CANCEL_ASYNCHRONOUS) + if (oldtype & CANCELTYPE_BITMASK) return; struct pthread *self = THREAD_SELF; - self->canceltype = PTHREAD_CANCEL_DEFERRED; + int newval; + int oldval = atomic_load_relaxed (&self->cancelhandling); + do + { + newval = oldval & ~CANCELTYPE_BITMASK; + } + while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &oldval, newval)); + + /* We cannot return when we are being canceled. Upon return the + thread might be doing things which would have to be undone. The + following loop should loop until the cancellation signal is + delivered.
*/ + while (__glibc_unlikely ((newval & (CANCELING_BITMASK | CANCELED_BITMASK)) + == CANCELING_BITMASK)) + { + futex_wait_simple ((unsigned int *) &self->cancelhandling, newval, + FUTEX_PRIVATE); + newval = atomic_load_relaxed (&self->cancelhandling); + } } libc_hidden_def (__pthread_disable_asynccancel) diff --git a/nptl/cleanup_defer.c b/nptl/cleanup_defer.c index f8181a40e..4e864ead3 100644 --- a/nptl/cleanup_defer.c +++ b/nptl/cleanup_defer.c @@ -30,9 +30,22 @@ ___pthread_register_cancel_defer (__pthread_unwind_buf_t *buf) ibuf->priv.data.prev = THREAD_GETMEM (self, cleanup_jmp_buf); ibuf->priv.data.cleanup = THREAD_GETMEM (self, cleanup); - /* Disable asynchronous cancellation for now. */ - ibuf->priv.data.canceltype = THREAD_GETMEM (self, canceltype); - THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED); + int cancelhandling = atomic_load_relaxed (&self->cancelhandling); + if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK)) + { + int newval; + do + { + newval = cancelhandling & ~CANCELTYPE_BITMASK; + } + while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &cancelhandling, + newval)); + } + + ibuf->priv.data.canceltype = (cancelhandling & CANCELTYPE_BITMASK + ? PTHREAD_CANCEL_ASYNCHRONOUS + : PTHREAD_CANCEL_DEFERRED); /* Store the new cleanup handler info. 
*/ THREAD_SETMEM (self, cleanup_jmp_buf, (struct pthread_unwind_buf *) buf); @@ -54,9 +67,26 @@ ___pthread_unregister_cancel_restore (__pthread_unwind_buf_t *buf) THREAD_SETMEM (self, cleanup_jmp_buf, ibuf->priv.data.prev); - THREAD_SETMEM (self, canceltype, ibuf->priv.data.canceltype); - if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_ASYNCHRONOUS) - __pthread_testcancel (); + if (ibuf->priv.data.canceltype == PTHREAD_CANCEL_DEFERRED) + return; + + int cancelhandling = atomic_load_relaxed (&self->cancelhandling); + if ((cancelhandling & CANCELTYPE_BITMASK) == 0) + { + int newval; + do + { + newval = cancelhandling | CANCELTYPE_BITMASK; + } + while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &cancelhandling, newval)); + + if (cancel_enabled_and_canceled (cancelhandling)) + { + self->result = PTHREAD_CANCELED; + __do_cancel (); + } + } } versioned_symbol (libc, ___pthread_unregister_cancel_restore, __pthread_unregister_cancel_restore, GLIBC_2_34); diff --git a/nptl/descr.h b/nptl/descr.h index ea8aca08e..bb46b5958 100644 --- a/nptl/descr.h +++ b/nptl/descr.h @@ -279,18 +279,27 @@ struct pthread /* Flags determining processing of cancellation. */ int cancelhandling; + /* Bit set if cancellation is disabled. */ +#define CANCELSTATE_BIT 0 +#define CANCELSTATE_BITMASK (1 << CANCELSTATE_BIT) + /* Bit set if asynchronous cancellation mode is selected. */ +#define CANCELTYPE_BIT 1 +#define CANCELTYPE_BITMASK (1 << CANCELTYPE_BIT) + /* Bit set if canceling has been initiated. */ +#define CANCELING_BIT 2 +#define CANCELING_BITMASK (1 << CANCELING_BIT) /* Bit set if canceled. */ #define CANCELED_BIT 3 -#define CANCELED_BITMASK (0x01 << CANCELED_BIT) +#define CANCELED_BITMASK (1 << CANCELED_BIT) /* Bit set if thread is exiting. */ #define EXITING_BIT 4 -#define EXITING_BITMASK (0x01 << EXITING_BIT) +#define EXITING_BITMASK (1 << EXITING_BIT) /* Bit set if thread terminated and TCB is freed. 
*/ #define TERMINATED_BIT 5 -#define TERMINATED_BITMASK (0x01 << TERMINATED_BIT) +#define TERMINATED_BITMASK (1 << TERMINATED_BIT) /* Bit set if thread is supposed to change XID. */ #define SETXID_BIT 6 -#define SETXID_BITMASK (0x01 << SETXID_BIT) +#define SETXID_BITMASK (1 << SETXID_BIT) /* Flags. Including those copied from the thread attribute. */ int flags; @@ -390,14 +399,6 @@ struct pthread /* Indicates whether is a C11 thread created by thrd_creat. */ bool c11; - /* Thread cancel state (PTHREAD_CANCEL_ENABLE or - PTHREAD_CANCEL_DISABLE). */ - unsigned char cancelstate; - - /* Thread cancel type (PTHREAD_CANCEL_DEFERRED or - PTHREAD_CANCEL_ASYNCHRONOUS). */ - unsigned char canceltype; - /* Used in __pthread_kill_internal to detected a thread that has exited or is about to exit. exit_lock must only be acquired after blocking signals. */ @@ -417,6 +418,22 @@ struct pthread (sizeof (struct pthread) - offsetof (struct pthread, end_padding)) } __attribute ((aligned (TCB_ALIGNMENT))); +static inline bool +cancel_enabled_and_canceled (int value) +{ + return (value & (CANCELSTATE_BITMASK | CANCELED_BITMASK | EXITING_BITMASK + | TERMINATED_BITMASK)) + == CANCELED_BITMASK; +} + +static inline bool +cancel_enabled_and_canceled_and_async (int value) +{ + return ((value) & (CANCELSTATE_BITMASK | CANCELTYPE_BITMASK | CANCELED_BITMASK + | EXITING_BITMASK | TERMINATED_BITMASK)) + == (CANCELTYPE_BITMASK | CANCELED_BITMASK); +} + /* This yields the pointer that TLS support code calls the thread pointer. */ #if TLS_TCB_AT_TP # define TLS_TPADJ(pd) (pd) diff --git a/nptl/libc-cleanup.c b/nptl/libc-cleanup.c index cb4c22628..2ce59388d 100644 --- a/nptl/libc-cleanup.c +++ b/nptl/libc-cleanup.c @@ -26,9 +26,24 @@ __libc_cleanup_push_defer (struct _pthread_cleanup_buffer *buffer) buffer->__prev = THREAD_GETMEM (self, cleanup); + int cancelhandling = atomic_load_relaxed (&self->cancelhandling); + /* Disable asynchronous cancellation for now. 
*/ - buffer->__canceltype = THREAD_GETMEM (self, canceltype); - THREAD_SETMEM (self, canceltype, PTHREAD_CANCEL_DEFERRED); + if (__glibc_unlikely (cancelhandling & CANCELTYPE_BITMASK)) + { + int newval; + do + { + newval = cancelhandling & ~CANCELTYPE_BITMASK; + } + while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &cancelhandling, + newval)); + } + + buffer->__canceltype = (cancelhandling & CANCELTYPE_BITMASK + ? PTHREAD_CANCEL_ASYNCHRONOUS + : PTHREAD_CANCEL_DEFERRED); THREAD_SETMEM (self, cleanup, buffer); } @@ -41,8 +56,23 @@ __libc_cleanup_pop_restore (struct _pthread_cleanup_buffer *buffer) THREAD_SETMEM (self, cleanup, buffer->__prev); - THREAD_SETMEM (self, canceltype, buffer->__canceltype); - if (buffer->__canceltype == PTHREAD_CANCEL_ASYNCHRONOUS) - __pthread_testcancel (); + int cancelhandling = atomic_load_relaxed (&self->cancelhandling); + if (buffer->__canceltype != PTHREAD_CANCEL_DEFERRED + && (cancelhandling & CANCELTYPE_BITMASK) == 0) + { + int newval; + do + { + newval = cancelhandling | CANCELTYPE_BITMASK; + } + while (!atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &cancelhandling, newval)); + + if (cancel_enabled_and_canceled (cancelhandling)) + { + self->result = PTHREAD_CANCELED; + __do_cancel (); + } + } } libc_hidden_def (__libc_cleanup_pop_restore) diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c index 7524c7ce4..e67b2df5c 100644 --- a/nptl/pthread_cancel.c +++ b/nptl/pthread_cancel.c @@ -42,18 +42,29 @@ sigcancel_handler (int sig, siginfo_t *si, void *ctx) struct pthread *self = THREAD_SELF; - int ch = atomic_load_relaxed (&self->cancelhandling); - /* Cancelation not enabled, not cancelled, or already exitting. */ - if (self->cancelstate == PTHREAD_CANCEL_DISABLE - || (ch & CANCELED_BITMASK) == 0 - || (ch & EXITING_BITMASK) != 0) - return; - - /* Set the return value. */ - THREAD_SETMEM (self, result, PTHREAD_CANCELED); - /* Make sure asynchronous cancellation is still enabled. 
*/ - if (self->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS) - __do_cancel (); + int oldval = atomic_load_relaxed (&self->cancelhandling); + while (1) + { + /* We are canceled now. When canceled by another thread this flag + is already set but if the signal is directly sent (internally or + from another process) it has to be done here. */ + int newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK; + + if (oldval == newval || (oldval & EXITING_BITMASK) != 0) + /* Already canceled or exiting. */ + break; + + if (atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &oldval, newval)) + { + self->result = PTHREAD_CANCELED; + + /* Make sure asynchronous cancellation is still enabled. */ + if ((oldval & CANCELTYPE_BITMASK) != 0) + /* Run the registered destructors and terminate the thread. */ + __do_cancel (); + } + } } int @@ -92,29 +103,71 @@ __pthread_cancel (pthread_t th) } #endif - int oldch = atomic_fetch_or_acquire (&pd->cancelhandling, CANCELED_BITMASK); - if ((oldch & CANCELED_BITMASK) != 0) - return 0; - - if (pd == THREAD_SELF) + /* Some syscalls are never restarted after being interrupted by a signal + handler, regardless of the use of SA_RESTART (they always fail with + EINTR). So pthread_cancel cannot send SIGCANCEL unless the cancellation + is enabled and set as asynchronous (in this case the cancellation will + be acted upon in the cancellation handler instead of by the syscall wrapper). + Otherwise the target thread is set as 'cancelling' (CANCELING_BITMASK) + by atomically setting 'cancelhandling' and the cancellation will be acted + upon at the next cancellation entrypoint in the target thread. + + It also requires to atomically check if cancellation is enabled and + asynchronous, so both cancellation state and type are tracked on + 'cancelhandling'.
*/ + + int result = 0; + int oldval = atomic_load_relaxed (&pd->cancelhandling); + int newval; + do { - /* A single-threaded process should be able to kill itself, since there - is nothing in the POSIX specification that says that it cannot. So - we set multiple_threads to true so that cancellation points get - executed. */ - THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1); + again: + newval = oldval | CANCELING_BITMASK | CANCELED_BITMASK; + if (oldval == newval) + break; + + /* If the cancellation is handled asynchronously just send a + signal. We avoid this if possible since it's more + expensive. */ + if (cancel_enabled_and_canceled_and_async (newval)) + { + /* Mark the cancellation as "in progress". */ + int newval2 = oldval | CANCELING_BITMASK; + if (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, + &oldval, newval2)) + goto again; + + if (pd == THREAD_SELF) + /* This is not merely an optimization: An application may + call pthread_cancel (pthread_self ()) without calling + pthread_create, so the signal handler may not have been + set up for a self-cancel. */ + { + pd->result = PTHREAD_CANCELED; + if ((newval & CANCELTYPE_BITMASK) != 0) + __do_cancel (); + } + else + /* The cancellation handler will take care of marking the + thread as canceled. */ + result = __pthread_kill_internal (th, SIGCANCEL); + + break; + } + + /* A single-threaded process should be able to kill itself, since + there is nothing in the POSIX specification that says that it + cannot. So we set multiple_threads to true so that cancellation + points get executed. 
*/ + THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1); #ifndef TLS_MULTIPLE_THREADS_IN_TCB __libc_multiple_threads = 1; #endif - - THREAD_SETMEM (pd, result, PTHREAD_CANCELED); - if (pd->cancelstate == PTHREAD_CANCEL_ENABLE - && pd->canceltype == PTHREAD_CANCEL_ASYNCHRONOUS) - __do_cancel (); - return 0; } + while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval, + newval)); - return __pthread_kill_internal (th, SIGCANCEL); + return result; } versioned_symbol (libc, __pthread_cancel, pthread_cancel, GLIBC_2_34); diff --git a/nptl/pthread_join_common.c b/nptl/pthread_join_common.c index a8e884f34..ca3245b0a 100644 --- a/nptl/pthread_join_common.c +++ b/nptl/pthread_join_common.c @@ -57,12 +57,9 @@ __pthread_clockjoin_ex (pthread_t threadid, void **thread_return, if ((pd == self || (self->joinid == pd && (pd->cancelhandling - & (CANCELED_BITMASK | EXITING_BITMASK + & (CANCELING_BITMASK | CANCELED_BITMASK | EXITING_BITMASK | TERMINATED_BITMASK)) == 0)) - && !(self->cancelstate == PTHREAD_CANCEL_ENABLE - && (pd->cancelhandling & (CANCELED_BITMASK | EXITING_BITMASK - | TERMINATED_BITMASK)) - == CANCELED_BITMASK)) + && !cancel_enabled_and_canceled (self->cancelhandling)) /* This is a deadlock situation. The threads are waiting for each other to finish. Note that this is a "may" error. To be 100% sure we catch this error we would have to lock the data diff --git a/nptl/pthread_setcancelstate.c b/nptl/pthread_setcancelstate.c index 9905b12e4..f8edf18fb 100644 --- a/nptl/pthread_setcancelstate.c +++ b/nptl/pthread_setcancelstate.c @@ -30,9 +30,29 @@ __pthread_setcancelstate (int state, int *oldstate) self = THREAD_SELF; - if (oldstate != NULL) - *oldstate = self->cancelstate; - self->cancelstate = state; + int oldval = atomic_load_relaxed (&self->cancelhandling); + while (1) + { + int newval = (state == PTHREAD_CANCEL_DISABLE + ? 
oldval | CANCELSTATE_BITMASK + : oldval & ~CANCELSTATE_BITMASK); + + if (oldstate != NULL) + *oldstate = ((oldval & CANCELSTATE_BITMASK) + ? PTHREAD_CANCEL_DISABLE : PTHREAD_CANCEL_ENABLE); + + if (oldval == newval) + break; + + if (atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &oldval, newval)) + { + if (cancel_enabled_and_canceled_and_async (newval)) + __do_cancel (); + + break; + } + } return 0; } diff --git a/nptl/pthread_setcanceltype.c b/nptl/pthread_setcanceltype.c index 94e56466d..1307d355c 100644 --- a/nptl/pthread_setcanceltype.c +++ b/nptl/pthread_setcanceltype.c @@ -28,11 +28,32 @@ __pthread_setcanceltype (int type, int *oldtype) volatile struct pthread *self = THREAD_SELF; - if (oldtype != NULL) - *oldtype = self->canceltype; - self->canceltype = type; - if (type == PTHREAD_CANCEL_ASYNCHRONOUS) - __pthread_testcancel (); + int oldval = atomic_load_relaxed (&self->cancelhandling); + while (1) + { + int newval = (type == PTHREAD_CANCEL_ASYNCHRONOUS + ? oldval | CANCELTYPE_BITMASK + : oldval & ~CANCELTYPE_BITMASK); + + if (oldtype != NULL) + *oldtype = ((oldval & CANCELTYPE_BITMASK) + ? 
PTHREAD_CANCEL_ASYNCHRONOUS : PTHREAD_CANCEL_DEFERRED); + + if (oldval == newval) + break; + + if (atomic_compare_exchange_weak_acquire (&self->cancelhandling, + &oldval, newval)) + { + if (cancel_enabled_and_canceled_and_async (newval)) + { + THREAD_SETMEM (self, result, PTHREAD_CANCELED); + __do_cancel (); + } + + break; + } + } return 0; } diff --git a/nptl/pthread_testcancel.c b/nptl/pthread_testcancel.c index 13123608e..b81928c00 100644 --- a/nptl/pthread_testcancel.c +++ b/nptl/pthread_testcancel.c @@ -23,13 +23,10 @@ void ___pthread_testcancel (void) { struct pthread *self = THREAD_SELF; - int cancelhandling = THREAD_GETMEM (self, cancelhandling); - if (self->cancelstate == PTHREAD_CANCEL_ENABLE - && (cancelhandling & CANCELED_BITMASK) - && !(cancelhandling & EXITING_BITMASK) - && !(cancelhandling & TERMINATED_BITMASK)) + int cancelhandling = atomic_load_relaxed (&self->cancelhandling); + if (cancel_enabled_and_canceled (cancelhandling)) { - THREAD_SETMEM (self, result, PTHREAD_CANCELED); + self->result = PTHREAD_CANCELED; __do_cancel (); } } diff --git a/nptl/unwind.c b/nptl/unwind.c index c3563e346..33b0d8757 100644 --- a/nptl/unwind.c +++ b/nptl/unwind.c @@ -25,7 +25,7 @@ #include #include -#ifdef _STACK_GROWS_DOWN +#if _STACK_GROWS_DOWN # define FRAME_LEFT(frame, other, adj) \ ((uintptr_t) frame - adj >= (uintptr_t) other - adj) #elif _STACK_GROWS_UP diff --git a/nscd/connections.c b/nscd/connections.c index 61d1674eb..531d2e83d 100644 --- a/nscd/connections.c +++ b/nscd/connections.c @@ -2284,7 +2284,8 @@ main_loop_epoll (int efd) sizeof (buf))) != -1) ; - __bump_nl_timestamp (); + dbs[hstdb].head->extra_data[NSCD_HST_IDX_CONF_TIMESTAMP] + = __bump_nl_timestamp (); } # endif else diff --git a/nss/Makefile b/nss/Makefile index 552e5d03e..de439d491 100644 --- a/nss/Makefile +++ b/nss/Makefile @@ -60,7 +60,8 @@ tests = test-netdb test-digits-dots tst-nss-getpwent bug17079 \ tst-nss-test1 \ tst-nss-test2 \ tst-nss-test4 \ - tst-nss-test5 + tst-nss-test5 \ + 
tst-nss-test_errno xtests = bug-erange tests-container = \ @@ -132,7 +133,7 @@ libnss_compat-inhibit-o = $(filter-out .os,$(object-suffixes)) ifeq ($(build-static-nss),yes) tests-static += tst-nss-static endif -extra-test-objs += nss_test1.os nss_test2.os +extra-test-objs += nss_test1.os nss_test2.os nss_test_errno.os include ../Rules @@ -166,22 +167,26 @@ rtld-tests-LDFLAGS += -Wl,--dynamic-list=nss_test.ver libof-nss_test1 = extramodules libof-nss_test2 = extramodules +libof-nss_test_errno = extramodules $(objpfx)/libnss_test1.so: $(objpfx)nss_test1.os $(link-libc-deps) $(build-module) $(objpfx)/libnss_test2.so: $(objpfx)nss_test2.os $(link-libc-deps) $(build-module) +$(objpfx)/libnss_test_errno.so: $(objpfx)nss_test_errno.os $(link-libc-deps) + $(build-module) $(objpfx)nss_test2.os : nss_test1.c -ifdef libnss_test1.so-version -$(objpfx)/libnss_test1.so$(libnss_test1.so-version): $(objpfx)/libnss_test1.so +# Use the nss_files suffix for these objects as well. +$(objpfx)/libnss_test1.so$(libnss_files.so-version): $(objpfx)/libnss_test1.so $(make-link) -endif -ifdef libnss_test2.so-version -$(objpfx)/libnss_test2.so$(libnss_test2.so-version): $(objpfx)/libnss_test2.so +$(objpfx)/libnss_test2.so$(libnss_files.so-version): $(objpfx)/libnss_test2.so + $(make-link) +$(objpfx)/libnss_test_errno.so$(libnss_files.so-version): \ + $(objpfx)/libnss_test_errno.so $(make-link) -endif $(patsubst %,$(objpfx)%.out,$(tests) $(tests-container)) : \ - $(objpfx)/libnss_test1.so$(libnss_test1.so-version) \ - $(objpfx)/libnss_test2.so$(libnss_test2.so-version) + $(objpfx)/libnss_test1.so$(libnss_files.so-version) \ + $(objpfx)/libnss_test2.so$(libnss_files.so-version) \ + $(objpfx)/libnss_test_errno.so$(libnss_files.so-version) ifeq (yes,$(have-thread-library)) $(objpfx)tst-cancel-getpwuid_r: $(shared-thread-library) @@ -197,3 +202,4 @@ LDFLAGS-tst-nss-test2 = -Wl,--disable-new-dtags LDFLAGS-tst-nss-test3 = -Wl,--disable-new-dtags LDFLAGS-tst-nss-test4 = -Wl,--disable-new-dtags 
LDFLAGS-tst-nss-test5 = -Wl,--disable-new-dtags +LDFLAGS-tst-nss-test_errno = -Wl,--disable-new-dtags diff --git a/nss/XXX-lookup.c b/nss/XXX-lookup.c index db9593767..bfc57b8e6 100644 --- a/nss/XXX-lookup.c +++ b/nss/XXX-lookup.c @@ -15,6 +15,7 @@ License along with the GNU C Library; if not, see . */ +#include #include "nsswitch.h" /*******************************************************************\ @@ -54,6 +55,10 @@ DB_LOOKUP_FCT (nss_action_list *ni, const char *fct_name, const char *fct2_name, *ni = DATABASE_NAME_SYMBOL; + /* We want to know about it if we've somehow got a NULL action list; + in the past, we had bad state if seccomp interfered with setup. */ + assert(*ni != NULL); + return __nss_lookup (ni, fct_name, fct2_name, fctp); } libc_hidden_def (DB_LOOKUP_FCT) diff --git a/nss/nss_database.c b/nss/nss_database.c index d56c5b798..f2ed2f2c2 100644 --- a/nss/nss_database.c +++ b/nss/nss_database.c @@ -420,23 +420,32 @@ nss_database_check_reload_and_get (struct nss_database_state *local, return true; } - /* Before we reload, verify that "/" hasn't changed. We assume that - errors here are very unlikely, but the chance that we're entering - a container is also very unlikely, so we err on the side of both - very unlikely things not happening at the same time. */ - if (__stat64_time64 ("/", &str) != 0 - || (local->root_ino != 0 - && (str.st_ino != local->root_ino - || str.st_dev != local->root_dev))) + int stat_rv = __stat64_time64 ("/", &str); + + if (local->data.services[database_index] != NULL) { - /* Change detected; disable reloading and return current state. */ - atomic_store_release (&local->data.reload_disabled, 1); - *result = local->data.services[database_index]; - __libc_lock_unlock (local->lock); - return true; + /* Before we reload, verify that "/" hasn't changed. 
We assume that + errors here are very unlikely, but the chance that we're entering + a container is also very unlikely, so we err on the side of both + very unlikely things not happening at the same time. */ + if (stat_rv != 0 + || (local->root_ino != 0 + && (str.st_ino != local->root_ino + || str.st_dev != local->root_dev))) + { + /* Change detected; disable reloading and return current state. */ + atomic_store_release (&local->data.reload_disabled, 1); + *result = local->data.services[database_index]; + __libc_lock_unlock (local->lock); + return true; + } + } + if (stat_rv == 0) + { + local->root_ino = str.st_ino; + local->root_dev = str.st_dev; } - local->root_ino = str.st_ino; - local->root_dev = str.st_dev; + __libc_lock_unlock (local->lock); /* Avoid overwriting the global configuration until we have loaded diff --git a/nss/nss_module.c b/nss/nss_module.c index f9a1263e5..f00bbd9e1 100644 --- a/nss/nss_module.c +++ b/nss/nss_module.c @@ -330,8 +330,18 @@ name_search (const void *left, const void *right) void * __nss_module_get_function (struct nss_module *module, const char *name) { + /* A successful dlopen might clobber errno. */ + int saved_errno = errno; + if (!__nss_module_load (module)) - return NULL; + { + /* Reporting module load failure is currently inaccurate. See + bug 22041. Not changing errno is the conservative choice. */ + __set_errno (saved_errno); + return NULL; + } + + __set_errno (saved_errno); function_name *name_entry = bsearch (name, nss_function_name_array, array_length (nss_function_name_array), diff --git a/nss/nss_test_errno.c b/nss/nss_test_errno.c new file mode 100644 index 000000000..59a5c717b --- /dev/null +++ b/nss/nss_test_errno.c @@ -0,0 +1,58 @@ +/* NSS service provider with errno clobber. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +/* Catch misnamed and functions. */ +#pragma GCC diagnostic error "-Wmissing-prototypes" +NSS_DECLARE_MODULE_FUNCTIONS (test_errno) + +static void __attribute__ ((constructor)) +init (void) +{ + /* An arbitrary error code which is otherwise not used. */ + errno = -1009; +} + +/* Lookup functions for pwd follow that do not return any data. */ + +/* Catch misnamed function definitions. */ + +enum nss_status +_nss_test_errno_setpwent (int stayopen) +{ + setenv ("_nss_test_errno_setpwent", "yes", 1); + return NSS_STATUS_SUCCESS; +} + +enum nss_status +_nss_test_errno_getpwent_r (struct passwd *result, + char *buffer, size_t size, int *errnop) +{ + setenv ("_nss_test_errno_getpwent_r", "yes", 1); + return NSS_STATUS_NOTFOUND; +} + +enum nss_status +_nss_test_errno_endpwent (void) +{ + setenv ("_nss_test_errno_endpwent", "yes", 1); + return NSS_STATUS_SUCCESS; +} diff --git a/nss/tst-nss-test_errno.c b/nss/tst-nss-test_errno.c new file mode 100644 index 000000000..d2c42dd36 --- /dev/null +++ b/nss/tst-nss-test_errno.c @@ -0,0 +1,61 @@ +/* getpwent failure when dlopen clobbers errno (bug 28953). + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include + +static int +do_test (void) +{ + __nss_configure_lookup ("passwd", "files test_errno"); + + errno = 0; + setpwent (); + TEST_COMPARE (errno, 0); + + bool root_seen = false; + while (true) + { + errno = 0; + struct passwd *e = getpwent (); + if (e == NULL) + break; + if (strcmp (e->pw_name, "root")) + root_seen = true; + } + + TEST_COMPARE (errno, 0); + TEST_VERIFY (root_seen); + + errno = 0; + endpwent (); + TEST_COMPARE (errno, 0); + + TEST_COMPARE_STRING (getenv ("_nss_test_errno_setpwent"), "yes"); + TEST_COMPARE_STRING (getenv ("_nss_test_errno_getpwent_r"), "yes"); + TEST_COMPARE_STRING (getenv ("_nss_test_errno_endpwent"), "yes"); + + return 0; +} + +#include diff --git a/posix/fork.c b/posix/fork.c index 6b50c091f..e1be3422e 100644 --- a/posix/fork.c +++ b/posix/fork.c @@ -46,8 +46,9 @@ __libc_fork (void) best effort to make is async-signal-safe at least for single-thread case. */ bool multiple_threads = __libc_single_threaded == 0; + uint64_t lastrun; - __run_fork_handlers (atfork_run_prepare, multiple_threads); + lastrun = __run_prefork_handlers (multiple_threads); struct nss_database_data nss_database_data; @@ -105,7 +106,7 @@ __libc_fork (void) reclaim_stacks (); /* Run the handlers registered for the child. 
*/ - __run_fork_handlers (atfork_run_child, multiple_threads); + __run_postfork_handlers (atfork_run_child, multiple_threads, lastrun); } else { @@ -123,7 +124,7 @@ __libc_fork (void) } /* Run the handlers registered for the parent. */ - __run_fork_handlers (atfork_run_parent, multiple_threads); + __run_postfork_handlers (atfork_run_parent, multiple_threads, lastrun); if (pid < 0) __set_errno (save_errno); diff --git a/posix/glob.c b/posix/glob.c index a2b5aabad..f6993a370 100644 --- a/posix/glob.c +++ b/posix/glob.c @@ -21,13 +21,14 @@ optimizes away the pattern == NULL test below. */ # define _GL_ARG_NONNULL(params) -# include +# include #endif #include #include +#include #include #include #include @@ -56,6 +57,8 @@ # define sysconf(id) __sysconf (id) # define closedir(dir) __closedir (dir) # define opendir(name) __opendir (name) +# undef dirfd +# define dirfd(str) __dirfd (str) # define readdir(str) __readdir64 (str) # define getpwnam_r(name, bufp, buf, len, res) \ __getpwnam_r (name, bufp, buf, len, res) @@ -69,11 +72,8 @@ # ifndef GLOB_LSTAT # define GLOB_LSTAT gl_lstat # endif -# ifndef GLOB_STAT64 -# define GLOB_STAT64 __stat64 -# endif -# ifndef GLOB_LSTAT64 -# define GLOB_LSTAT64 __lstat64 +# ifndef GLOB_FSTATAT64 +# define GLOB_FSTATAT64 __fstatat64 # endif # include #else /* !_LIBC */ @@ -88,8 +88,7 @@ # define struct_stat struct stat # define struct_stat64 struct stat # define GLOB_LSTAT gl_lstat -# define GLOB_STAT64 stat -# define GLOB_LSTAT64 lstat +# define GLOB_FSTATAT64 fstatat #endif /* _LIBC */ #include @@ -215,7 +214,8 @@ glob_lstat (glob_t *pglob, int flags, const char *fullname) } ust; return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC) ? pglob->GLOB_LSTAT (fullname, &ust.st) - : GLOB_LSTAT64 (fullname, &ust.st64)); + : GLOB_FSTATAT64 (AT_FDCWD, fullname, &ust.st64, + AT_SYMLINK_NOFOLLOW)); } /* Set *R = A + B. 
Return true if the answer is mathematically @@ -257,7 +257,8 @@ is_dir (char const *filename, int flags, glob_t const *pglob) struct_stat64 st64; return (__glibc_unlikely (flags & GLOB_ALTDIRFUNC) ? pglob->gl_stat (filename, &st) == 0 && S_ISDIR (st.st_mode) - : GLOB_STAT64 (filename, &st64) == 0 && S_ISDIR (st64.st_mode)); + : (GLOB_FSTATAT64 (AT_FDCWD, filename, &st64, 0) == 0 + && S_ISDIR (st64.st_mode))); } /* Find the end of the sub-pattern in a brace expression. */ @@ -747,6 +748,8 @@ __glob (const char *pattern, int flags, int (*errfunc) (const char *, int), else { #ifndef WINDOWS32 + /* Recognize ~user as a shorthand for the specified user's home + directory. */ char *end_name = strchr (dirname, '/'); char *user_name; int malloc_user_name = 0; @@ -885,7 +888,22 @@ __glob (const char *pattern, int flags, int (*errfunc) (const char *, int), } scratch_buffer_free (&pwtmpbuf); } -#endif /* !WINDOWS32 */ +#else /* WINDOWS32 */ + /* On native Windows, access to a user's home directory + (via GetUserProfileDirectory) or to a user's environment + variables (via ExpandEnvironmentStringsForUser) requires + the credentials of the user. Therefore we cannot support + the ~user syntax on this platform. + Handling ~user specially (and treat it like plain ~) if + user is getenv ("USERNAME") would not be a good idea, + since it would make people think that ~user is supported + in general. 
*/ + if (flags & GLOB_TILDE_CHECK) + { + retval = GLOB_NOMATCH; + goto out; + } +#endif /* WINDOWS32 */ } } @@ -1266,6 +1284,8 @@ glob_in_dir (const char *pattern, const char *directory, int flags, { size_t dirlen = strlen (directory); void *stream = NULL; + struct scratch_buffer s; + scratch_buffer_init (&s); # define GLOBNAMES_MEMBERS(nnames) \ struct globnames *next; size_t count; char *name[nnames]; struct globnames { GLOBNAMES_MEMBERS (FLEXIBLE_ARRAY_MEMBER) }; @@ -1337,6 +1357,7 @@ glob_in_dir (const char *pattern, const char *directory, int flags, } else { + int dfd = dirfd (stream); int fnm_flags = ((!(flags & GLOB_PERIOD) ? FNM_PERIOD : 0) | ((flags & GLOB_NOESCAPE) ? FNM_NOESCAPE : 0)); flags |= GLOB_MAGCHAR; @@ -1364,8 +1385,32 @@ glob_in_dir (const char *pattern, const char *directory, int flags, if (flags & GLOB_ONLYDIR) switch (readdir_result_type (d)) { - case DT_DIR: case DT_LNK: case DT_UNKNOWN: break; default: continue; + case DT_DIR: break; + case DT_LNK: case DT_UNKNOWN: + /* The filesystem was too lazy to give us a hint, + so we have to do it the hard way. */ + if (__glibc_unlikely (dfd < 0 || flags & GLOB_ALTDIRFUNC)) + { + size_t namelen = strlen (d.name); + size_t need = dirlen + 1 + namelen + 1; + if (s.length < need + && !scratch_buffer_set_array_size (&s, need, 1)) + goto memory_error; + char *p = mempcpy (s.data, directory, dirlen); + *p = '/'; + p += p[-1] != '/'; + memcpy (p, d.name, namelen + 1); + if (! is_dir (s.data, flags, pglob)) + continue; + } + else + { + struct_stat64 st64; + if (! 
(GLOB_FSTATAT64 (dfd, d.name, &st64, 0) == 0 + && S_ISDIR (st64.st_mode))) + continue; + } } if (fnmatch (pattern, d.name, fnm_flags) == 0) @@ -1497,5 +1542,6 @@ glob_in_dir (const char *pattern, const char *directory, int flags, __set_errno (save); } + scratch_buffer_free (&s); return result; } diff --git a/posix/register-atfork.c b/posix/register-atfork.c index 74b1b5840..c039fb454 100644 --- a/posix/register-atfork.c +++ b/posix/register-atfork.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #define DYNARRAY_ELEMENT struct fork_handler #define DYNARRAY_STRUCT fork_handler_list @@ -26,7 +28,7 @@ #include static struct fork_handler_list fork_handlers; -static bool fork_handler_init = false; +static uint64_t fork_handler_counter; static int atfork_lock = LLL_LOCK_INITIALIZER; @@ -36,11 +38,8 @@ __register_atfork (void (*prepare) (void), void (*parent) (void), { lll_lock (atfork_lock, LLL_PRIVATE); - if (!fork_handler_init) - { - fork_handler_list_init (&fork_handlers); - fork_handler_init = true; - } + if (fork_handler_counter == 0) + fork_handler_list_init (&fork_handlers); struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers); if (newp != NULL) @@ -49,6 +48,13 @@ __register_atfork (void (*prepare) (void), void (*parent) (void), newp->parent_handler = parent; newp->child_handler = child; newp->dso_handle = dso_handle; + + /* IDs assigned to handlers start at 1 and increment with handler + registration. Un-registering a handlers discards the corresponding + ID. It is not reused in future registrations. */ + if (INT_ADD_OVERFLOW (fork_handler_counter, 1)) + __libc_fatal ("fork handler counter overflow"); + newp->id = ++fork_handler_counter; } /* Release the lock. 
*/ @@ -103,37 +109,111 @@ __unregister_atfork (void *dso_handle) lll_unlock (atfork_lock, LLL_PRIVATE); } -void -__run_fork_handlers (enum __run_fork_handler_type who, _Bool do_locking) +uint64_t +__run_prefork_handlers (_Bool do_locking) { - struct fork_handler *runp; + uint64_t lastrun; - if (who == atfork_run_prepare) + if (do_locking) + lll_lock (atfork_lock, LLL_PRIVATE); + + /* We run prepare handlers from last to first. After fork, only + handlers up to the last handler found here (pre-fork) will be run. + Handlers registered during __run_prefork_handlers or + __run_postfork_handlers will be positioned after this last handler, and + since their prepare handlers won't be run now, their parent/child + handlers should also be ignored. */ + lastrun = fork_handler_counter; + + size_t sl = fork_handler_list_size (&fork_handlers); + for (size_t i = sl; i > 0;) { - if (do_locking) - lll_lock (atfork_lock, LLL_PRIVATE); - size_t sl = fork_handler_list_size (&fork_handlers); - for (size_t i = sl; i > 0; i--) - { - runp = fork_handler_list_at (&fork_handlers, i - 1); - if (runp->prepare_handler != NULL) - runp->prepare_handler (); - } + struct fork_handler *runp + = fork_handler_list_at (&fork_handlers, i - 1); + + uint64_t id = runp->id; + + if (runp->prepare_handler != NULL) + { + if (do_locking) + lll_unlock (atfork_lock, LLL_PRIVATE); + + runp->prepare_handler (); + + if (do_locking) + lll_lock (atfork_lock, LLL_PRIVATE); + } + + /* We unlocked, ran the handler, and locked again. In the + meanwhile, one or more deregistrations could have occurred leading + to the current (just run) handler being moved up the list or even + removed from the list itself. Since handler IDs are guaranteed to + to be in increasing order, the next handler has to have: */ + + /* A. An earlier position than the current one has. */ + i--; + + /* B. A lower ID than the current one does. The code below skips + any newly added handlers with higher IDs. 
*/ + while (i > 0 + && fork_handler_list_at (&fork_handlers, i - 1)->id >= id) + i--; } - else + + return lastrun; +} + +void +__run_postfork_handlers (enum __run_fork_handler_type who, _Bool do_locking, + uint64_t lastrun) +{ + size_t sl = fork_handler_list_size (&fork_handlers); + for (size_t i = 0; i < sl;) { - size_t sl = fork_handler_list_size (&fork_handlers); - for (size_t i = 0; i < sl; i++) - { - runp = fork_handler_list_at (&fork_handlers, i); - if (who == atfork_run_child && runp->child_handler) - runp->child_handler (); - else if (who == atfork_run_parent && runp->parent_handler) - runp->parent_handler (); - } + struct fork_handler *runp = fork_handler_list_at (&fork_handlers, i); + uint64_t id = runp->id; + + /* prepare handlers were not run for handlers with ID > LASTRUN. + Thus, parent/child handlers will also not be run. */ + if (id > lastrun) + break; + if (do_locking) - lll_unlock (atfork_lock, LLL_PRIVATE); + lll_unlock (atfork_lock, LLL_PRIVATE); + + if (who == atfork_run_child && runp->child_handler) + runp->child_handler (); + else if (who == atfork_run_parent && runp->parent_handler) + runp->parent_handler (); + + if (do_locking) + lll_lock (atfork_lock, LLL_PRIVATE); + + /* We unlocked, ran the handler, and locked again. In the meanwhile, + one or more [de]registrations could have occurred. Due to this, + the list size must be updated. */ + sl = fork_handler_list_size (&fork_handlers); + + /* The just-run handler could also have moved up the list. */ + + if (sl > i && fork_handler_list_at (&fork_handlers, i)->id == id) + /* The position of the recently run handler hasn't changed. The + next handler to be run is an easy increment away. */ + i++; + else + { + /* The next handler to be run is the first handler in the list + to have an ID higher than the current one. 
*/ + for (i = 0; i < sl; i++) + { + if (fork_handler_list_at (&fork_handlers, i)->id > id) + break; + } + } } + + if (do_locking) + lll_unlock (atfork_lock, LLL_PRIVATE); } diff --git a/posix/tst-spawn6.c b/posix/tst-spawn6.c index 911e90a46..044abd853 100644 --- a/posix/tst-spawn6.c +++ b/posix/tst-spawn6.c @@ -29,7 +29,14 @@ #include #include #include +#include #include +#include + +#ifndef PATH_MAX +# define PATH_MAX 1024 +#endif +static char ptmxpath[PATH_MAX]; static int handle_restart (const char *argv1, const char *argv2) @@ -115,7 +122,7 @@ run_subprogram (int argc, char *argv[], const posix_spawnattr_t *attr, } static int -do_test (int argc, char *argv[]) +run_test (int argc, char *argv[]) { /* We must have either: - four parameters left if called initially: @@ -127,16 +134,7 @@ do_test (int argc, char *argv[]) + --setgrpr optional */ - if (restart) - return handle_restart (argv[1], argv[2]); - - int tcfd = open64 (_PATH_TTY, O_RDONLY, 0600); - if (tcfd == -1) - { - if (errno == ENXIO) - FAIL_UNSUPPORTED ("terminal not available, skipping test"); - FAIL_EXIT1 ("open64 (\"%s\", 0x%x, 0600): %m", _PATH_TTY, O_RDONLY); - } + int tcfd = xopen (ptmxpath, O_RDONLY, 0600); /* Check setting the controlling terminal without changing the group. */ { @@ -198,5 +196,47 @@ do_test (int argc, char *argv[]) return 0; } +static int +do_test (int argc, char *argv[]) +{ + if (restart) + return handle_restart (argv[1], argv[2]); + + pid_t pid = xfork (); + if (pid == 0) + { + /* Create a pseudo-terminal to avoid interfering with the one using by + test itself, creates a new session (so there is no controlling + terminal), and set the pseudo-terminal as the controlling one. 
*/ + int ptmx = posix_openpt (0); + if (ptmx == -1) + { + if (errno == ENXIO) + FAIL_UNSUPPORTED ("terminal not available, skipping test"); + FAIL_EXIT1 ("posix_openpt (0): %m"); + } + TEST_VERIFY_EXIT (grantpt (ptmx) == 0); + TEST_VERIFY_EXIT (unlockpt (ptmx) == 0); + + TEST_VERIFY_EXIT (setsid () != -1); + TEST_VERIFY_EXIT (ioctl (ptmx, TIOCSCTTY, NULL) == 0); + while (dup2 (ptmx, STDIN_FILENO) == -1 && errno == EBUSY) + ; + while (dup2 (ptmx, STDOUT_FILENO) == -1 && errno == EBUSY) + ; + while (dup2 (ptmx, STDERR_FILENO) == -1 && errno == EBUSY) + ; + TEST_VERIFY_EXIT (ptsname_r (ptmx, ptmxpath, sizeof ptmxpath) == 0); + xclose (ptmx); + + run_test (argc, argv); + _exit (0); + } + int status; + xwaitpid (pid, &status, 0); + TEST_VERIFY (WIFEXITED (status)); + exit (0); +} + #define TEST_FUNCTION_ARGV do_test #include diff --git a/resolv/Makefile b/resolv/Makefile index c465479e8..16943e7d9 100644 --- a/resolv/Makefile +++ b/resolv/Makefile @@ -40,12 +40,16 @@ routines := \ inet_pton \ ns_makecanon \ ns_name_compress \ + ns_name_length_uncompressed \ ns_name_ntop \ ns_name_pack \ ns_name_pton \ ns_name_skip \ ns_name_uncompress \ ns_name_unpack \ + ns_rr_cursor_init \ + ns_rr_cursor_next \ + ns_samebinaryname \ ns_samename \ nsap_addr \ nss_dns_functions \ @@ -79,11 +83,6 @@ generate := mtrace-tst-leaks.out tst-leaks.mtrace tst-leaks2.mtrace extra-libs := libresolv libnss_dns ifeq ($(have-thread-library),yes) routines += gai_sigqueue -endif - -ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes) -# Empty compatibility library for old binaries. 
-extra-libs += libanl tests += \ tst-bug18665 \ @@ -93,9 +92,12 @@ tests += \ tst-ns_name_pton \ tst-res_hconf_reorder \ tst-res_hnok \ + tst-resolv-aliases \ tst-resolv-basic \ tst-resolv-binary \ + tst-resolv-byaddr \ tst-resolv-edns \ + tst-resolv-invalid-cname \ tst-resolv-network \ tst-resolv-nondecimal \ tst-resolv-res_init-multi \ @@ -107,6 +109,18 @@ tests += \ tests-internal += tst-resolv-txnid-collision tests-static += tst-resolv-txnid-collision +# Likewise for __ns_samebinaryname. +tests-internal += tst-ns_samebinaryname +tests-static += tst-ns_samebinaryname + +# Likewise for __ns_name_length_uncompressed. +tests-internal += tst-ns_name_length_uncompressed +tests-static += tst-ns_name_length_uncompressed + +# Likewise for struct ns_rr_cursor and its functions. +tests-internal += tst-ns_rr_cursor +tests-static += tst-ns_rr_cursor + # These tests need libdl. ifeq (yes,$(build-shared)) tests += \ @@ -144,7 +158,8 @@ xtests += tst-resolv-qtypes # This test has dropped packet tests and runs for a long time. xtests += tst-resolv-rotate -endif +endif # $(have-thread-library) + extra-libs-others = $(extra-libs) libresolv-routines := \ base64 \ @@ -168,6 +183,13 @@ libresolv-routines := \ resolv-deprecated \ # libresolv-routines +ifeq ($(have-GLIBC_2.34)$(have-thread-library),yesyes) +# Empty compatibility library for old binaries. +extra-libs += libanl +libanl-routines += libanl-compat +libanl-shared-only-routines += libanl-compat +endif + $(libanl-routines-var) += \ gai_cancel \ gai_error \ @@ -177,9 +199,6 @@ $(libanl-routines-var) += \ getaddrinfo_a \ # $(libanl-routines-var) -libanl-routines += libanl-compat -libanl-shared-only-routines += libanl-compat - # Pretend that libanl.so is a linker script, so that the symbolic link # is not installed. 
install-lib-ldscripts = libanl.so @@ -256,8 +275,10 @@ $(objpfx)tst-resolv-ai_idn.out: $(gen-locales) $(objpfx)tst-resolv-ai_idn-latin1.out: $(gen-locales) $(objpfx)tst-resolv-ai_idn-nolibidn2.out: \ $(gen-locales) $(objpfx)tst-no-libidn2.so +$(objpfx)tst-resolv-aliases: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-basic: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-binary: $(objpfx)libresolv.so $(shared-thread-library) +$(objpfx)tst-resolv-byaddr: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-edns: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-network: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-res_init: $(objpfx)libresolv.so @@ -265,6 +286,8 @@ $(objpfx)tst-resolv-res_init-multi: $(objpfx)libresolv.so \ $(shared-thread-library) $(objpfx)tst-resolv-res_init-thread: $(objpfx)libresolv.so \ $(shared-thread-library) +$(objpfx)tst-resolv-invalid-cname: $(objpfx)libresolv.so \ + $(shared-thread-library) $(objpfx)tst-resolv-nondecimal: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-qtypes: $(objpfx)libresolv.so $(shared-thread-library) $(objpfx)tst-resolv-rotate: $(objpfx)libresolv.so $(shared-thread-library) diff --git a/resolv/README b/resolv/README index 514e9bb61..2146bc3b2 100644 --- a/resolv/README +++ b/resolv/README @@ -146,6 +146,3 @@ res_libc.c is home-brewn, although parts of it are taken from res_data.c. res_hconf.c and res_hconf.h were contributed by David Mosberger, and do not come from BIND. - -The files gethnamaddr.c, mapv4v6addr.h and mapv4v6hostent.h are -leftovers from BIND 4.9.7. diff --git a/resolv/mapv4v6addr.h b/resolv/mapv4v6addr.h deleted file mode 100644 index 7f85f7d5e..000000000 --- a/resolv/mapv4v6addr.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * ++Copyright++ 1985, 1988, 1993 - * - - * Copyright (c) 1985, 1988, 1993 - * The Regents of the University of California. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - - * Portions Copyright (c) 1993 by Digital Equipment Corporation. - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies, and that - * the name of Digital Equipment Corporation not be used in advertising or - * publicity pertaining to distribution of the document or software without - * specific, written prior permission. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL - * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL DIGITAL EQUIPMENT - * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL - * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR - * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - - * --Copyright-- - */ - -#include -#include - -static void -map_v4v6_address (const char *src, char *dst) -{ - u_char *p = (u_char *) dst; - int i; - - /* Move the IPv4 part to the right position. */ - memcpy (dst + 12, src, INADDRSZ); - - /* Mark this ipv6 addr as a mapped ipv4. */ - for (i = 0; i < 10; i++) - *p++ = 0x00; - *p++ = 0xff; - *p = 0xff; -} diff --git a/resolv/mapv4v6hostent.h b/resolv/mapv4v6hostent.h deleted file mode 100644 index c11038adf..000000000 --- a/resolv/mapv4v6hostent.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * ++Copyright++ 1985, 1988, 1993 - * - - * Copyright (c) 1985, 1988, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - - * Portions Copyright (c) 1993 by Digital Equipment Corporation. - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies, and that - * the name of Digital Equipment Corporation not be used in advertising or - * publicity pertaining to distribution of the document or software without - * specific, written prior permission. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND DIGITAL EQUIPMENT CORP. DISCLAIMS ALL - * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL DIGITAL EQUIPMENT - * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL - * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR - * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. 
- * - - * --Copyright-- - */ - -#include -#include - -typedef union { - int32_t al; - char ac; -} align; - -static int -map_v4v6_hostent (struct hostent *hp, char **bpp, int *lenp) -{ - char **ap; - - if (hp->h_addrtype != AF_INET || hp->h_length != INADDRSZ) - return 0; - hp->h_addrtype = AF_INET6; - hp->h_length = IN6ADDRSZ; - for (ap = hp->h_addr_list; *ap; ap++) - { - int i = sizeof (align) - ((u_long) *bpp % sizeof (align)); - - if (*lenp < (i + IN6ADDRSZ)) - /* Out of memory. */ - return 1; - *bpp += i; - *lenp -= i; - map_v4v6_address (*ap, *bpp); - *ap = *bpp; - *bpp += IN6ADDRSZ; - *lenp -= IN6ADDRSZ; - } - return 0; -} diff --git a/resolv/ns_name_length_uncompressed.c b/resolv/ns_name_length_uncompressed.c new file mode 100644 index 000000000..51296b47e --- /dev/null +++ b/resolv/ns_name_length_uncompressed.c @@ -0,0 +1,72 @@ +/* Skip over an uncompressed name in wire format. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +int +__ns_name_length_uncompressed (const unsigned char *p, + const unsigned char *eom) +{ + const unsigned char *start = p; + + while (true) + { + if (p == eom) + { + /* Truncated packet: no room for label length. 
*/ + __set_errno (EMSGSIZE); + return -1; + } + + unsigned char b = *p; + ++p; + if (b == 0) + { + /* Root label. */ + size_t length = p - start; + if (length > NS_MAXCDNAME) + { + /* Domain name too long. */ + __set_errno (EMSGSIZE); + return -1; + } + return length; + } + + if (b <= 63) + { + /* Regular label. */ + if (b <= eom - p) + p += b; + else + { + /* Truncated packet: label incomplete. */ + __set_errno (EMSGSIZE); + return -1; + } + } + else + { + /* Compression reference or corrupted label length. */ + __set_errno (EMSGSIZE); + return -1; + } + } +} diff --git a/resolv/ns_rr_cursor_init.c b/resolv/ns_rr_cursor_init.c new file mode 100644 index 000000000..6ee80b30e --- /dev/null +++ b/resolv/ns_rr_cursor_init.c @@ -0,0 +1,62 @@ +/* Initialize a simple DNS packet parser. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +bool +__ns_rr_cursor_init (struct ns_rr_cursor *c, + const unsigned char *buf, size_t len) +{ + c->begin = buf; + c->end = buf + len; + + /* Check for header size and 16-bit question count value (it must be 1). 
*/ + if (len < 12 || buf[4] != 0 || buf[5] != 1) + { + __set_errno (EMSGSIZE); + c->current = c->end; + return false; + } + c->current = buf + 12; + + int consumed = __ns_name_length_uncompressed (c->current, c->end); + if (consumed < 0) + { + __set_errno (EMSGSIZE); + c->current = c->end; + c->first_rr = NULL; + return false; + } + c->current += consumed; + + /* Ensure there is room for question type and class. */ + if (c->end - c->current < 4) + { + __set_errno (EMSGSIZE); + c->current = c->end; + c->first_rr = NULL; + return false; + } + c->current += 4; + c->first_rr = c->current; + + return true; +} diff --git a/resolv/ns_rr_cursor_next.c b/resolv/ns_rr_cursor_next.c new file mode 100644 index 000000000..33652fc5d --- /dev/null +++ b/resolv/ns_rr_cursor_next.c @@ -0,0 +1,74 @@ +/* Simple DNS record parser without textual name decoding. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +bool +__ns_rr_cursor_next (struct ns_rr_cursor *c, struct ns_rr_wire *rr) +{ + rr->rdata = NULL; + + /* Extract the record owner name. 
*/ + int consumed = __ns_name_unpack (c->begin, c->end, c->current, + rr->rname, sizeof (rr->rname)); + if (consumed < 0) + { + memset (rr, 0, sizeof (*rr)); + __set_errno (EMSGSIZE); + return false; + } + c->current += consumed; + + /* Extract the metadata. */ + struct + { + uint16_t rtype; + uint16_t rclass; + uint32_t ttl; + uint16_t rdlength; + } __attribute__ ((packed)) metadata; + _Static_assert (sizeof (metadata) == 10, "sizeof metadata"); + if (c->end - c->current < sizeof (metadata)) + { + memset (rr, 0, sizeof (*rr)); + __set_errno (EMSGSIZE); + return false; + } + memcpy (&metadata, c->current, sizeof (metadata)); + c->current += sizeof (metadata); + /* Endianness conversion. */ + rr->rtype = ntohs (metadata.rtype); + rr->rclass = ntohs (metadata.rclass); + rr->ttl = ntohl (metadata.ttl); + rr->rdlength = ntohs (metadata.rdlength); + + /* Extract record data. */ + if (c->end - c->current < rr->rdlength) + { + memset (rr, 0, sizeof (*rr)); + __set_errno (EMSGSIZE); + return false; + } + rr->rdata = c->current; + c->current += rr->rdlength; + + return true; +} diff --git a/resolv/ns_samebinaryname.c b/resolv/ns_samebinaryname.c new file mode 100644 index 000000000..9a47d8e97 --- /dev/null +++ b/resolv/ns_samebinaryname.c @@ -0,0 +1,55 @@ +/* Compare two binary domain names for equality. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +/* Convert ASCII letters to upper case. */ +static inline int +ascii_toupper (unsigned char ch) +{ + if (ch >= 'a' && ch <= 'z') + return ch - 'a' + 'A'; + else + return ch; +} + +bool +__ns_samebinaryname (const unsigned char *a, const unsigned char *b) +{ + while (*a != 0 && *b != 0) + { + if (*a != *b) + /* Different label length. */ + return false; + int labellen = *a; + ++a; + ++b; + for (int i = 0; i < labellen; ++i) + { + if (*a != *b && ascii_toupper (*a) != ascii_toupper (*b)) + /* Different character in label. */ + return false; + ++a; + ++b; + } + } + + /* Match if both names are at the root label. */ + return *a == 0 && *b == 0; +} diff --git a/resolv/nss_dns/dns-host.c b/resolv/nss_dns/dns-host.c index 913a5cb82..5b8e40813 100644 --- a/resolv/nss_dns/dns-host.c +++ b/resolv/nss_dns/dns-host.c @@ -69,6 +69,7 @@ * --Copyright-- */ +#include #include #include #include @@ -86,10 +87,6 @@ #include #include -/* Get implementations of some internal functions. */ -#include -#include - #define RESOLVSORT #if PACKETSZ > 65536 @@ -103,25 +100,30 @@ #endif #define MAXHOSTNAMELEN 256 -/* We need this time later. */ -typedef union querybuf -{ - HEADER hdr; - u_char buf[MAXPACKET]; -} querybuf; - -static enum nss_status getanswer_r (struct resolv_context *ctx, - const querybuf *answer, int anslen, - const char *qname, int qtype, - struct hostent *result, char *buffer, - size_t buflen, int *errnop, int *h_errnop, - int map, int32_t *ttlp, char **canonp); - -static enum nss_status gaih_getanswer (const querybuf *answer1, int anslen1, - const querybuf *answer2, int anslen2, - const char *qname, +/* For historic reasons, pointers to IP addresses are char *, so use a + single list type for addresses and host names. 
*/ +#define DYNARRAY_STRUCT ptrlist +#define DYNARRAY_ELEMENT char * +#define DYNARRAY_PREFIX ptrlist_ +#include + +static enum nss_status getanswer_r (unsigned char *packet, size_t packetlen, + uint16_t qtype, struct alloc_buffer *abuf, + struct ptrlist *addresses, + struct ptrlist *aliases, + int *errnop, int *h_errnop, int32_t *ttlp); +static void addrsort (struct resolv_context *ctx, char **ap, int num); +static enum nss_status getanswer_ptr (unsigned char *packet, size_t packetlen, + struct alloc_buffer *abuf, + char **hnamep, int *errnop, + int *h_errnop, int32_t *ttlp); + +static enum nss_status gaih_getanswer (unsigned char *packet1, + size_t packet1len, + unsigned char *packet2, + size_t packet2len, + struct alloc_buffer *abuf, struct gaih_addrtuple **pat, - char *buffer, size_t buflen, int *errnop, int *h_errnop, int32_t *ttlp); @@ -175,16 +177,9 @@ gethostbyname3_context (struct resolv_context *ctx, char *buffer, size_t buflen, int *errnop, int *h_errnop, int32_t *ttlp, char **canonp) { - union - { - querybuf *buf; - u_char *ptr; - } host_buffer; - querybuf *orig_host_buffer; char tmp[NS_MAXDNAME]; int size, type, n; const char *cp; - int map = 0; int olderr = errno; enum nss_status status; @@ -215,10 +210,12 @@ gethostbyname3_context (struct resolv_context *ctx, && (cp = __res_context_hostalias (ctx, name, tmp, sizeof (tmp))) != NULL) name = cp; - host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024); + unsigned char dns_packet_buffer[1024]; + unsigned char *alt_dns_packet_buffer = dns_packet_buffer; - n = __res_context_search (ctx, name, C_IN, type, host_buffer.buf->buf, - 1024, &host_buffer.ptr, NULL, NULL, NULL, NULL); + n = __res_context_search (ctx, name, C_IN, type, + dns_packet_buffer, sizeof (dns_packet_buffer), + &alt_dns_packet_buffer, NULL, NULL, NULL, NULL); if (n < 0) { switch (errno) @@ -245,34 +242,79 @@ gethostbyname3_context (struct resolv_context *ctx, *errnop = EAGAIN; else __set_errno (olderr); + } + else + { + struct 
alloc_buffer abuf = alloc_buffer_create (buffer, buflen); - /* If we are looking for an IPv6 address and mapping is enabled - by having the RES_USE_INET6 bit in _res.options set, we try - another lookup. */ - if (af == AF_INET6 && res_use_inet6 ()) - n = __res_context_search (ctx, name, C_IN, T_A, host_buffer.buf->buf, - host_buffer.buf != orig_host_buffer - ? MAXPACKET : 1024, &host_buffer.ptr, - NULL, NULL, NULL, NULL); + struct ptrlist addresses; + ptrlist_init (&addresses); + struct ptrlist aliases; + ptrlist_init (&aliases); - if (n < 0) + status = getanswer_r (alt_dns_packet_buffer, n, type, + &abuf, &addresses, &aliases, + errnop, h_errnop, ttlp); + if (status == NSS_STATUS_SUCCESS) { - if (host_buffer.buf != orig_host_buffer) - free (host_buffer.buf); - return status; - } + if (ptrlist_has_failed (&addresses) + || ptrlist_has_failed (&aliases)) + { + /* malloc failure. Do not retry using the ERANGE protocol. */ + *errnop = ENOMEM; + *h_errnop = NETDB_INTERNAL; + status = NSS_STATUS_UNAVAIL; + } - map = 1; + /* Reserve the address and alias arrays in the result + buffer. Both are NULL-terminated, but the first element + of the alias array is stored in h_name, so no extra space + for the NULL terminator is needed there. */ + result->h_addr_list + = alloc_buffer_alloc_array (&abuf, char *, + ptrlist_size (&addresses) + 1); + result->h_aliases + = alloc_buffer_alloc_array (&abuf, char *, + ptrlist_size (&aliases)); + if (alloc_buffer_has_failed (&abuf)) + { + /* Retry using the ERANGE protocol. */ + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + status = NSS_STATUS_TRYAGAIN; + } + else + { + /* Copy the address list and NULL-terminate it. */ + memcpy (result->h_addr_list, ptrlist_begin (&addresses), + ptrlist_size (&addresses) * sizeof (char *)); + result->h_addr_list[ptrlist_size (&addresses)] = NULL; + + /* Sort the address list if requested. 
*/ + if (type == T_A && __resolv_context_sort_count (ctx) > 0) + addrsort (ctx, result->h_addr_list, ptrlist_size (&addresses)); - result->h_addrtype = AF_INET; - result->h_length = INADDRSZ; + /* Copy the aliases, excluding the last one. */ + memcpy (result->h_aliases, ptrlist_begin (&aliases), + (ptrlist_size (&aliases) - 1) * sizeof (char *)); + result->h_aliases[ptrlist_size (&aliases) - 1] = NULL; + + /* The last alias goes into h_name. */ + assert (ptrlist_size (&aliases) >= 1); + result->h_name = ptrlist_end (&aliases)[-1]; + + /* This is also the canonical name. */ + if (canonp != NULL) + *canonp = result->h_name; + } + } + + ptrlist_free (&aliases); + ptrlist_free (&addresses); } - status = getanswer_r - (ctx, host_buffer.buf, n, name, type, result, buffer, buflen, - errnop, h_errnop, map, ttlp, canonp); - if (host_buffer.buf != orig_host_buffer) - free (host_buffer.buf); + if (alt_dns_packet_buffer != dns_packet_buffer) + free (alt_dns_packet_buffer); return status; } @@ -316,13 +358,8 @@ _nss_dns_gethostbyname_r (const char *name, struct hostent *result, *h_errnop = NETDB_INTERNAL; return NSS_STATUS_UNAVAIL; } - status = NSS_STATUS_NOTFOUND; - if (res_use_inet6 ()) - status = gethostbyname3_context (ctx, name, AF_INET6, result, buffer, - buflen, errnop, h_errnop, NULL, NULL); - if (status == NSS_STATUS_NOTFOUND) - status = gethostbyname3_context (ctx, name, AF_INET, result, buffer, - buflen, errnop, h_errnop, NULL, NULL); + status = gethostbyname3_context (ctx, name, AF_INET, result, buffer, + buflen, errnop, h_errnop, NULL, NULL); __resolv_context_put (ctx); return status; } @@ -357,27 +394,23 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, name = cp; } - union - { - querybuf *buf; - u_char *ptr; - } host_buffer; - querybuf *orig_host_buffer; - host_buffer.buf = orig_host_buffer = (querybuf *) alloca (2048); + unsigned char dns_packet_buffer[2048]; + unsigned char *alt_dns_packet_buffer = dns_packet_buffer; u_char *ans2p = 
NULL; int nans2p = 0; int resplen2 = 0; int ans2p_malloced = 0; + struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen); int olderr = errno; int n = __res_context_search (ctx, name, C_IN, T_QUERY_A_AND_AAAA, - host_buffer.buf->buf, 2048, &host_buffer.ptr, - &ans2p, &nans2p, &resplen2, &ans2p_malloced); + dns_packet_buffer, sizeof (dns_packet_buffer), + &alt_dns_packet_buffer, &ans2p, &nans2p, + &resplen2, &ans2p_malloced); if (n >= 0) { - status = gaih_getanswer (host_buffer.buf, n, (const querybuf *) ans2p, - resplen2, name, pat, buffer, buflen, - errnop, herrnop, ttlp); + status = gaih_getanswer (alt_dns_packet_buffer, n, ans2p, resplen2, + &abuf, pat, errnop, herrnop, ttlp); } else { @@ -408,12 +441,20 @@ _nss_dns_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, __set_errno (olderr); } + /* Implement the buffer resizing protocol. */ + if (alloc_buffer_has_failed (&abuf)) + { + *errnop = ERANGE; + *herrnop = NETDB_INTERNAL; + status = NSS_STATUS_TRYAGAIN; + } + /* Check whether ans2p was separately allocated. 
*/ if (ans2p_malloced) free (ans2p); - if (host_buffer.buf != orig_host_buffer) - free (host_buffer.buf); + if (alt_dns_packet_buffer != dns_packet_buffer) + free (alt_dns_packet_buffer); __resolv_context_put (ctx); return status; @@ -429,36 +470,21 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af, static const u_char tunnelled[] = { 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; static const u_char v6local[] = { 0,0, 0,1 }; const u_char *uaddr = (const u_char *)addr; - struct host_data - { - char *aliases[MAX_NR_ALIASES]; - unsigned char host_addr[16]; /* IPv4 or IPv6 */ - char *h_addr_ptrs[MAX_NR_ADDRS + 1]; - char linebuffer[0]; - } *host_data = (struct host_data *) buffer; - union - { - querybuf *buf; - u_char *ptr; - } host_buffer; - querybuf *orig_host_buffer; char qbuf[MAXDNAME+1], *qp = NULL; size_t size; int n, status; int olderr = errno; - uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data); - buffer += pad; - buflen = buflen > pad ? buflen - pad : 0; - - if (__glibc_unlikely (buflen < sizeof (struct host_data))) - { - *errnop = ERANGE; - *h_errnop = NETDB_INTERNAL; - return NSS_STATUS_TRYAGAIN; - } - - host_data = (struct host_data *) buffer; + /* Prepare the allocation buffer. Store the pointer array first, to + benefit from buffer alignment. 
*/ + struct alloc_buffer abuf = alloc_buffer_create (buffer, buflen); + char **address_array = alloc_buffer_alloc_array (&abuf, char *, 2); + if (address_array == NULL) + { + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } struct resolv_context *ctx = __resolv_context_get (); if (ctx == NULL) @@ -502,8 +528,6 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af, return NSS_STATUS_UNAVAIL; } - host_buffer.buf = orig_host_buffer = (querybuf *) alloca (1024); - switch (af) { case AF_INET: @@ -527,36 +551,52 @@ _nss_dns_gethostbyaddr2_r (const void *addr, socklen_t len, int af, break; } - n = __res_context_query (ctx, qbuf, C_IN, T_PTR, host_buffer.buf->buf, - 1024, &host_buffer.ptr, NULL, NULL, NULL, NULL); + unsigned char dns_packet_buffer[1024]; + unsigned char *alt_dns_packet_buffer = dns_packet_buffer; + n = __res_context_query (ctx, qbuf, C_IN, T_PTR, + dns_packet_buffer, sizeof (dns_packet_buffer), + &alt_dns_packet_buffer, + NULL, NULL, NULL, NULL); if (n < 0) { *h_errnop = h_errno; __set_errno (olderr); - if (host_buffer.buf != orig_host_buffer) - free (host_buffer.buf); + if (alt_dns_packet_buffer != dns_packet_buffer) + free (alt_dns_packet_buffer); __resolv_context_put (ctx); return errno == ECONNREFUSED ? NSS_STATUS_UNAVAIL : NSS_STATUS_NOTFOUND; } - status = getanswer_r - (ctx, host_buffer.buf, n, qbuf, T_PTR, result, buffer, buflen, - errnop, h_errnop, 0 /* XXX */, ttlp, NULL); - if (host_buffer.buf != orig_host_buffer) - free (host_buffer.buf); + status = getanswer_ptr (alt_dns_packet_buffer, n, + &abuf, &result->h_name, errnop, h_errnop, ttlp); + + if (alt_dns_packet_buffer != dns_packet_buffer) + free (alt_dns_packet_buffer); + __resolv_context_put (ctx); + if (status != NSS_STATUS_SUCCESS) - { - __resolv_context_put (ctx); - return status; - } + return status; + /* result->h_name has already been set by getanswer_ptr. 
*/ result->h_addrtype = af; result->h_length = len; - memcpy (host_data->host_addr, addr, len); - host_data->h_addr_ptrs[0] = (char *) host_data->host_addr; - host_data->h_addr_ptrs[1] = NULL; + /* Increase the alignment to 4, in case there are applications out + there that expect at least this level of address alignment. */ + address_array[0] = (char *) alloc_buffer_next (&abuf, uint32_t); + alloc_buffer_copy_bytes (&abuf, uaddr, len); + address_array[1] = NULL; + + /* This check also covers allocation failure in getanswer_ptr. */ + if (alloc_buffer_has_failed (&abuf)) + { + *errnop = ERANGE; + *h_errnop = NETDB_INTERNAL; + return NSS_STATUS_TRYAGAIN; + } + result->h_addr_list = address_array; + result->h_aliases = &address_array[1]; /* Points to NULL. */ + *h_errnop = NETDB_SUCCESS; - __resolv_context_put (ctx); return NSS_STATUS_SUCCESS; } libc_hidden_def (_nss_dns_gethostbyaddr2_r) @@ -618,650 +658,362 @@ addrsort (struct resolv_context *ctx, char **ap, int num) break; } -static enum nss_status -getanswer_r (struct resolv_context *ctx, - const querybuf *answer, int anslen, const char *qname, int qtype, - struct hostent *result, char *buffer, size_t buflen, - int *errnop, int *h_errnop, int map, int32_t *ttlp, char **canonp) +/* Convert the uncompressed, binary domain name CDNAME into its + textual representation and add it to the end of ALIASES, allocating + space for a copy of the name from ABUF. Skip adding the name if it + is not a valid host name, and return false in that case, otherwise + true. 
*/ +static bool +getanswer_r_store_alias (const unsigned char *cdname, + struct alloc_buffer *abuf, + struct ptrlist *aliases) { - struct host_data - { - char *aliases[MAX_NR_ALIASES]; - unsigned char host_addr[16]; /* IPv4 or IPv6 */ - char *h_addr_ptrs[0]; - } *host_data; - int linebuflen; - const HEADER *hp; - const u_char *end_of_message, *cp; - int n, ancount, qdcount; - int haveanswer, had_error; - char *bp, **ap, **hap; - char tbuf[MAXDNAME]; - const char *tname; - int (*name_ok) (const char *); - u_char packtmp[NS_MAXCDNAME]; - int have_to_map = 0; - uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data); - buffer += pad; - buflen = buflen > pad ? buflen - pad : 0; - if (__glibc_unlikely (buflen < sizeof (struct host_data))) - { - /* The buffer is too small. */ - too_small: - *errnop = ERANGE; - *h_errnop = NETDB_INTERNAL; - return NSS_STATUS_TRYAGAIN; - } - host_data = (struct host_data *) buffer; - linebuflen = buflen - sizeof (struct host_data); - if (buflen - sizeof (struct host_data) != linebuflen) - linebuflen = INT_MAX; - - tname = qname; - result->h_name = NULL; - end_of_message = answer->buf + anslen; - switch (qtype) - { - case T_A: - case T_AAAA: - name_ok = __libc_res_hnok; - break; - case T_PTR: - name_ok = __libc_res_dnok; - break; - default: - *errnop = ENOENT; - return NSS_STATUS_UNAVAIL; /* XXX should be abort(); */ - } + /* Filter out domain names that are not host names. */ + if (!__res_binary_hnok (cdname)) + return false; + + /* Note: Not NS_MAXCDNAME, so that __ns_name_ntop implicitly checks + for length. */ + char dname[MAXHOSTNAMELEN + 1]; + if (__ns_name_ntop (cdname, dname, sizeof (dname)) < 0) + return false; + /* Do not report an error on allocation failure, instead store NULL + or do nothing. getanswer_r's caller will see NSS_STATUS_SUCCESS + and detect the memory allocation failure or buffer space + exhaustion, and report it accordingly. 
*/ + ptrlist_add (aliases, alloc_buffer_copy_string (abuf, dname)); + return true; +} - /* - * find first satisfactory answer - */ - hp = &answer->hdr; - ancount = ntohs (hp->ancount); - qdcount = ntohs (hp->qdcount); - cp = answer->buf + HFIXEDSZ; - if (__glibc_unlikely (qdcount != 1)) +static enum nss_status __attribute__ ((noinline)) +getanswer_r (unsigned char *packet, size_t packetlen, uint16_t qtype, + struct alloc_buffer *abuf, + struct ptrlist *addresses, struct ptrlist *aliases, + int *errnop, int *h_errnop, int32_t *ttlp) +{ + struct ns_rr_cursor c; + if (!__ns_rr_cursor_init (&c, packet, packetlen)) { + /* This should not happen because __res_context_query already + performs response validation. */ *h_errnop = NO_RECOVERY; return NSS_STATUS_UNAVAIL; } - if (sizeof (struct host_data) + (ancount + 1) * sizeof (char *) >= buflen) - goto too_small; - bp = (char *) &host_data->h_addr_ptrs[ancount + 1]; - linebuflen -= (ancount + 1) * sizeof (char *); - - n = __ns_name_unpack (answer->buf, end_of_message, cp, - packtmp, sizeof packtmp); - if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1) - { - if (__glibc_unlikely (errno == EMSGSIZE)) - goto too_small; - n = -1; - } - - if (__glibc_unlikely (n < 0)) + /* Treat the QNAME just like an alias. Error out if it is not a + valid host name. */ + if (ns_rr_cursor_rcode (&c) == NXDOMAIN + || !getanswer_r_store_alias (ns_rr_cursor_qname (&c), abuf, aliases)) { - *errnop = errno; - *h_errnop = NO_RECOVERY; - return NSS_STATUS_UNAVAIL; - } - if (__glibc_unlikely (name_ok (bp) == 0)) - { - errno = EBADMSG; - *errnop = EBADMSG; - *h_errnop = NO_RECOVERY; - return NSS_STATUS_UNAVAIL; + if (ttlp != NULL) + /* No negative caching.
*/ + *ttlp = 0; + *h_errnop = HOST_NOT_FOUND; + *errnop = ENOENT; + return NSS_STATUS_NOTFOUND; } - cp += n + QFIXEDSZ; - if (qtype == T_A || qtype == T_AAAA) + int ancount = ns_rr_cursor_ancount (&c); + const unsigned char *expected_name = ns_rr_cursor_qname (&c); + /* expected_name may be updated to point into this buffer. */ + unsigned char name_buffer[NS_MAXCDNAME]; + + for (; ancount > 0; --ancount) { - /* res_send() has already verified that the query name is the - * same as the one we sent; this just gets the expanded name - * (i.e., with the succeeding search-domain tacked on). - */ - n = strlen (bp) + 1; /* for the \0 */ - if (n >= MAXHOSTNAMELEN) + struct ns_rr_wire rr; + if (!__ns_rr_cursor_next (&c, &rr)) { *h_errnop = NO_RECOVERY; - *errnop = ENOENT; - return NSS_STATUS_TRYAGAIN; + return NSS_STATUS_UNAVAIL; } - result->h_name = bp; - bp += n; - linebuflen -= n; - if (linebuflen < 0) - goto too_small; - /* The qname can be abbreviated, but h_name is now absolute. */ - qname = result->h_name; - } - ap = host_data->aliases; - *ap = NULL; - result->h_aliases = host_data->aliases; - hap = host_data->h_addr_ptrs; - *hap = NULL; - result->h_addr_list = host_data->h_addr_ptrs; - haveanswer = 0; - had_error = 0; + /* Skip over records with the wrong class. */ + if (rr.rclass != C_IN) + continue; - while (ancount-- > 0 && cp < end_of_message && had_error == 0) - { - int type, class; + /* Update TTL for recognized record types. */ + if ((rr.rtype == T_CNAME || rr.rtype == qtype) + && ttlp != NULL && *ttlp > rr.ttl) + *ttlp = rr.ttl; - n = __ns_name_unpack (answer->buf, end_of_message, cp, - packtmp, sizeof packtmp); - if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1) + if (rr.rtype == T_CNAME) { - if (__glibc_unlikely (errno == EMSGSIZE)) - goto too_small; - - n = -1; + /* NB: No check for owner name match, based on historic + precedent. Record the CNAME target as the new expected + name. 
*/ + int n = __ns_name_unpack (c.begin, c.end, rr.rdata, + name_buffer, sizeof (name_buffer)); + if (n < 0) + { + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + } + /* And store the new name as an alias. */ + getanswer_r_store_alias (name_buffer, abuf, aliases); + expected_name = name_buffer; } - - if (__glibc_unlikely (n < 0 || (*name_ok) (bp) == 0)) + else if (rr.rtype == qtype + && __ns_samebinaryname (rr.rname, expected_name) + && rr.rdlength == rrtype_to_rdata_length (qtype)) { - ++had_error; - continue; + /* Make a copy of the address and store it. Increase the + alignment to 4, in case there are applications out there + that expect at least this level of address alignment. */ + ptrlist_add (addresses, (char *) alloc_buffer_next (abuf, uint32_t)); + alloc_buffer_copy_bytes (abuf, rr.rdata, rr.rdlength); } - cp += n; /* name */ + } - if (__glibc_unlikely (cp + 10 > end_of_message)) - { - ++had_error; - continue; - } + if (ptrlist_size (addresses) == 0) + { + /* No address record found. */ + if (ttlp != NULL) + /* No caching of negative responses. */ + *ttlp = 0; - NS_GET16 (type, cp); - NS_GET16 (class, cp); - int32_t ttl; - NS_GET32 (ttl, cp); - NS_GET16 (n, cp); /* RDATA length. */ + *h_errnop = NO_RECOVERY; + *errnop = ENOENT; + return NSS_STATUS_TRYAGAIN; + } + else + { + *h_errnop = NETDB_SUCCESS; + return NSS_STATUS_SUCCESS; + } +} - if (end_of_message - cp < n) - { - /* RDATA extends beyond the end of the packet. */ - ++had_error; - continue; - } +static enum nss_status +getanswer_ptr (unsigned char *packet, size_t packetlen, + struct alloc_buffer *abuf, char **hnamep, + int *errnop, int *h_errnop, int32_t *ttlp) +{ + struct ns_rr_cursor c; + if (!__ns_rr_cursor_init (&c, packet, packetlen)) + { + /* This should not happen because __res_context_query already + performs response validation.
*/ + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; + } + int ancount = ns_rr_cursor_ancount (&c); + const unsigned char *expected_name = ns_rr_cursor_qname (&c); + /* expected_name may be updated to point into this buffer. */ + unsigned char name_buffer[NS_MAXCDNAME]; - if (__glibc_unlikely (class != C_IN)) + while (ancount > 0) + { + struct ns_rr_wire rr; + if (!__ns_rr_cursor_next (&c, &rr)) { - /* XXX - debug? syslog? */ - cp += n; - continue; /* XXX - had_error++ ? */ + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; } - if ((qtype == T_A || qtype == T_AAAA) && type == T_CNAME) - { - /* A CNAME could also have a TTL entry. */ - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; - - if (ap >= &host_data->aliases[MAX_NR_ALIASES - 1]) - continue; - n = __libc_dn_expand (answer->buf, end_of_message, cp, - tbuf, sizeof tbuf); - if (__glibc_unlikely (n < 0 || (*name_ok) (tbuf) == 0)) - { - ++had_error; - continue; - } - cp += n; - /* Store alias. */ - *ap++ = bp; - n = strlen (bp) + 1; /* For the \0. */ - if (__glibc_unlikely (n >= MAXHOSTNAMELEN)) - { - ++had_error; - continue; - } - bp += n; - linebuflen -= n; - /* Get canonical name. */ - n = strlen (tbuf) + 1; /* For the \0. */ - if (__glibc_unlikely (n > linebuflen)) - goto too_small; - if (__glibc_unlikely (n >= MAXHOSTNAMELEN)) - { - ++had_error; - continue; - } - result->h_name = bp; - bp = __mempcpy (bp, tbuf, n); /* Cannot overflow. */ - linebuflen -= n; - continue; - } + /* Skip over records with the wrong class. */ + if (rr.rclass != C_IN) + continue; - if (qtype == T_PTR && type == T_CNAME) - { - /* A CNAME could also have a TTL entry. */ - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; + /* Update TTL for known record types. 
*/ + if ((rr.rtype == T_CNAME || rr.rtype == T_PTR) + && ttlp != NULL && *ttlp > rr.ttl) + *ttlp = rr.ttl; - n = __libc_dn_expand (answer->buf, end_of_message, cp, - tbuf, sizeof tbuf); - if (__glibc_unlikely (n < 0 || __libc_res_dnok (tbuf) == 0)) - { - ++had_error; - continue; - } - cp += n; - /* Get canonical name. */ - n = strlen (tbuf) + 1; /* For the \0. */ - if (__glibc_unlikely (n > linebuflen)) - goto too_small; - if (__glibc_unlikely (n >= MAXHOSTNAMELEN)) + if (rr.rtype == T_CNAME) + { + /* NB: No check for owner name match, based on historic + precedent. Record the CNAME target as the new expected + name. */ + int n = __ns_name_unpack (c.begin, c.end, rr.rdata, + name_buffer, sizeof (name_buffer)); + if (n < 0) { - ++had_error; - continue; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; } - tname = bp; - bp = __mempcpy (bp, tbuf, n); /* Cannot overflow. */ - linebuflen -= n; - continue; + expected_name = name_buffer; } - - if (type == T_A && qtype == T_AAAA && map) - have_to_map = 1; - else if (__glibc_unlikely (type != qtype)) + else if (rr.rtype == T_PTR + && __ns_samebinaryname (rr.rname, expected_name)) { - cp += n; - continue; /* XXX - had_error++ ? */ - } - - switch (type) - { - case T_PTR: - if (__glibc_unlikely (__strcasecmp (tname, bp) != 0)) + /* Decompress the target of the PTR record. This is the + host name we are looking for. We can only use it if it + is syntactically valid. Historically, only one host name + is returned here. If the recursive resolver performs DNS + record rotation, the returned host name is essentially + random, which is why multiple PTR records are rarely + used. Use MAXHOSTNAMELEN instead of NS_MAXCDNAME for + additional length checking. */ + char hname[MAXHOSTNAMELEN + 1]; + if (__ns_name_unpack (c.begin, c.end, rr.rdata, + name_buffer, sizeof (name_buffer)) < 0 + || !__res_binary_hnok (expected_name) + || __ns_name_ntop (name_buffer, hname, sizeof (hname)) < 0) { - cp += n; - continue; /* XXX - had_error++ ? 
*/ + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; } - - n = __ns_name_unpack (answer->buf, end_of_message, cp, - packtmp, sizeof packtmp); - if (n != -1 && __ns_name_ntop (packtmp, bp, linebuflen) == -1) - { - if (__glibc_unlikely (errno == EMSGSIZE)) - goto too_small; - - n = -1; - } - - if (__glibc_unlikely (n < 0 || __libc_res_hnok (bp) == 0)) - { - ++had_error; - break; - } - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; - /* bind would put multiple PTR records as aliases, but we don't do - that. */ - result->h_name = bp; - *h_errnop = NETDB_SUCCESS; + /* Successful allocation is checked by the caller. */ + *hnamep = alloc_buffer_copy_string (abuf, hname); return NSS_STATUS_SUCCESS; - case T_A: - case T_AAAA: - if (__glibc_unlikely (__strcasecmp (result->h_name, bp) != 0)) - { - cp += n; - continue; /* XXX - had_error++ ? */ - } - - /* Stop parsing at a record whose length is incorrect. */ - if (n != rrtype_to_rdata_length (type)) - { - ++had_error; - break; - } - - /* Skip records of the wrong type. */ - if (n != result->h_length) - { - cp += n; - continue; - } - if (!haveanswer) - { - int nn; - - /* We compose a single hostent out of the entire chain of - entries, so the TTL of the hostent is essentially the lowest - TTL in the chain. */ - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; - if (canonp != NULL) - *canonp = bp; - result->h_name = bp; - nn = strlen (bp) + 1; /* for the \0 */ - bp += nn; - linebuflen -= nn; - } - - /* Provide sufficient alignment for both address - families. 
*/ - enum { align = 4 }; - _Static_assert ((align % __alignof__ (struct in_addr)) == 0, - "struct in_addr alignment"); - _Static_assert ((align % __alignof__ (struct in6_addr)) == 0, - "struct in6_addr alignment"); - { - char *new_bp = PTR_ALIGN_UP (bp, align); - linebuflen -= new_bp - bp; - bp = new_bp; - } - - if (__glibc_unlikely (n > linebuflen)) - goto too_small; - bp = __mempcpy (*hap++ = bp, cp, n); - cp += n; - linebuflen -= n; - break; - default: - abort (); } - if (had_error == 0) - ++haveanswer; } - if (haveanswer > 0) - { - *ap = NULL; - *hap = NULL; - /* - * Note: we sort even if host can take only one address - * in its return structures - should give it the "best" - * address in that case, not some random one - */ - if (haveanswer > 1 && qtype == T_A - && __resolv_context_sort_count (ctx) > 0) - addrsort (ctx, host_data->h_addr_ptrs, haveanswer); - - if (result->h_name == NULL) - { - n = strlen (qname) + 1; /* For the \0. */ - if (n > linebuflen) - goto too_small; - if (n >= MAXHOSTNAMELEN) - goto no_recovery; - result->h_name = bp; - bp = __mempcpy (bp, qname, n); /* Cannot overflow. */ - linebuflen -= n; - } + /* No PTR record found. */ + if (ttlp != NULL) + /* No caching of negative responses. */ + *ttlp = 0; - if (have_to_map) - if (map_v4v6_hostent (result, &bp, &linebuflen)) - goto too_small; - *h_errnop = NETDB_SUCCESS; - return NSS_STATUS_SUCCESS; - } - no_recovery: *h_errnop = NO_RECOVERY; *errnop = ENOENT; - /* Special case here: if the resolver sent a result but it only - contains a CNAME while we are looking for a T_A or T_AAAA record, - we fail with NOTFOUND instead of TRYAGAIN. */ - return ((qtype == T_A || qtype == T_AAAA) && ap != host_data->aliases - ? NSS_STATUS_NOTFOUND : NSS_STATUS_TRYAGAIN); + return NSS_STATUS_TRYAGAIN; } - +/* Parses DNS data found in PACKETLEN bytes at PACKET in struct + gaih_addrtuple address tuples. 
The new address tuples are linked + from **TAILP, with backing store allocated from ABUF, and *TAILP is + updated to point where the next tuple pointer should be stored. If + TTLP is not null, *TTLP is updated to reflect the minimum TTL. If + STORE_CANON is true, the canonical name is stored as part of the + first address tuple being written. */ static enum nss_status -gaih_getanswer_slice (const querybuf *answer, int anslen, const char *qname, - struct gaih_addrtuple ***patp, - char **bufferp, size_t *buflenp, - int *errnop, int *h_errnop, int32_t *ttlp, int *firstp) +gaih_getanswer_slice (unsigned char *packet, size_t packetlen, + struct alloc_buffer *abuf, + struct gaih_addrtuple ***tailp, + int *errnop, int *h_errnop, int32_t *ttlp, + bool store_canon) { - char *buffer = *bufferp; - size_t buflen = *buflenp; - - struct gaih_addrtuple **pat = *patp; - const HEADER *hp = &answer->hdr; - int ancount = ntohs (hp->ancount); - int qdcount = ntohs (hp->qdcount); - const u_char *cp = answer->buf + HFIXEDSZ; - const u_char *end_of_message = answer->buf + anslen; - if (__glibc_unlikely (qdcount != 1)) - { - *h_errnop = NO_RECOVERY; - return NSS_STATUS_UNAVAIL; - } - - u_char packtmp[NS_MAXCDNAME]; - int n = __ns_name_unpack (answer->buf, end_of_message, cp, - packtmp, sizeof packtmp); - /* We unpack the name to check it for validity. But we do not need - it later. */ - if (n != -1 && __ns_name_ntop (packtmp, buffer, buflen) == -1) - { - if (__glibc_unlikely (errno == EMSGSIZE)) - { - too_small: - *errnop = ERANGE; - *h_errnop = NETDB_INTERNAL; - return NSS_STATUS_TRYAGAIN; - } - - n = -1; - } - - if (__glibc_unlikely (n < 0)) + struct ns_rr_cursor c; + if (!__ns_rr_cursor_init (&c, packet, packetlen)) { - *errnop = errno; + /* This should not happen because __res_context_query already + performs response validation.
*/ *h_errnop = NO_RECOVERY; return NSS_STATUS_UNAVAIL; } - if (__glibc_unlikely (__libc_res_hnok (buffer) == 0)) - { - errno = EBADMSG; - *errnop = EBADMSG; - *h_errnop = NO_RECOVERY; - return NSS_STATUS_UNAVAIL; - } - cp += n + QFIXEDSZ; - - int haveanswer = 0; - int had_error = 0; - char *canon = NULL; - char *h_name = NULL; - int h_namelen = 0; - - if (ancount == 0) + bool haveanswer = false; /* Set to true if at least one address. */ + uint16_t qtype = ns_rr_cursor_qtype (&c); + int ancount = ns_rr_cursor_ancount (&c); + const unsigned char *expected_name = ns_rr_cursor_qname (&c); + /* expected_name may be updated to point into this buffer. */ + unsigned char name_buffer[NS_MAXCDNAME]; + + /* This is a pointer to a possibly-compressed name in the packet. + Eventually it is equivalent to the canonical name. If needed, it + is uncompressed and translated to text form when the first + address tuple is encountered. */ + const unsigned char *compressed_alias_name = expected_name; + + if (ancount == 0 || !__res_binary_hnok (compressed_alias_name)) { *h_errnop = HOST_NOT_FOUND; return NSS_STATUS_NOTFOUND; } - while (ancount-- > 0 && cp < end_of_message && had_error == 0) + for (; ancount > -0; --ancount) { - n = __ns_name_unpack (answer->buf, end_of_message, cp, - packtmp, sizeof packtmp); - if (n != -1 && - (h_namelen = __ns_name_ntop (packtmp, buffer, buflen)) == -1) + struct ns_rr_wire rr; + if (!__ns_rr_cursor_next (&c, &rr)) { - if (__glibc_unlikely (errno == EMSGSIZE)) - goto too_small; - - n = -1; - } - if (__glibc_unlikely (n < 0 || __libc_res_hnok (buffer) == 0)) - { - ++had_error; - continue; - } - if (*firstp && canon == NULL) - { - h_name = buffer; - buffer += h_namelen; - buflen -= h_namelen; - } - - cp += n; /* name */ - - if (__glibc_unlikely (cp + 10 > end_of_message)) - { - ++had_error; - continue; - } - - uint16_t type; - NS_GET16 (type, cp); - uint16_t class; - NS_GET16 (class, cp); - int32_t ttl; - NS_GET32 (ttl, cp); - NS_GET16 (n, cp); /* RDATA 
length. */ - - if (end_of_message - cp < n) - { - /* RDATA extends beyond the end of the packet. */ - ++had_error; - continue; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; } - if (class != C_IN) - { - cp += n; - continue; - } + /* Update TTL for known record types. */ + if ((rr.rtype == T_CNAME || rr.rtype == qtype) + && ttlp != NULL && *ttlp > rr.ttl) + *ttlp = rr.ttl; - if (type == T_CNAME) + if (rr.rtype == T_CNAME) { - char tbuf[MAXDNAME]; - - /* A CNAME could also have a TTL entry. */ - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; - - n = __libc_dn_expand (answer->buf, end_of_message, cp, - tbuf, sizeof tbuf); - if (__glibc_unlikely (n < 0 || __libc_res_hnok (tbuf) == 0)) - { - ++had_error; - continue; - } - cp += n; - - if (*firstp) + /* NB: No check for owner name match, based on historic + precedent. Record the CNAME target as the new expected + name. */ + int n = __ns_name_unpack (c.begin, c.end, rr.rdata, + name_buffer, sizeof (name_buffer)); + if (n < 0) { - /* Reclaim buffer space. */ - if (h_name + h_namelen == buffer) - { - buffer = h_name; - buflen += h_namelen; - } - - n = strlen (tbuf) + 1; - if (__glibc_unlikely (n > buflen)) - goto too_small; - if (__glibc_unlikely (n >= MAXHOSTNAMELEN)) - { - ++had_error; - continue; - } - - canon = buffer; - buffer = __mempcpy (buffer, tbuf, n); - buflen -= n; - h_namelen = 0; + *h_errnop = NO_RECOVERY; + return NSS_STATUS_UNAVAIL; } - continue; + expected_name = name_buffer; + if (store_canon && __res_binary_hnok (name_buffer)) + /* This name can be used as a canonical name. Do not + translate to text form here to conserve buffer space. + Point to the compressed name because name_buffer can be + overwritten with an unusable name later. */ + compressed_alias_name = rr.rdata; } - - /* Stop parsing if we encounter a record with incorrect RDATA - length. 
*/ - if (type == T_A || type == T_AAAA) + else if (rr.rtype == qtype + && __ns_samebinaryname (rr.rname, expected_name) + && rr.rdlength == rrtype_to_rdata_length (qtype)) { - if (n != rrtype_to_rdata_length (type)) + struct gaih_addrtuple *ntup + = alloc_buffer_alloc (abuf, struct gaih_addrtuple); + /* Delay error reporting to the callers (they implement the + ERANGE buffer resizing handshake). */ + if (ntup != NULL) { - ++had_error; - continue; + ntup->next = NULL; + if (store_canon && compressed_alias_name != NULL) + { + /* This assumes that all the CNAME records come + first. Use MAXHOSTNAMELEN instead of + NS_MAXCDNAME for additional length checking. + However, these checks are not expected to fail + because all size NS_MAXCDNAME names should fit into + the hname buffer because no escaping is + needed. */ + char unsigned nbuf[NS_MAXCDNAME]; + char hname[MAXHOSTNAMELEN + 1]; + if (__ns_name_unpack (c.begin, c.end, + compressed_alias_name, + nbuf, sizeof (nbuf)) >= 0 + && __ns_name_ntop (nbuf, hname, sizeof (hname)) >= 0) + /* Space checking is performed by the callers. */ + ntup->name = alloc_buffer_copy_string (abuf, hname); + store_canon = false; + } + else + ntup->name = NULL; + if (rr.rdlength == 4) + ntup->family = AF_INET; + else + ntup->family = AF_INET6; + memcpy (ntup->addr, rr.rdata, rr.rdlength); + ntup->scopeid = 0; + + /* Link in the new tuple, and update the tail pointer to + point to its next field. */ + **tailp = ntup; + *tailp = &ntup->next; + + haveanswer = true; } } - else - { - /* Skip unknown records. */ - cp += n; - continue; - } - - assert (type == T_A || type == T_AAAA); - if (*pat == NULL) - { - uintptr_t pad = (-(uintptr_t) buffer - % __alignof__ (struct gaih_addrtuple)); - buffer += pad; - buflen = buflen > pad ?
buflen - pad : 0; - - if (__glibc_unlikely (buflen < sizeof (struct gaih_addrtuple))) - goto too_small; - - *pat = (struct gaih_addrtuple *) buffer; - buffer += sizeof (struct gaih_addrtuple); - buflen -= sizeof (struct gaih_addrtuple); - } - - (*pat)->name = NULL; - (*pat)->next = NULL; - - if (*firstp) - { - /* We compose a single hostent out of the entire chain of - entries, so the TTL of the hostent is essentially the lowest - TTL in the chain. */ - if (ttlp != NULL && ttl < *ttlp) - *ttlp = ttl; - - (*pat)->name = canon ?: h_name; - - *firstp = 0; - } - - (*pat)->family = type == T_A ? AF_INET : AF_INET6; - memcpy ((*pat)->addr, cp, n); - cp += n; - (*pat)->scopeid = 0; - - pat = &((*pat)->next); - - haveanswer = 1; } if (haveanswer) { - *patp = pat; - *bufferp = buffer; - *buflenp = buflen; - *h_errnop = NETDB_SUCCESS; return NSS_STATUS_SUCCESS; } - - /* Special case here: if the resolver sent a result but it only - contains a CNAME while we are looking for a T_A or T_AAAA record, - we fail with NOTFOUND instead of TRYAGAIN. */ - if (canon != NULL) + else { + /* Special case here: if the resolver sent a result but it only + contains a CNAME while we are looking for a T_A or T_AAAA + record, we fail with NOTFOUND. 
*/ *h_errnop = HOST_NOT_FOUND; return NSS_STATUS_NOTFOUND; } - - *h_errnop = NETDB_INTERNAL; - return NSS_STATUS_TRYAGAIN; } static enum nss_status -gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2, - int anslen2, const char *qname, - struct gaih_addrtuple **pat, char *buffer, size_t buflen, +gaih_getanswer (unsigned char *packet1, size_t packet1len, + unsigned char *packet2, size_t packet2len, + struct alloc_buffer *abuf, struct gaih_addrtuple **pat, int *errnop, int *h_errnop, int32_t *ttlp) { - int first = 1; - enum nss_status status = NSS_STATUS_NOTFOUND; /* Combining the NSS status of two distinct queries requires some @@ -1273,7 +1025,10 @@ gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2, between TRYAGAIN (recoverable) and TRYAGAIN' (not-recoverable). A recoverable TRYAGAIN is almost always due to buffer size issues and returns ERANGE in errno and the caller is expected to retry - with a larger buffer. + with a larger buffer. (The caller, _nss_dns_gethostbyname4_r, + ignores the return status if it detects that the result buffer + has been exhausted and generates a TRYAGAIN failure with an + ERANGE code.) Lastly, you may be tempted to make significant changes to the conditions in this code to bring about symmetry between responses. @@ -1353,36 +1108,30 @@ gaih_getanswer (const querybuf *answer1, int anslen1, const querybuf *answer2, is a recoverable error we now return TRYAGIN even if the first response was SUCCESS. */ - if (anslen1 > 0) - status = gaih_getanswer_slice(answer1, anslen1, qname, - &pat, &buffer, &buflen, - errnop, h_errnop, ttlp, - &first); - - if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND - || (status == NSS_STATUS_TRYAGAIN - /* We want to look at the second answer in case of an - NSS_STATUS_TRYAGAIN only if the error is non-recoverable, i.e. - *h_errnop is NO_RECOVERY. 
If not, and if the failure was due to - an insufficient buffer (ERANGE), then we need to drop the results - and pass on the NSS_STATUS_TRYAGAIN to the caller so that it can - repeat the query with a larger buffer. */ - && (*errnop != ERANGE || *h_errnop == NO_RECOVERY))) - && answer2 != NULL && anslen2 > 0) + if (packet1len > 0) + { + status = gaih_getanswer_slice (packet1, packet1len, + abuf, &pat, errnop, h_errnop, ttlp, true); + if (alloc_buffer_has_failed (abuf)) + /* Do not try parsing the second packet if a larger result + buffer is needed. The caller implements the resizing + protocol because *abuf has been exhausted. */ + return NSS_STATUS_TRYAGAIN; /* Ignored by the caller. */ + } + + if ((status == NSS_STATUS_SUCCESS || status == NSS_STATUS_NOTFOUND) + && packet2 != NULL && packet2len > 0) { - enum nss_status status2 = gaih_getanswer_slice(answer2, anslen2, qname, - &pat, &buffer, &buflen, - errnop, h_errnop, ttlp, - &first); + enum nss_status status2 + = gaih_getanswer_slice (packet2, packet2len, + abuf, &pat, errnop, h_errnop, ttlp, + /* Success means that data with a + canonical name has already been + stored. Do not store the name again. */ + status != NSS_STATUS_SUCCESS); /* Use the second response status in some cases. */ if (status != NSS_STATUS_SUCCESS && status2 != NSS_STATUS_NOTFOUND) status = status2; - /* Do not return a truncated second response (unless it was - unavoidable e.g. unrecoverable TRYAGAIN). 
*/ - if (status == NSS_STATUS_SUCCESS - && (status2 == NSS_STATUS_TRYAGAIN - && *errnop == ERANGE && *h_errnop != NO_RECOVERY)) - status = NSS_STATUS_TRYAGAIN; } return status; diff --git a/resolv/res-name-checking.c b/resolv/res-name-checking.c index 07a412d8f..213edceaf 100644 --- a/resolv/res-name-checking.c +++ b/resolv/res-name-checking.c @@ -138,6 +138,12 @@ binary_leading_dash (const unsigned char *dn) return dn[0] > 0 && dn[1] == '-'; } +bool +__res_binary_hnok (const unsigned char *dn) +{ + return !binary_leading_dash (dn) && binary_hnok (dn); +} + /* Return 1 if res_hnok is a valid host name. Labels must only contain [0-9a-zA-Z_-] characters, and the name must not start with a '-'. The latter is to avoid confusion with program options. */ @@ -145,11 +151,9 @@ int ___res_hnok (const char *dn) { unsigned char buf[NS_MAXCDNAME]; - if (!printable_string (dn) - || __ns_name_pton (dn, buf, sizeof (buf)) < 0 - || binary_leading_dash (buf)) - return 0; - return binary_hnok (buf); + return (printable_string (dn) + && __ns_name_pton (dn, buf, sizeof (buf)) >= 0 + && __res_binary_hnok (buf)); } versioned_symbol (libc, ___res_hnok, res_hnok, GLIBC_2_34); versioned_symbol (libc, ___res_hnok, __libc_res_hnok, GLIBC_PRIVATE); diff --git a/resolv/tst-ns_name_length_uncompressed.c b/resolv/tst-ns_name_length_uncompressed.c new file mode 100644 index 000000000..c4a2904db --- /dev/null +++ b/resolv/tst-ns_name_length_uncompressed.c @@ -0,0 +1,135 @@ +/* Test __ns_name_length_uncompressed. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include + +/* Reference implementation based on other building blocks. */ +static int +reference_length (const unsigned char *p, const unsigned char *eom) +{ + unsigned char buf[NS_MAXCDNAME]; + int n = __ns_name_unpack (p, eom, p, buf, sizeof (buf)); + if (n < 0) + return n; + const unsigned char *q = buf; + if (__ns_name_skip (&q, array_end (buf)) < 0) + return -1; + if (q - buf != n) + /* Compressed name. */ + return -1; + return n; +} + +static int +do_test (void) +{ + { + unsigned char buf[] = { 3, 'w', 'w', 'w', 0, 0, 0 }; + TEST_COMPARE (reference_length (buf, array_end (buf)), sizeof (buf) - 2); + TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)), + sizeof (buf) - 2); + TEST_COMPARE (reference_length (array_end (buf) - 1, array_end (buf)), 1); + TEST_COMPARE (__ns_name_length_uncompressed (array_end (buf) - 1, + array_end (buf)), 1); + buf[4] = 0xc0; /* Forward compression reference. */ + buf[5] = 0x06; + TEST_COMPARE (reference_length (buf, array_end (buf)), -1); + TEST_COMPARE (__ns_name_length_uncompressed (buf, array_end (buf)), -1); + } + + struct support_next_to_fault ntf = support_next_to_fault_allocate (300); + + /* Buffer region with all possible bytes at start and end. 
*/ + for (int length = 1; length <= 300; ++length) + { + unsigned char *end = (unsigned char *) ntf.buffer + ntf.length; + unsigned char *start = end - length; + memset (start, 'X', length); + for (int first = 0; first <= 255; ++first) + { + *start = first; + for (int last = 0; last <= 255; ++last) + { + start[length - 1] = last; + TEST_COMPARE (reference_length (start, end), + __ns_name_length_uncompressed (start, end)); + } + } + } + + /* Poor man's fuzz testing: patch two bytes. */ + { + unsigned char ref[] = + { + 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 3, 'n', 'e', 't', 0, 0, 0 + }; + TEST_COMPARE (reference_length (ref, array_end (ref)), 13); + TEST_COMPARE (__ns_name_length_uncompressed (ref, array_end (ref)), 13); + + int good = 0; + int bad = 0; + for (int length = 1; length <= sizeof (ref); ++length) + { + unsigned char *end = (unsigned char *) ntf.buffer + ntf.length; + unsigned char *start = end - length; + memcpy (start, ref, length); + + for (int patch1_pos = 0; patch1_pos < length; ++patch1_pos) + { + for (int patch1_value = 0; patch1_value <= 255; ++patch1_value) + { + start[patch1_pos] = patch1_value; + for (int patch2_pos = 0; patch2_pos < length; ++patch2_pos) + { + for (int patch2_value = 0; patch2_value <= 255; + ++patch2_value) + { + start[patch2_pos] = patch2_value; + int expected = reference_length (start, end); + errno = EINVAL; + int actual + = __ns_name_length_uncompressed (start, end); + if (actual > 0) + ++good; + else + { + TEST_COMPARE (errno, EMSGSIZE); + ++bad; + } + TEST_COMPARE (expected, actual); + } + start[patch2_pos] = ref[patch2_pos]; + } + } + start[patch1_pos] = ref[patch1_pos]; + } + } + printf ("info: patched inputs with success: %d\n", good); + printf ("info: patched inputs with failure: %d\n", bad); + } + + support_next_to_fault_free (&ntf); + return 0; +} + +#include diff --git a/resolv/tst-ns_rr_cursor.c b/resolv/tst-ns_rr_cursor.c new file mode 100644 index 000000000..c3c090890 --- /dev/null +++ 
b/resolv/tst-ns_rr_cursor.c @@ -0,0 +1,227 @@ +/* Tests for resource record parsing. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +/* Reference packet for packet parsing. */ +static const unsigned char valid_packet[] = + { 0x11, 0x12, 0x13, 0x14, + 0x00, 0x01, /* Question count. */ + 0x00, 0x02, /* Answer count. */ + 0x21, 0x22, 0x23, 0x24, /* Other counts (not actually in packet). */ + 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0, + 0x00, 0x1c, /* Question type: AAAA. */ + 0x00, 0x01, /* Question class: IN. */ + 0xc0, 0x0c, /* Compression reference to QNAME. */ + 0x00, 0x1c, /* Record type: AAAA. */ + 0x00, 0x01, /* Record class: IN. */ + 0x12, 0x34, 0x56, 0x78, /* Record TTL. */ + 0x00, 0x10, /* Record data length (16 bytes). */ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* IPv6 address. */ + 0xc0, 0x0c, /* Compression reference to QNAME. */ + 0x00, 0x1c, /* Record type: AAAA. */ + 0x00, 0x01, /* Record class: IN. */ + 0x11, 0x33, 0x55, 0x77, /* Record TTL. */ + 0x00, 0x10, /* Record data length (16 bytes). */ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* IPv6 address. 
*/ + }; + +/* Special offsets in valid_packet. */ +enum + { + offset_of_first_record = 29, + offset_of_second_record = 57, + }; + +/* Check that parsing valid_packet succeeds. */ +static void +test_valid (void) +{ + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, valid_packet, + sizeof (valid_packet))); + TEST_COMPARE (ns_rr_cursor_rcode (&c), 4); + TEST_COMPARE (ns_rr_cursor_ancount (&c), 2); + TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122); + TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324); + TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13); + TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA); + TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN); + TEST_COMPARE (c.current - valid_packet, offset_of_first_record); + + struct ns_rr_wire r; + TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r)); + TEST_COMPARE (r.rtype, T_AAAA); + TEST_COMPARE (r.rclass, C_IN); + TEST_COMPARE (r.ttl, 0x12345678); + TEST_COMPARE_BLOB (r.rdata, r.rdlength, + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16); + TEST_COMPARE (c.current - valid_packet, offset_of_second_record); + TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r)); + TEST_COMPARE (r.rtype, T_AAAA); + TEST_COMPARE (r.rclass, C_IN); + TEST_COMPARE (r.ttl, 0x11335577); + TEST_COMPARE_BLOB (r.rdata, r.rdlength, + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf", 16); + TEST_VERIFY (c.current == c.end); +} + +/* Check that trying to parse a packet with a compressed QNAME fails. */ +static void +test_compressed_qname (void) +{ + static const unsigned char packet[] = + { 0x11, 0x12, 0x13, 0x14, + 0x00, 0x01, /* Question count. */ + 0x00, 0x00, /* Answer count. */ + 0x00, 0x00, 0x00, 0x00, /* Other counts. */ + 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04, + 0x00, 0x01, /* Question type: A. */ + 0x00, 0x01, /* Question class: IN. 
*/ + }; + + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet))); +} + +/* Check that trying to parse a packet with two questions fails. */ +static void +test_two_questions (void) +{ + static const unsigned char packet[] = + { 0x11, 0x12, 0x13, 0x14, + 0x00, 0x02, /* Question count. */ + 0x00, 0x00, /* Answer count. */ + 0x00, 0x00, 0x00, 0x00, /* Other counts. */ + 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04, + 0x00, 0x01, /* Question type: A. */ + 0x00, 0x01, /* Question class: IN. */ + 3, 'w', 'w', 'w', 7, 'e', 'x', 'a', 'm', 'p', 'l', 'e', 0xc0, 0x04, + 0x00, 0x1c, /* Question type: AAAA. */ + 0x00, 0x01, /* Question class: IN. */ + }; + + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, packet, sizeof (packet))); +} + +/* Used to check that parsing truncated packets does not over-read. */ +static struct support_next_to_fault ntf; + +/* Truncated packet in the second resource record. */ +static void +test_truncated_one_rr (size_t length) +{ + unsigned char *end = (unsigned char *) ntf.buffer + ntf.length; + unsigned char *start = end - length; + + /* Produce the truncated packet.
*/ + memcpy (start, valid_packet, length); + + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length)); + TEST_COMPARE (ns_rr_cursor_rcode (&c), 4); + TEST_COMPARE (ns_rr_cursor_ancount (&c), 2); + TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122); + TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324); + TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13); + TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA); + TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN); + TEST_COMPARE (c.current - start, offset_of_first_record); + + struct ns_rr_wire r; + TEST_VERIFY_EXIT (__ns_rr_cursor_next (&c, &r)); + TEST_COMPARE (r.rtype, T_AAAA); + TEST_COMPARE (r.rclass, C_IN); + TEST_COMPARE (r.ttl, 0x12345678); + TEST_COMPARE_BLOB (r.rdata, r.rdlength, + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f", 16); + TEST_COMPARE (c.current - start, offset_of_second_record); + TEST_VERIFY (!__ns_rr_cursor_next (&c, &r)); +} + +/* Truncated packet in the first resource record. */ +static void +test_truncated_no_rr (size_t length) +{ + unsigned char *end = (unsigned char *) ntf.buffer + ntf.length; + unsigned char *start = end - length; + + /* Produce the truncated packet. */ + memcpy (start, valid_packet, length); + + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (__ns_rr_cursor_init (&c, start, length)); + TEST_COMPARE (ns_rr_cursor_rcode (&c), 4); + TEST_COMPARE (ns_rr_cursor_ancount (&c), 2); + TEST_COMPARE (ns_rr_cursor_nscount (&c), 0x2122); + TEST_COMPARE (ns_rr_cursor_adcount (&c), 0x2324); + TEST_COMPARE_BLOB (ns_rr_cursor_qname (&c), 13, &valid_packet[12], 13); + TEST_COMPARE (ns_rr_cursor_qtype (&c), T_AAAA); + TEST_COMPARE (ns_rr_cursor_qclass (&c), C_IN); + TEST_COMPARE (c.current - start, offset_of_first_record); + + struct ns_rr_wire r; + TEST_VERIFY (!__ns_rr_cursor_next (&c, &r)); +} + +/* Truncated packet before first resource record.
*/ +static void +test_truncated_before_rr (size_t length) +{ + unsigned char *end = (unsigned char *) ntf.buffer + ntf.length; + unsigned char *start = end - length; + + /* Produce the truncated packet. */ + memcpy (start, valid_packet, length); + + struct ns_rr_cursor c; + TEST_VERIFY_EXIT (!__ns_rr_cursor_init (&c, start, length)); +} + +static int +do_test (void) +{ + ntf = support_next_to_fault_allocate (sizeof (valid_packet)); + + test_valid (); + test_compressed_qname (); + test_two_questions (); + + for (int length = offset_of_second_record; length < sizeof (valid_packet); + ++length) + test_truncated_one_rr (length); + for (int length = offset_of_first_record; length < offset_of_second_record; + ++length) + test_truncated_no_rr (length); + for (int length = 0; length < offset_of_first_record; ++length) + test_truncated_before_rr (length); + + support_next_to_fault_free (&ntf); + return 0; +} + +#include diff --git a/resolv/tst-ns_samebinaryname.c b/resolv/tst-ns_samebinaryname.c new file mode 100644 index 000000000..b06ac610b --- /dev/null +++ b/resolv/tst-ns_samebinaryname.c @@ -0,0 +1,62 @@ +/* Test the __ns_samebinaryname function. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + .
*/ + +#include +#include +#include +#include +#include + +/* First character denotes the comparison group: All names with the + same first character are expected to compare equal. */ +static const char *const cases[] = + { + " ", + "1\001a", "1\001A", + "2\002ab", "2\002aB", "2\002Ab", "2\002AB", + "3\001a\002ab", "3\001A\002ab", + "w\003www\007example\003com", "w\003Www\007Example\003Com", + "w\003WWW\007EXAMPLE\003COM", + "W\003WWW", "W\003www", + }; + +static int +do_test (void) +{ + for (int i = 0; i < array_length (cases); ++i) + for (int j = 0; j < array_length (cases); ++j) + { + unsigned char *a = (unsigned char *) &cases[i][1]; + unsigned char *b = (unsigned char *) &cases[j][1]; + bool actual = __ns_samebinaryname (a, b); + bool expected = cases[i][0] == cases[j][0]; + if (actual != expected) + { + char a1[NS_MAXDNAME]; + TEST_VERIFY (ns_name_ntop (a, a1, sizeof (a1)) > 0); + char b1[NS_MAXDNAME]; + TEST_VERIFY (ns_name_ntop (b, b1, sizeof (b1)) > 0); + printf ("error: \"%s\" \"%s\": expected %s\n", + a1, b1, expected ? "equal" : "unequal"); + support_record_failure (); + } + } + return 0; +} + +#include diff --git a/resolv/tst-resolv-aliases.c b/resolv/tst-resolv-aliases.c new file mode 100644 index 000000000..b212823aa --- /dev/null +++ b/resolv/tst-resolv-aliases.c @@ -0,0 +1,254 @@ +/* Test alias handling (mainly for gethostbyname). + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tst-resolv-maybe_insert_sig.h" + +/* QNAME format: + + aADDRESSES-cCNAMES.example.net + + CNAMES is the length of the CNAME chain, ADDRESSES is the number of + addresses in the response. The special value 255 means that there + are no addresses, and the RCODE is NXDOMAIN. */ +static void +response (const struct resolv_response_context *ctx, + struct resolv_response_builder *b, + const char *qname, uint16_t qclass, uint16_t qtype) +{ + TEST_COMPARE (qclass, C_IN); + if (qtype != T_A) + TEST_COMPARE (qtype, T_AAAA); + + unsigned int addresses, cnames; + char *tail; + if (sscanf (qname, "a%u-c%u%ms", &addresses, &cnames, &tail) == 3) + { + if (strcmp (tail, ".example.com") == 0 + || strcmp (tail, ".example.net.example.net") == 0 + || strcmp (tail, ".example.net.example.com") == 0) + /* These only happen after NXDOMAIN. */ + TEST_VERIFY (addresses == 255); + else if (strcmp (tail, ".example.net") != 0) + FAIL_EXIT1 ("invalid QNAME: %s", qname); + } + free (tail); + + int rcode; + if (addresses == 255) + { + /* Special case: Use no addresses with NXDOMAIN response. */ + rcode = ns_r_nxdomain; + addresses = 0; + } + else + rcode = 0; + + struct resolv_response_flags flags = { .rcode = rcode }; + resolv_response_init (b, flags); + resolv_response_add_question (b, qname, qclass, qtype); + resolv_response_section (b, ns_s_an); + maybe_insert_sig (b, qname); + + /* Provide the requested number of CNAME records. 
*/ + char *previous_name = (char *) qname; + for (int unique = 0; unique < cnames; ++unique) + { + resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60); + char *new_name = xasprintf ("%d.alias.example", unique); + resolv_response_add_name (b, new_name); + resolv_response_close_record (b); + + maybe_insert_sig (b, qname); + + if (previous_name != qname) + free (previous_name); + previous_name = new_name; + } + + for (int unique = 0; unique < addresses; ++unique) + { + resolv_response_open_record (b, previous_name, qclass, qtype, 60); + + if (qtype == T_A) + { + char ipv4[4] = {192, 0, 2, 1 + unique}; + resolv_response_add_data (b, &ipv4, sizeof (ipv4)); + } + else if (qtype == T_AAAA) + { + char ipv6[16] = + { + 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1 + unique + }; + resolv_response_add_data (b, &ipv6, sizeof (ipv6)); + } + resolv_response_close_record (b); + } + + if (previous_name != qname) + free (previous_name); +} + +static char * +make_qname (bool do_search, int cnames, int addresses) +{ + return xasprintf ("a%d-c%d%s", + addresses, cnames, do_search ? 
"" : ".example.net"); +} + +static void +check_cnames_failure (int af, bool do_search, int cnames, int addresses) +{ + char *qname = make_qname (do_search, cnames, addresses); + + struct hostent *e; + if (af == AF_UNSPEC) + e = gethostbyname (qname); + else + e = gethostbyname2 (qname, af); + + if (addresses == 0) + check_hostent (qname, e, "error: NO_RECOVERY\n"); + else + check_hostent (qname, e, "error: HOST_NOT_FOUND\n"); + + free (qname); +} + +static void +check (int af, bool do_search, int cnames, int addresses) +{ + char *qname = make_qname (do_search, cnames, addresses); + char *fqdn = make_qname (false, cnames, addresses); + + struct hostent *e; + if (af == AF_UNSPEC) + e = gethostbyname (qname); + else + e = gethostbyname2 (qname, af); + if (e == NULL) + FAIL_EXIT1 ("unexpected failure for %d, %d, %d", af, cnames, addresses); + + if (af == AF_UNSPEC || af == AF_INET) + { + TEST_COMPARE (e->h_addrtype, AF_INET); + TEST_COMPARE (e->h_length, 4); + } + else + { + TEST_COMPARE (e->h_addrtype, AF_INET6); + TEST_COMPARE (e->h_length, 16); + } + + for (int i = 0; i < addresses; ++i) + { + char ipv4[4] = {192, 0, 2, 1 + i}; + char ipv6[16] = + { 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 + i }; + char *expected = e->h_addrtype == AF_INET ? ipv4 : ipv6; + TEST_COMPARE_BLOB (e->h_addr_list[i], e->h_length, + expected, e->h_length); + } + TEST_VERIFY (e->h_addr_list[addresses] == NULL); + + + if (cnames == 0) + { + /* QNAME is fully qualified. */ + TEST_COMPARE_STRING (e->h_name, fqdn); + TEST_VERIFY (e->h_aliases[0] == NULL); + } + else + { + /* Fully-qualified QNAME is demoted to an aliases. 
*/ + TEST_COMPARE_STRING (e->h_aliases[0], fqdn); + + for (int i = 1; i <= cnames; ++i) + { + char *expected = xasprintf ("%d.alias.example", i - 1); + if (i == cnames) + TEST_COMPARE_STRING (e->h_name, expected); + else + TEST_COMPARE_STRING (e->h_aliases[i], expected); + free (expected); + } + TEST_VERIFY (e->h_aliases[cnames] == NULL); + } + + free (fqdn); + free (qname); +} + +static int +do_test (void) +{ + struct resolv_test *obj = resolv_test_start + ((struct resolv_redirect_config) + { + .response_callback = response, + .search = { "example.net", "example.com" }, + }); + + static const int families[] = { AF_UNSPEC, AF_INET, AF_INET6 }; + + for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig) + { + insert_sig = do_insert_sig; + + /* If do_search is true, a bare host name (for example, a1-c1) + is used. This exercises search path processing and FQDN + qualification. */ + for (int do_search = 0; do_search < 2; ++do_search) + for (const int *paf = families; paf != array_end (families); ++paf) + { + for (int cnames = 0; cnames <= 100; ++cnames) + { + check_cnames_failure (*paf, do_search, cnames, 0); + /* Now with NXDOMAIN responses. */ + check_cnames_failure (*paf, do_search, cnames, 255); + } + + for (int cnames = 0; cnames <= 10; ++cnames) + for (int addresses = 1; addresses <= 10; ++addresses) + check (*paf, do_search, cnames, addresses); + + /* The current implementation is limited to 47 aliases. + Addresses do not have such a limit. */ + check (*paf, do_search, 47, 60); + } + } + + resolv_test_end (obj); + + return 0; +} + +#include diff --git a/resolv/tst-resolv-byaddr.c b/resolv/tst-resolv-byaddr.c new file mode 100644 index 000000000..6299e8983 --- /dev/null +++ b/resolv/tst-resolv-byaddr.c @@ -0,0 +1,326 @@ +/* Test reverse DNS lookup. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tst-resolv-maybe_insert_sig.h" + +/* QNAME format: + + ADDRESSES.CNAMES...(lots of 0s)...8.b.d.0.1.0.0.2.ip6.arpa. + CNAMES|ADDRESSES.2.0.192.in-addr.arpa. + + For the IPv4 reverse lookup, the address count is in the lower + bits. + + CNAMES is the length of the CNAME chain, ADDRESSES is the number of + addresses in the response. The special value 15 means that there + are no addresses, and the RCODE is NXDOMAIN. 
*/ +static void +response (const struct resolv_response_context *ctx, + struct resolv_response_builder *b, + const char *qname, uint16_t qclass, uint16_t qtype) +{ + TEST_COMPARE (qclass, C_IN); + TEST_COMPARE (qtype, T_PTR); + + unsigned int addresses, cnames, bits; + char *tail; + if (strstr (qname, "ip6.arpa") != NULL + && sscanf (qname, "%x.%x.%ms", &addresses, &cnames, &tail) == 3) + TEST_COMPARE_STRING (tail, "\ +0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa"); + else if (sscanf (qname, "%u.%ms", &bits, &tail) == 2) + { + TEST_COMPARE_STRING (tail, "2.0.192.in-addr.arpa"); + addresses = bits & 0x0f; + cnames = bits >> 4; + } + else + FAIL_EXIT1 ("invalid QNAME: %s", qname); + free (tail); + + int rcode; + if (addresses == 15) + { + /* Special case: Use no addresses with NXDOMAIN response. */ + rcode = ns_r_nxdomain; + addresses = 0; + } + else + rcode = 0; + + struct resolv_response_flags flags = { .rcode = rcode }; + resolv_response_init (b, flags); + resolv_response_add_question (b, qname, qclass, qtype); + resolv_response_section (b, ns_s_an); + maybe_insert_sig (b, qname); + + /* Provide the requested number of CNAME records. 
*/ + char *previous_name = (char *) qname; + for (int unique = 0; unique < cnames; ++unique) + { + resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60); + char *new_name = xasprintf ("%d.alias.example", unique); + resolv_response_add_name (b, new_name); + resolv_response_close_record (b); + + maybe_insert_sig (b, qname); + + if (previous_name != qname) + free (previous_name); + previous_name = new_name; + } + + for (int unique = 0; unique < addresses; ++unique) + { + resolv_response_open_record (b, previous_name, qclass, T_PTR, 60); + char *ptr = xasprintf ("unique-%d.cnames-%u.addresses-%u.example", + unique, cnames, addresses); + resolv_response_add_name (b, ptr); + free (ptr); + resolv_response_close_record (b); + } + + if (previous_name != qname) + free (previous_name); +} + +/* Used to check that gethostbyaddr_r does not write past the buffer + end. */ +static struct support_next_to_fault ntf; + +/* Perform a gethostbyaddr call and check the result. */ +static void +check_gethostbyaddr (const char *address, const char *expected) +{ + unsigned char bytes[16]; + unsigned int byteslen; + int family; + if (strchr (address, ':') != NULL) + { + family = AF_INET6; + byteslen = 16; + } + else + { + family = AF_INET; + byteslen = 4; + } + TEST_COMPARE (inet_pton (family, address, bytes), 1); + + struct hostent *e = gethostbyaddr (bytes, byteslen, family); + check_hostent (address, e, expected); + + if (e == NULL) + return; + + /* Try gethostbyaddr_r with increasing sizes until success. First + compute a reasonable minimum buffer size, to avoid many pointless + attempts. */ + size_t minimum_size = strlen (e->h_name); + for (int i = 0; e->h_addr_list[i] != NULL; ++i) + minimum_size += e->h_length + sizeof (char *); + for (int i = 0; e->h_aliases[i] != NULL; ++i) + minimum_size += strlen (e->h_aliases[i]) + 1 + sizeof (char *); + + /* Gradually increase the size until success. 
*/ + for (size_t size = minimum_size; size < ntf.length; ++size) + { + struct hostent result; + int herrno; + int ret = gethostbyaddr_r (bytes, byteslen, family, &result, + ntf.buffer + ntf.length - size, size, + &e, &herrno); + if (ret == ERANGE) + /* Retry with larger size. */ + TEST_COMPARE (herrno, NETDB_INTERNAL); + else if (ret == 0) + { + TEST_VERIFY (size > minimum_size); + check_hostent (address, e, expected); + return; + } + else + FAIL_EXIT1 ("Unexpected gethostbyaddr_r failure: %d", ret); + } + + FAIL_EXIT1 ("gethostbyaddr_r always failed for: %s", address); +} + +/* Perform a getnameinfo call and check the result. */ +static void +check_getnameinfo (const char *address, const char *expected) +{ + struct sockaddr_in sin = { }; + struct sockaddr_in6 sin6 = { }; + void *sa; + socklen_t salen; + if (strchr (address, ':') != NULL) + { + sin6.sin6_family = AF_INET6; + TEST_COMPARE (inet_pton (AF_INET6, address, &sin6.sin6_addr), 1); + sin6.sin6_port = htons (80); + sa = &sin6; + salen = sizeof (sin6); + } + else + { + sin.sin_family = AF_INET; + TEST_COMPARE (inet_pton (AF_INET, address, &sin.sin_addr), 1); + sin.sin_port = htons (80); + sa = &sin; + salen = sizeof (sin); + } + + char host[64]; + char service[64]; + int ret = getnameinfo (sa, salen, host, + sizeof (host), service, sizeof (service), + NI_NAMEREQD | NI_NUMERICSERV); + switch (ret) + { + case 0: + TEST_COMPARE_STRING (host, expected); + TEST_COMPARE_STRING (service, "80"); + break; + case EAI_SYSTEM: + TEST_COMPARE_STRING (strerror (errno), expected); + break; + default: + TEST_COMPARE_STRING (gai_strerror (ret), expected); + } +} + +static int +do_test (void) +{ + /* Some reasonably upper bound for the maximum response size. 
*/ + ntf = support_next_to_fault_allocate (4096); + + struct resolv_test *obj = resolv_test_start + ((struct resolv_redirect_config) + { + .response_callback = response + }); + + for (int do_insert_sig = 0; do_insert_sig < 2; ++do_insert_sig) + { + insert_sig = do_insert_sig; + + /* No PTR record, RCODE=0. */ + check_gethostbyaddr ("192.0.2.0", "error: NO_RECOVERY\n"); + check_getnameinfo ("192.0.2.0", "Name or service not known"); + check_gethostbyaddr ("192.0.2.16", "error: NO_RECOVERY\n"); + check_getnameinfo ("192.0.2.16", "Name or service not known"); + check_gethostbyaddr ("192.0.2.32", "error: NO_RECOVERY\n"); + check_getnameinfo ("192.0.2.32", "Name or service not known"); + check_gethostbyaddr ("2001:db8::", "error: NO_RECOVERY\n"); + check_getnameinfo ("2001:db8::", "Name or service not known"); + check_gethostbyaddr ("2001:db8::10", "error: NO_RECOVERY\n"); + check_getnameinfo ("2001:db8::10", "Name or service not known"); + check_gethostbyaddr ("2001:db8::20", "error: NO_RECOVERY\n"); + check_getnameinfo ("2001:db8::20", "Name or service not known"); + + /* No PTR record, NXDOMAIN. */ + check_gethostbyaddr ("192.0.2.15", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("192.0.2.15", "Name or service not known"); + check_gethostbyaddr ("192.0.2.31", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("192.0.2.31", "Name or service not known"); + check_gethostbyaddr ("192.0.2.47", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("192.0.2.47", "Name or service not known"); + check_gethostbyaddr ("2001:db8::f", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("2001:db8::f", "Name or service not known"); + check_gethostbyaddr ("2001:db8::1f", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("2001:db8::1f", "Name or service not known"); + check_gethostbyaddr ("2001:db8::2f", "error: HOST_NOT_FOUND\n"); + check_getnameinfo ("2001:db8::2f", "Name or service not known"); + + /* Actual response data. Only the first PTR record is returned. 
*/ + check_gethostbyaddr ("192.0.2.1", + "name: unique-0.cnames-0.addresses-1.example\n" + "address: 192.0.2.1\n"); + check_getnameinfo ("192.0.2.1", + "unique-0.cnames-0.addresses-1.example"); + check_gethostbyaddr ("192.0.2.17", + "name: unique-0.cnames-1.addresses-1.example\n" + "address: 192.0.2.17\n"); + check_getnameinfo ("192.0.2.17", + "unique-0.cnames-1.addresses-1.example"); + check_gethostbyaddr ("192.0.2.18", + "name: unique-0.cnames-1.addresses-2.example\n" + "address: 192.0.2.18\n"); + check_getnameinfo ("192.0.2.18", + "unique-0.cnames-1.addresses-2.example"); + check_gethostbyaddr ("192.0.2.33", + "name: unique-0.cnames-2.addresses-1.example\n" + "address: 192.0.2.33\n"); + check_getnameinfo ("192.0.2.33", + "unique-0.cnames-2.addresses-1.example"); + check_gethostbyaddr ("192.0.2.34", + "name: unique-0.cnames-2.addresses-2.example\n" + "address: 192.0.2.34\n"); + check_getnameinfo ("192.0.2.34", + "unique-0.cnames-2.addresses-2.example"); + + /* Same for IPv6 addresses. 
*/ + check_gethostbyaddr ("2001:db8::1", + "name: unique-0.cnames-0.addresses-1.example\n" + "address: 2001:db8::1\n"); + check_getnameinfo ("2001:db8::1", + "unique-0.cnames-0.addresses-1.example"); + check_gethostbyaddr ("2001:db8::11", + "name: unique-0.cnames-1.addresses-1.example\n" + "address: 2001:db8::11\n"); + check_getnameinfo ("2001:db8::11", + "unique-0.cnames-1.addresses-1.example"); + check_gethostbyaddr ("2001:db8::12", + "name: unique-0.cnames-1.addresses-2.example\n" + "address: 2001:db8::12\n"); + check_getnameinfo ("2001:db8::12", + "unique-0.cnames-1.addresses-2.example"); + check_gethostbyaddr ("2001:db8::21", + "name: unique-0.cnames-2.addresses-1.example\n" + "address: 2001:db8::21\n"); + check_getnameinfo ("2001:db8::21", + "unique-0.cnames-2.addresses-1.example"); + check_gethostbyaddr ("2001:db8::22", + "name: unique-0.cnames-2.addresses-2.example\n" + "address: 2001:db8::22\n"); + check_getnameinfo ("2001:db8::22", + "unique-0.cnames-2.addresses-2.example"); + } + + resolv_test_end (obj); + + support_next_to_fault_free (&ntf); + return 0; +} + +#include diff --git a/resolv/tst-resolv-invalid-cname.c b/resolv/tst-resolv-invalid-cname.c new file mode 100644 index 000000000..63dac90e0 --- /dev/null +++ b/resolv/tst-resolv-invalid-cname.c @@ -0,0 +1,406 @@ +/* Test handling of CNAMEs with non-host domain names (bug 12154). + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Query strings describe the CNAME chain in the response. They have + the format "bitsBITS.countCOUNT.example.", where BITS and COUNT are + replaced by unsigned decimal numbers. COUNT is the number of CNAME + records in the response. BITS has two bits for each CNAME record, + describing a special prefix that is added to that CNAME. + + 0: No special leading label. + 1: Starting with "*.". + 2: Starting with "-x.". + 3: Starting with "star.*.". + + The first CNAME in the response uses the two least significant + bits. + + For PTR queries, the QNAME format is different, it is either + BITS.COUNT.168.192.in-addr.arpa. (with BITS and COUNT still + decimal), or: + +BITS0.BITS1.COUNT.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa. + + where BITS and COUNT are hexadecimal. */ + +static void +response (const struct resolv_response_context *ctx, + struct resolv_response_builder *b, + const char *qname, uint16_t qclass, uint16_t qtype) +{ + TEST_COMPARE (qclass, C_IN); + + /* The only other query type besides A and AAAA is PTR. 
*/ + if (qtype != T_A && qtype != T_AAAA) + TEST_COMPARE (qtype, T_PTR); + + unsigned int bits, bits1, count; + char *tail = NULL; + if (sscanf (qname, "bits%u.count%u.%ms", &bits, &count, &tail) == 3) + TEST_COMPARE_STRING (tail, "example"); + else if (strstr (qname, "in-addr.arpa") != NULL + && sscanf (qname, "%u.%u.%ms", &bits, &count, &tail) == 3) + TEST_COMPARE_STRING (tail, "168.192.in-addr.arpa"); + else if (sscanf (qname, "%x.%x.%x.%ms", &bits, &bits1, &count, &tail) == 4) + { + TEST_COMPARE_STRING (tail, "\ +0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa"); + bits |= bits1 << 4; + } + else + FAIL_EXIT1 ("invalid QNAME: %s\n", qname); + free (tail); + + struct resolv_response_flags flags = {}; + resolv_response_init (b, flags); + resolv_response_add_question (b, qname, qclass, qtype); + resolv_response_section (b, ns_s_an); + + /* Provide the requested number of CNAME records. */ + char *previous_name = (char *) qname; + unsigned int original_bits = bits; + for (int unique = 0; unique < count; ++unique) + { + resolv_response_open_record (b, previous_name, qclass, T_CNAME, 60); + + static const char bits_to_prefix[4][8] = { "", "*.", "-x.", "star.*." }; + char *new_name = xasprintf ("%sunique%d.example", + bits_to_prefix[bits & 3], unique); + bits >>= 2; + resolv_response_add_name (b, new_name); + resolv_response_close_record (b); + + if (previous_name != qname) + free (previous_name); + previous_name = new_name; + } + + /* Actual answer record. 
*/ + resolv_response_open_record (b, previous_name, qclass, qtype, 60); + switch (qtype) + { + case T_A: + { + char ipv4[4] = {192, 168, count, original_bits}; + resolv_response_add_data (b, &ipv4, sizeof (ipv4)); + } + break; + case T_AAAA: + { + char ipv6[16] = + { + 0x20, 0x01, 0xd, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + count, original_bits + }; + resolv_response_add_data (b, &ipv6, sizeof (ipv6)); + } + break; + + case T_PTR: + { + char *name = xasprintf ("bits%u.count%u.example", + original_bits, count); + resolv_response_add_name (b, name); + free (name); + } + break; + } + resolv_response_close_record (b); + + if (previous_name != qname) + free (previous_name); +} + +/* Controls which name resolution function is invoked. */ +enum test_mode + { + byname, /* gethostbyname. */ + byname2, /* gethostbyname2. */ + gai, /* getaddrinfo without AI_CANONNAME. */ + gai_canon, /* getaddrinfo with AI_CANONNAME. */ + + test_mode_num /* Number of enum values. */ + }; + +static const char * +test_mode_to_string (enum test_mode mode) +{ + switch (mode) + { + case byname: + return "byname"; + case byname2: + return "byname2"; + case gai: + return "gai"; + case gai_canon: + return "gai_canon"; + case test_mode_num: + break; /* Report error below. */ + } + FAIL_EXIT1 ("invalid test_mode: %d", mode); +} + +/* Append the name and aliases to OUT. */ +static void +append_names (FILE *out, const char *qname, int bits, int count, + enum test_mode mode) +{ + /* Largest valid index which has a corresponding zero in bits + (meaning a syntactically valid CNAME). 
*/ + int last_valid_cname = -1; + + for (int i = 0; i < count; ++i) + if ((bits & (3 << (i * 2))) == 0) + last_valid_cname = i; + + if (mode != gai) + { + const char *label; + if (mode == gai_canon) + label = "canonname"; + else + label = "name"; + if (last_valid_cname >= 0) + fprintf (out, "%s: unique%d.example\n", label, last_valid_cname); + else + fprintf (out, "%s: %s\n", label, qname); + } + + if (mode == byname || mode == byname2) + { + if (last_valid_cname >= 0) + fprintf (out, "alias: %s\n", qname); + for (int i = 0; i < count; ++i) + { + if ((bits & (3 << (i * 2))) == 0 && i != last_valid_cname) + fprintf (out, "alias: unique%d.example\n", i); + } + } +} + +/* Append the address information to OUT. */ +static void +append_addresses (FILE *out, int af, int bits, int count, enum test_mode mode) +{ + int last = count * 256 + bits; + if (mode == gai || mode == gai_canon) + { + if (af == AF_INET || af == AF_UNSPEC) + fprintf (out, "address: STREAM/TCP 192.168.%d.%d 80\n", count, bits); + if (af == AF_INET6 || af == AF_UNSPEC) + { + if (last == 0) + fprintf (out, "address: STREAM/TCP 2001:db8:: 80\n"); + else + fprintf (out, "address: STREAM/TCP 2001:db8::%x 80\n", last); + } + } + else + { + TEST_VERIFY (af != AF_UNSPEC); + if (af == AF_INET) + fprintf (out, "address: 192.168.%d.%d\n", count, bits); + if (af == AF_INET6) + { + if (last == 0) + fprintf (out, "address: 2001:db8::\n"); + else + fprintf (out, "address: 2001:db8::%x\n", last); + } + } +} + +/* Perform one test using a forward lookup. 
*/ +static void +check_forward (int af, int bits, int count, enum test_mode mode) +{ + char *qname = xasprintf ("bits%d.count%d.example", bits, count); + char *label = xasprintf ("af=%d bits=%d count=%d mode=%s qname=%s", + af, bits, count, test_mode_to_string (mode), qname); + + struct xmemstream expected; + xopen_memstream (&expected); + if (mode == gai_canon) + fprintf (expected.out, "flags: AI_CANONNAME\n"); + append_names (expected.out, qname, bits, count, mode); + append_addresses (expected.out, af, bits, count, mode); + xfclose_memstream (&expected); + + if (mode == gai || mode == gai_canon) + { + struct addrinfo *ai; + struct addrinfo hints = + { + .ai_family = af, + .ai_socktype = SOCK_STREAM, + }; + if (mode == gai_canon) + hints.ai_flags |= AI_CANONNAME; + int ret = getaddrinfo (qname, "80", &hints, &ai); + check_addrinfo (label, ai, ret, expected.buffer); + if (ret == 0) + freeaddrinfo (ai); + } + else + { + struct hostent *e; + if (mode == byname) /* The gai modes are handled above; only byname and byname2 reach here. */ + { + TEST_COMPARE (af, AF_INET); + e = gethostbyname (qname); + } + else + { + if (af != AF_INET) + TEST_COMPARE (af, AF_INET6); + e = gethostbyname2 (qname, af); + } + check_hostent (label, e, expected.buffer); + } + + free (expected.buffer); + free (label); + free (qname); +} + +/* Perform one check using a reverse lookup. 
*/ + +static void +check_reverse (int af, int bits, int count) +{ + TEST_VERIFY (af == AF_INET || af == AF_INET6); + + char *label = xasprintf ("af=%d bits=%d count=%d", af, bits, count); + char *fqdn = xasprintf ("bits%d.count%d.example", bits, count); + + struct xmemstream expected; + xopen_memstream (&expected); + fprintf (expected.out, "name: %s\n", fqdn); + append_addresses (expected.out, af, bits, count, byname); + xfclose_memstream (&expected); + + char addr[16] = { 0 }; + socklen_t addrlen; + if (af == AF_INET) + { + addr[0] = 192; + addr[1] = 168; + addr[2] = count; + addr[3] = bits; + addrlen = 4; + } + else + { + addr[0] = 0x20; + addr[1] = 0x01; + addr[2] = 0x0d; + addr[3] = 0xb8; + addr[14] = count; + addr[15] = bits; + addrlen = 16; + } + + struct hostent *e = gethostbyaddr (addr, addrlen, af); + check_hostent (label, e, expected.buffer); + + /* getnameinfo check is different. There is no generic check_* + function for it. */ + { + struct sockaddr_in sin = { }; + struct sockaddr_in6 sin6 = { }; + void *sa; + socklen_t salen; + if (af == AF_INET) + { + sin.sin_family = AF_INET; + memcpy (&sin.sin_addr, addr, addrlen); + sin.sin_port = htons (80); + sa = &sin; + salen = sizeof (sin); + } + else + { + sin6.sin6_family = AF_INET6; + memcpy (&sin6.sin6_addr, addr, addrlen); + sin6.sin6_port = htons (80); + sa = &sin6; + salen = sizeof (sin6); + } + + char host[64]; + char service[64]; + int ret = getnameinfo (sa, salen, host, + sizeof (host), service, sizeof (service), + NI_NAMEREQD | NI_NUMERICSERV); + TEST_COMPARE (ret, 0); + TEST_COMPARE_STRING (host, fqdn); + TEST_COMPARE_STRING (service, "80"); + } + + free (expected.buffer); + free (fqdn); + free (label); +} + +static int +do_test (void) +{ + struct resolv_test *obj = resolv_test_start + ((struct resolv_redirect_config) + { + .response_callback = response + }); + + for (int count = 0; count <= 3; ++count) + for (int bits = 0; bits <= 1 << (count * 2); ++bits) + { + if (count > 0 && bits == count) + 
/* The last bits value is only checked if count == 0. */ + continue; + + for (enum test_mode mode = 0; mode < test_mode_num; ++mode) + { + check_forward (AF_INET, bits, count, mode); + if (mode != byname) + check_forward (AF_INET6, bits, count, mode); + if (mode == gai || mode == gai_canon) + check_forward (AF_UNSPEC, bits, count, mode); + } + + check_reverse (AF_INET, bits, count); + check_reverse (AF_INET6, bits, count); + } + + resolv_test_end (obj); + + return 0; +} + +#include diff --git a/resolv/tst-resolv-maybe_insert_sig.h b/resolv/tst-resolv-maybe_insert_sig.h new file mode 100644 index 000000000..05725225a --- /dev/null +++ b/resolv/tst-resolv-maybe_insert_sig.h @@ -0,0 +1,32 @@ +/* Code snippet for optionally inserting ignored SIG records in resolver tests. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Set to true for an alternative pass that inserts (ignored) SIG + records. This does not alter the response, so this property is not + encoded in the QNAME. The variable needs to be volatile because + leaf attributes tell GCC that the response function is not + called. 
*/ +static volatile bool insert_sig; + +static void +maybe_insert_sig (struct resolv_response_builder *b, const char *owner) +{ /* Appends one T_SIG record for OWNER whose data is a single zero byte. NOTE(review): insert_sig is not consulted here, so the record is added on every call; confirm whether a guard was intended, and whether the callers' expected results depend on the current behavior, before changing it. */ + resolv_response_open_record (b, owner, C_IN, T_SIG, 60); + resolv_response_add_data (b, "", 1); + resolv_response_close_record (b); +} diff --git a/scripts/dso-ordering-test.py b/scripts/dso-ordering-test.py index 0b526aff4..b479ee391 100644 --- a/scripts/dso-ordering-test.py +++ b/scripts/dso-ordering-test.py @@ -707,13 +707,12 @@ def process_testcase(t): "\t$(compile.c) $(OUTPUT_OPTION)\n") makefile.write (rule) - not_depended_objs = find_objs_not_depended_on(test_descr) - if not_depended_objs: - depstr = "" - for dep in not_depended_objs: - depstr += (" $(objpfx)" + test_subdir + "/" - + test_name + "-" + dep + ".so") - makefile.write("$(objpfx)%s.out:%s\n" % (base_test_name, depstr)) + # Ensure that all shared objects are built before running the + # test, whether there link-time dependencies or not. + depobjs = ["$(objpfx){}/{}-{}.so".format(test_subdir, test_name, dep) + for dep in test_descr.objs] + makefile.write("$(objpfx){}.out: {}\n".format( + base_test_name, " ".join(depobjs))) # Add main executable to test-srcs makefile.write("test-srcs += %s/%s\n" % (test_subdir, test_name)) diff --git a/scripts/glibcelf.py b/scripts/glibcelf.py new file mode 100644 index 000000000..da0d5380f --- /dev/null +++ b/scripts/glibcelf.py @@ -0,0 +1,1141 @@ +#!/usr/bin/python3 +# ELF support functionality for Python. +# Copyright (C) 2022 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +"""Basic ELF parser. + +Use Image.readfile(path) to read an ELF file into memory and begin +parsing it. + +""" + +import collections +import enum +import struct + +if not hasattr(enum, 'IntFlag'): + import sys + sys.stdout.write( + 'warning: glibcelf.py needs Python 3.6 for enum support\n') + sys.exit(77) + +class _OpenIntEnum(enum.IntEnum): + """Integer enumeration that supports arbitrary int values.""" + @classmethod + def _missing_(cls, value): + # See enum.IntFlag._create_pseudo_member_. This allows + # creating of enum constants with arbitrary integer values. + pseudo_member = int.__new__(cls, value) + pseudo_member._name_ = None + pseudo_member._value_ = value + return pseudo_member + + def __repr__(self): + name = self._name_ + if name is not None: + # The names have prefixes like SHT_, implying their type. + return name + return '{}({})'.format(self.__class__.__name__, self._value_) + + def __str__(self): + name = self._name_ + if name is not None: + return name + return str(self._value_) + +class ElfClass(_OpenIntEnum): + """ELF word size. Type of EI_CLASS values.""" + ELFCLASSNONE = 0 + ELFCLASS32 = 1 + ELFCLASS64 = 2 + +class ElfData(_OpenIntEnum): + """ELF endianess. Type of EI_DATA values.""" + ELFDATANONE = 0 + ELFDATA2LSB = 1 + ELFDATA2MSB = 2 + +class Machine(_OpenIntEnum): + """ELF machine type. 
Type of values in Ehdr.e_machine field.""" + EM_NONE = 0 + EM_M32 = 1 + EM_SPARC = 2 + EM_386 = 3 + EM_68K = 4 + EM_88K = 5 + EM_IAMCU = 6 + EM_860 = 7 + EM_MIPS = 8 + EM_S370 = 9 + EM_MIPS_RS3_LE = 10 + EM_PARISC = 15 + EM_VPP500 = 17 + EM_SPARC32PLUS = 18 + EM_960 = 19 + EM_PPC = 20 + EM_PPC64 = 21 + EM_S390 = 22 + EM_SPU = 23 + EM_V800 = 36 + EM_FR20 = 37 + EM_RH32 = 38 + EM_RCE = 39 + EM_ARM = 40 + EM_FAKE_ALPHA = 41 + EM_SH = 42 + EM_SPARCV9 = 43 + EM_TRICORE = 44 + EM_ARC = 45 + EM_H8_300 = 46 + EM_H8_300H = 47 + EM_H8S = 48 + EM_H8_500 = 49 + EM_IA_64 = 50 + EM_MIPS_X = 51 + EM_COLDFIRE = 52 + EM_68HC12 = 53 + EM_MMA = 54 + EM_PCP = 55 + EM_NCPU = 56 + EM_NDR1 = 57 + EM_STARCORE = 58 + EM_ME16 = 59 + EM_ST100 = 60 + EM_TINYJ = 61 + EM_X86_64 = 62 + EM_PDSP = 63 + EM_PDP10 = 64 + EM_PDP11 = 65 + EM_FX66 = 66 + EM_ST9PLUS = 67 + EM_ST7 = 68 + EM_68HC16 = 69 + EM_68HC11 = 70 + EM_68HC08 = 71 + EM_68HC05 = 72 + EM_SVX = 73 + EM_ST19 = 74 + EM_VAX = 75 + EM_CRIS = 76 + EM_JAVELIN = 77 + EM_FIREPATH = 78 + EM_ZSP = 79 + EM_MMIX = 80 + EM_HUANY = 81 + EM_PRISM = 82 + EM_AVR = 83 + EM_FR30 = 84 + EM_D10V = 85 + EM_D30V = 86 + EM_V850 = 87 + EM_M32R = 88 + EM_MN10300 = 89 + EM_MN10200 = 90 + EM_PJ = 91 + EM_OPENRISC = 92 + EM_ARC_COMPACT = 93 + EM_XTENSA = 94 + EM_VIDEOCORE = 95 + EM_TMM_GPP = 96 + EM_NS32K = 97 + EM_TPC = 98 + EM_SNP1K = 99 + EM_ST200 = 100 + EM_IP2K = 101 + EM_MAX = 102 + EM_CR = 103 + EM_F2MC16 = 104 + EM_MSP430 = 105 + EM_BLACKFIN = 106 + EM_SE_C33 = 107 + EM_SEP = 108 + EM_ARCA = 109 + EM_UNICORE = 110 + EM_EXCESS = 111 + EM_DXP = 112 + EM_ALTERA_NIOS2 = 113 + EM_CRX = 114 + EM_XGATE = 115 + EM_C166 = 116 + EM_M16C = 117 + EM_DSPIC30F = 118 + EM_CE = 119 + EM_M32C = 120 + EM_TSK3000 = 131 + EM_RS08 = 132 + EM_SHARC = 133 + EM_ECOG2 = 134 + EM_SCORE7 = 135 + EM_DSP24 = 136 + EM_VIDEOCORE3 = 137 + EM_LATTICEMICO32 = 138 + EM_SE_C17 = 139 + EM_TI_C6000 = 140 + EM_TI_C2000 = 141 + EM_TI_C5500 = 142 + EM_TI_ARP32 = 143 + EM_TI_PRU = 144 + 
EM_MMDSP_PLUS = 160 + EM_CYPRESS_M8C = 161 + EM_R32C = 162 + EM_TRIMEDIA = 163 + EM_QDSP6 = 164 + EM_8051 = 165 + EM_STXP7X = 166 + EM_NDS32 = 167 + EM_ECOG1X = 168 + EM_MAXQ30 = 169 + EM_XIMO16 = 170 + EM_MANIK = 171 + EM_CRAYNV2 = 172 + EM_RX = 173 + EM_METAG = 174 + EM_MCST_ELBRUS = 175 + EM_ECOG16 = 176 + EM_CR16 = 177 + EM_ETPU = 178 + EM_SLE9X = 179 + EM_L10M = 180 + EM_K10M = 181 + EM_AARCH64 = 183 + EM_AVR32 = 185 + EM_STM8 = 186 + EM_TILE64 = 187 + EM_TILEPRO = 188 + EM_MICROBLAZE = 189 + EM_CUDA = 190 + EM_TILEGX = 191 + EM_CLOUDSHIELD = 192 + EM_COREA_1ST = 193 + EM_COREA_2ND = 194 + EM_ARCV2 = 195 + EM_OPEN8 = 196 + EM_RL78 = 197 + EM_VIDEOCORE5 = 198 + EM_78KOR = 199 + EM_56800EX = 200 + EM_BA1 = 201 + EM_BA2 = 202 + EM_XCORE = 203 + EM_MCHP_PIC = 204 + EM_INTELGT = 205 + EM_KM32 = 210 + EM_KMX32 = 211 + EM_EMX16 = 212 + EM_EMX8 = 213 + EM_KVARC = 214 + EM_CDP = 215 + EM_COGE = 216 + EM_COOL = 217 + EM_NORC = 218 + EM_CSR_KALIMBA = 219 + EM_Z80 = 220 + EM_VISIUM = 221 + EM_FT32 = 222 + EM_MOXIE = 223 + EM_AMDGPU = 224 + EM_RISCV = 243 + EM_BPF = 247 + EM_CSKY = 252 + EM_NUM = 253 + EM_ALPHA = 0x9026 + +class Et(_OpenIntEnum): + """ELF file type. Type of ET_* values and the Ehdr.e_type field.""" + ET_NONE = 0 + ET_REL = 1 + ET_EXEC = 2 + ET_DYN = 3 + ET_CORE = 4 + +class Shn(_OpenIntEnum): + """ELF reserved section indices.""" + SHN_UNDEF = 0 + SHN_BEFORE = 0xff00 + SHN_AFTER = 0xff01 + SHN_ABS = 0xfff1 + SHN_COMMON = 0xfff2 + SHN_XINDEX = 0xffff + +class ShnMIPS(enum.Enum): + """Supplemental SHN_* constants for EM_MIPS.""" + SHN_MIPS_ACOMMON = 0xff00 + SHN_MIPS_TEXT = 0xff01 + SHN_MIPS_DATA = 0xff02 + SHN_MIPS_SCOMMON = 0xff03 + SHN_MIPS_SUNDEFINED = 0xff04 + +class ShnPARISC(enum.Enum): + """Supplemental SHN_* constants for EM_PARISC.""" + SHN_PARISC_ANSI_COMMON = 0xff00 + SHN_PARISC_HUGE_COMMON = 0xff01 + +class Sht(_OpenIntEnum): + """ELF section types. 
Type of SHT_* values.""" + SHT_NULL = 0 + SHT_PROGBITS = 1 + SHT_SYMTAB = 2 + SHT_STRTAB = 3 + SHT_RELA = 4 + SHT_HASH = 5 + SHT_DYNAMIC = 6 + SHT_NOTE = 7 + SHT_NOBITS = 8 + SHT_REL = 9 + SHT_SHLIB = 10 + SHT_DYNSYM = 11 + SHT_INIT_ARRAY = 14 + SHT_FINI_ARRAY = 15 + SHT_PREINIT_ARRAY = 16 + SHT_GROUP = 17 + SHT_SYMTAB_SHNDX = 18 + SHT_GNU_ATTRIBUTES = 0x6ffffff5 + SHT_GNU_HASH = 0x6ffffff6 + SHT_GNU_LIBLIST = 0x6ffffff7 + SHT_CHECKSUM = 0x6ffffff8 + SHT_SUNW_move = 0x6ffffffa + SHT_SUNW_COMDAT = 0x6ffffffb + SHT_SUNW_syminfo = 0x6ffffffc + SHT_GNU_verdef = 0x6ffffffd + SHT_GNU_verneed = 0x6ffffffe + SHT_GNU_versym = 0x6fffffff + +class ShtALPHA(enum.Enum): + """Supplemental SHT_* constants for EM_ALPHA.""" + SHT_ALPHA_DEBUG = 0x70000001 + SHT_ALPHA_REGINFO = 0x70000002 + +class ShtARM(enum.Enum): + """Supplemental SHT_* constants for EM_ARM.""" + SHT_ARM_EXIDX = 0x70000001 + SHT_ARM_PREEMPTMAP = 0x70000002 + SHT_ARM_ATTRIBUTES = 0x70000003 + +class ShtCSKY(enum.Enum): + """Supplemental SHT_* constants for EM_CSKY.""" + SHT_CSKY_ATTRIBUTES = 0x70000001 + +class ShtIA_64(enum.Enum): + """Supplemental SHT_* constants for EM_IA_64.""" + SHT_IA_64_EXT = 0x70000000 + SHT_IA_64_UNWIND = 0x70000001 + +class ShtMIPS(enum.Enum): + """Supplemental SHT_* constants for EM_MIPS.""" + SHT_MIPS_LIBLIST = 0x70000000 + SHT_MIPS_MSYM = 0x70000001 + SHT_MIPS_CONFLICT = 0x70000002 + SHT_MIPS_GPTAB = 0x70000003 + SHT_MIPS_UCODE = 0x70000004 + SHT_MIPS_DEBUG = 0x70000005 + SHT_MIPS_REGINFO = 0x70000006 + SHT_MIPS_PACKAGE = 0x70000007 + SHT_MIPS_PACKSYM = 0x70000008 + SHT_MIPS_RELD = 0x70000009 + SHT_MIPS_IFACE = 0x7000000b + SHT_MIPS_CONTENT = 0x7000000c + SHT_MIPS_OPTIONS = 0x7000000d + SHT_MIPS_SHDR = 0x70000010 + SHT_MIPS_FDESC = 0x70000011 + SHT_MIPS_EXTSYM = 0x70000012 + SHT_MIPS_DENSE = 0x70000013 + SHT_MIPS_PDESC = 0x70000014 + SHT_MIPS_LOCSYM = 0x70000015 + SHT_MIPS_AUXSYM = 0x70000016 + SHT_MIPS_OPTSYM = 0x70000017 + SHT_MIPS_LOCSTR = 0x70000018 + SHT_MIPS_LINE = 0x70000019 + 
SHT_MIPS_RFDESC = 0x7000001a + SHT_MIPS_DELTASYM = 0x7000001b + SHT_MIPS_DELTAINST = 0x7000001c + SHT_MIPS_DELTACLASS = 0x7000001d + SHT_MIPS_DWARF = 0x7000001e + SHT_MIPS_DELTADECL = 0x7000001f + SHT_MIPS_SYMBOL_LIB = 0x70000020 + SHT_MIPS_EVENTS = 0x70000021 + SHT_MIPS_TRANSLATE = 0x70000022 + SHT_MIPS_PIXIE = 0x70000023 + SHT_MIPS_XLATE = 0x70000024 + SHT_MIPS_XLATE_DEBUG = 0x70000025 + SHT_MIPS_WHIRL = 0x70000026 + SHT_MIPS_EH_REGION = 0x70000027 + SHT_MIPS_XLATE_OLD = 0x70000028 + SHT_MIPS_PDR_EXCEPTION = 0x70000029 + SHT_MIPS_XHASH = 0x7000002b + +class ShtPARISC(enum.Enum): + """Supplemental SHT_* constants for EM_PARISC.""" + SHT_PARISC_EXT = 0x70000000 + SHT_PARISC_UNWIND = 0x70000001 + SHT_PARISC_DOC = 0x70000002 + +class Pf(enum.IntFlag): + """Program header flags. Type of Phdr.p_flags values.""" + PF_X = 1 + PF_W = 2 + PF_R = 4 + +class PfARM(enum.IntFlag): + """Supplemental PF_* flags for EM_ARM.""" + PF_ARM_SB = 0x10000000 + PF_ARM_PI = 0x20000000 + PF_ARM_ABS = 0x40000000 + +class PfPARISC(enum.IntFlag): + """Supplemental PF_* flags for EM_PARISC.""" + PF_HP_PAGE_SIZE = 0x00100000 + PF_HP_FAR_SHARED = 0x00200000 + PF_HP_NEAR_SHARED = 0x00400000 + PF_HP_CODE = 0x01000000 + PF_HP_MODIFY = 0x02000000 + PF_HP_LAZYSWAP = 0x04000000 + PF_HP_SBP = 0x08000000 + +class PfIA_64(enum.IntFlag): + """Supplemental PF_* flags for EM_IA_64.""" + PF_IA_64_NORECOV = 0x80000000 + +class PfMIPS(enum.IntFlag): + """Supplemental PF_* flags for EM_MIPS.""" + PF_MIPS_LOCAL = 0x10000000 + +class Shf(enum.IntFlag): + """Section flags. 
Type of Shdr.sh_type values.""" + SHF_WRITE = 1 << 0 + SHF_ALLOC = 1 << 1 + SHF_EXECINSTR = 1 << 2 + SHF_MERGE = 1 << 4 + SHF_STRINGS = 1 << 5 + SHF_INFO_LINK = 1 << 6 + SHF_LINK_ORDER = 1 << 7 + SHF_OS_NONCONFORMING = 256 + SHF_GROUP = 1 << 9 + SHF_TLS = 1 << 10 + SHF_COMPRESSED = 1 << 11 + SHF_GNU_RETAIN = 1 << 21 + SHF_ORDERED = 1 << 30 + SHF_EXCLUDE = 1 << 31 + +class ShfALPHA(enum.IntFlag): + """Supplemental SHF_* constants for EM_ALPHA.""" + SHF_ALPHA_GPREL = 0x10000000 + +class ShfARM(enum.IntFlag): + """Supplemental SHF_* constants for EM_ARM.""" + SHF_ARM_ENTRYSECT = 0x10000000 + SHF_ARM_COMDEF = 0x80000000 + +class ShfIA_64(enum.IntFlag): + """Supplemental SHF_* constants for EM_IA_64.""" + SHF_IA_64_SHORT = 0x10000000 + SHF_IA_64_NORECOV = 0x20000000 + +class ShfMIPS(enum.IntFlag): + """Supplemental SHF_* constants for EM_MIPS.""" + SHF_MIPS_GPREL = 0x10000000 + SHF_MIPS_MERGE = 0x20000000 + SHF_MIPS_ADDR = 0x40000000 + SHF_MIPS_STRINGS = 0x80000000 + SHF_MIPS_NOSTRIP = 0x08000000 + SHF_MIPS_LOCAL = 0x04000000 + SHF_MIPS_NAMES = 0x02000000 + SHF_MIPS_NODUPE = 0x01000000 + +class ShfPARISC(enum.IntFlag): + """Supplemental SHF_* constants for EM_PARISC.""" + SHF_PARISC_SHORT = 0x20000000 + SHF_PARISC_HUGE = 0x40000000 + SHF_PARISC_SBP = 0x80000000 + +class Stb(_OpenIntEnum): + """ELF symbol binding type.""" + STB_LOCAL = 0 + STB_GLOBAL = 1 + STB_WEAK = 2 + STB_GNU_UNIQUE = 10 + STB_MIPS_SPLIT_COMMON = 13 + +class Stt(_OpenIntEnum): + """ELF symbol type.""" + STT_NOTYPE = 0 + STT_OBJECT = 1 + STT_FUNC = 2 + STT_SECTION = 3 + STT_FILE = 4 + STT_COMMON = 5 + STT_TLS = 6 + STT_GNU_IFUNC = 10 + +class SttARM(enum.Enum): + """Supplemental STT_* constants for EM_ARM.""" + STT_ARM_TFUNC = 13 + STT_ARM_16BIT = 15 + +class SttPARISC(enum.Enum): + """Supplemental STT_* constants for EM_PARISC.""" + STT_HP_OPAQUE = 11 + STT_HP_STUB = 12 + STT_PARISC_MILLICODE = 13 + +class SttSPARC(enum.Enum): + """Supplemental STT_* constants for EM_SPARC.""" + STT_SPARC_REGISTER = 
13 + +class SttX86_64(enum.Enum): + """Supplemental STT_* constants for EM_X86_64.""" + SHT_X86_64_UNWIND = 0x70000001 + +class Pt(_OpenIntEnum): + """ELF program header types. Type of Phdr.p_type.""" + PT_NULL = 0 + PT_LOAD = 1 + PT_DYNAMIC = 2 + PT_INTERP = 3 + PT_NOTE = 4 + PT_SHLIB = 5 + PT_PHDR = 6 + PT_TLS = 7 + PT_NUM = 8 + PT_GNU_EH_FRAME = 0x6474e550 + PT_GNU_STACK = 0x6474e551 + PT_GNU_RELRO = 0x6474e552 + PT_GNU_PROPERTY = 0x6474e553 + PT_SUNWBSS = 0x6ffffffa + PT_SUNWSTACK = 0x6ffffffb + +class PtARM(enum.Enum): + """Supplemental PT_* constants for EM_ARM.""" + PT_ARM_EXIDX = 0x70000001 + +class PtIA_64(enum.Enum): + """Supplemental PT_* constants for EM_IA_64.""" + PT_IA_64_HP_OPT_ANOT = 0x60000012 + PT_IA_64_HP_HSL_ANOT = 0x60000013 + PT_IA_64_HP_STACK = 0x60000014 + PT_IA_64_ARCHEXT = 0x70000000 + PT_IA_64_UNWIND = 0x70000001 + +class PtMIPS(enum.Enum): + """Supplemental PT_* constants for EM_MIPS.""" + PT_MIPS_REGINFO = 0x70000000 + PT_MIPS_RTPROC = 0x70000001 + PT_MIPS_OPTIONS = 0x70000002 + PT_MIPS_ABIFLAGS = 0x70000003 + +class PtPARISC(enum.Enum): + """Supplemental PT_* constants for EM_PARISC.""" + PT_HP_TLS = 0x60000000 + PT_HP_CORE_NONE = 0x60000001 + PT_HP_CORE_VERSION = 0x60000002 + PT_HP_CORE_KERNEL = 0x60000003 + PT_HP_CORE_COMM = 0x60000004 + PT_HP_CORE_PROC = 0x60000005 + PT_HP_CORE_LOADABLE = 0x60000006 + PT_HP_CORE_STACK = 0x60000007 + PT_HP_CORE_SHM = 0x60000008 + PT_HP_CORE_MMF = 0x60000009 + PT_HP_PARALLEL = 0x60000010 + PT_HP_FASTBIND = 0x60000011 + PT_HP_OPT_ANNOT = 0x60000012 + PT_HP_HSL_ANNOT = 0x60000013 + PT_HP_STACK = 0x60000014 + PT_PARISC_ARCHEXT = 0x70000000 + PT_PARISC_UNWIND = 0x70000001 + +class Dt(_OpenIntEnum): + """ELF dynamic segment tags. 
Type of Dyn.d_val.""" + DT_NULL = 0 + DT_NEEDED = 1 + DT_PLTRELSZ = 2 + DT_PLTGOT = 3 + DT_HASH = 4 + DT_STRTAB = 5 + DT_SYMTAB = 6 + DT_RELA = 7 + DT_RELASZ = 8 + DT_RELAENT = 9 + DT_STRSZ = 10 + DT_SYMENT = 11 + DT_INIT = 12 + DT_FINI = 13 + DT_SONAME = 14 + DT_RPATH = 15 + DT_SYMBOLIC = 16 + DT_REL = 17 + DT_RELSZ = 18 + DT_RELENT = 19 + DT_PLTREL = 20 + DT_DEBUG = 21 + DT_TEXTREL = 22 + DT_JMPREL = 23 + DT_BIND_NOW = 24 + DT_INIT_ARRAY = 25 + DT_FINI_ARRAY = 26 + DT_INIT_ARRAYSZ = 27 + DT_FINI_ARRAYSZ = 28 + DT_RUNPATH = 29 + DT_FLAGS = 30 + DT_PREINIT_ARRAY = 32 + DT_PREINIT_ARRAYSZ = 33 + DT_SYMTAB_SHNDX = 34 + DT_GNU_PRELINKED = 0x6ffffdf5 + DT_GNU_CONFLICTSZ = 0x6ffffdf6 + DT_GNU_LIBLISTSZ = 0x6ffffdf7 + DT_CHECKSUM = 0x6ffffdf8 + DT_PLTPADSZ = 0x6ffffdf9 + DT_MOVEENT = 0x6ffffdfa + DT_MOVESZ = 0x6ffffdfb + DT_FEATURE_1 = 0x6ffffdfc + DT_POSFLAG_1 = 0x6ffffdfd + DT_SYMINSZ = 0x6ffffdfe + DT_SYMINENT = 0x6ffffdff + DT_GNU_HASH = 0x6ffffef5 + DT_TLSDESC_PLT = 0x6ffffef6 + DT_TLSDESC_GOT = 0x6ffffef7 + DT_GNU_CONFLICT = 0x6ffffef8 + DT_GNU_LIBLIST = 0x6ffffef9 + DT_CONFIG = 0x6ffffefa + DT_DEPAUDIT = 0x6ffffefb + DT_AUDIT = 0x6ffffefc + DT_PLTPAD = 0x6ffffefd + DT_MOVETAB = 0x6ffffefe + DT_SYMINFO = 0x6ffffeff + DT_VERSYM = 0x6ffffff0 + DT_RELACOUNT = 0x6ffffff9 + DT_RELCOUNT = 0x6ffffffa + DT_FLAGS_1 = 0x6ffffffb + DT_VERDEF = 0x6ffffffc + DT_VERDEFNUM = 0x6ffffffd + DT_VERNEED = 0x6ffffffe + DT_VERNEEDNUM = 0x6fffffff + DT_AUXILIARY = 0x7ffffffd + DT_FILTER = 0x7fffffff + +class DtAARCH64(enum.Enum): + """Supplemental DT_* constants for EM_AARCH64.""" + DT_AARCH64_BTI_PLT = 0x70000001 + DT_AARCH64_PAC_PLT = 0x70000003 + DT_AARCH64_VARIANT_PCS = 0x70000005 + +class DtALPHA(enum.Enum): + """Supplemental DT_* constants for EM_ALPHA.""" + DT_ALPHA_PLTRO = 0x70000000 + +class DtALTERA_NIOS2(enum.Enum): + """Supplemental DT_* constants for EM_ALTERA_NIOS2.""" + DT_NIOS2_GP = 0x70000002 + +class DtIA_64(enum.Enum): + """Supplemental DT_* constants for EM_IA_64.""" 
+ DT_IA_64_PLT_RESERVE = 0x70000000 + +class DtMIPS(enum.Enum): + """Supplemental DT_* constants for EM_MIPS.""" + DT_MIPS_RLD_VERSION = 0x70000001 + DT_MIPS_TIME_STAMP = 0x70000002 + DT_MIPS_ICHECKSUM = 0x70000003 + DT_MIPS_IVERSION = 0x70000004 + DT_MIPS_FLAGS = 0x70000005 + DT_MIPS_BASE_ADDRESS = 0x70000006 + DT_MIPS_MSYM = 0x70000007 + DT_MIPS_CONFLICT = 0x70000008 + DT_MIPS_LIBLIST = 0x70000009 + DT_MIPS_LOCAL_GOTNO = 0x7000000a + DT_MIPS_CONFLICTNO = 0x7000000b + DT_MIPS_LIBLISTNO = 0x70000010 + DT_MIPS_SYMTABNO = 0x70000011 + DT_MIPS_UNREFEXTNO = 0x70000012 + DT_MIPS_GOTSYM = 0x70000013 + DT_MIPS_HIPAGENO = 0x70000014 + DT_MIPS_RLD_MAP = 0x70000016 + DT_MIPS_DELTA_CLASS = 0x70000017 + DT_MIPS_DELTA_CLASS_NO = 0x70000018 + DT_MIPS_DELTA_INSTANCE = 0x70000019 + DT_MIPS_DELTA_INSTANCE_NO = 0x7000001a + DT_MIPS_DELTA_RELOC = 0x7000001b + DT_MIPS_DELTA_RELOC_NO = 0x7000001c + DT_MIPS_DELTA_SYM = 0x7000001d + DT_MIPS_DELTA_SYM_NO = 0x7000001e + DT_MIPS_DELTA_CLASSSYM = 0x70000020 + DT_MIPS_DELTA_CLASSSYM_NO = 0x70000021 + DT_MIPS_CXX_FLAGS = 0x70000022 + DT_MIPS_PIXIE_INIT = 0x70000023 + DT_MIPS_SYMBOL_LIB = 0x70000024 + DT_MIPS_LOCALPAGE_GOTIDX = 0x70000025 + DT_MIPS_LOCAL_GOTIDX = 0x70000026 + DT_MIPS_HIDDEN_GOTIDX = 0x70000027 + DT_MIPS_PROTECTED_GOTIDX = 0x70000028 + DT_MIPS_OPTIONS = 0x70000029 + DT_MIPS_INTERFACE = 0x7000002a + DT_MIPS_DYNSTR_ALIGN = 0x7000002b + DT_MIPS_INTERFACE_SIZE = 0x7000002c + DT_MIPS_RLD_TEXT_RESOLVE_ADDR = 0x7000002d + DT_MIPS_PERF_SUFFIX = 0x7000002e + DT_MIPS_COMPACT_SIZE = 0x7000002f + DT_MIPS_GP_VALUE = 0x70000030 + DT_MIPS_AUX_DYNAMIC = 0x70000031 + DT_MIPS_PLTGOT = 0x70000032 + DT_MIPS_RWPLT = 0x70000034 + DT_MIPS_RLD_MAP_REL = 0x70000035 + DT_MIPS_XHASH = 0x70000036 + +class DtPPC(enum.Enum): + """Supplemental DT_* constants for EM_PPC.""" + DT_PPC_GOT = 0x70000000 + DT_PPC_OPT = 0x70000001 + +class DtPPC64(enum.Enum): + """Supplemental DT_* constants for EM_PPC64.""" + DT_PPC64_GLINK = 0x70000000 + DT_PPC64_OPD = 0x70000001 
+ DT_PPC64_OPDSZ = 0x70000002 + DT_PPC64_OPT = 0x70000003 + +class DtSPARC(enum.Enum): + """Supplemental DT_* constants for EM_SPARC.""" + DT_SPARC_REGISTER = 0x70000001 + +class StInfo: + """ELF symbol binding and type. Type of the Sym.st_info field.""" + def __init__(self, arg0, arg1=None): + if isinstance(arg0, int) and arg1 is None: + self.bind = Stb(arg0 >> 4) + self.type = Stt(arg0 & 15) + else: + self.bind = Stb(arg0) + self.type = Stt(arg1) + + def value(self): + """Returns the raw st_info byte: the binding in the high nibble, the type in the low nibble.""" + return (self.bind.value << 4) | (self.type.value) + +# Type in an ELF file. Used for deserialization. +_Layout = collections.namedtuple('_Layout', 'unpack size') + +def _define_layouts(baseclass: type, layout32: str, layout64: str, + types=None, fields32=None): + """Assign variants dict to baseclass. + + The variants dict is indexed by (ElfClass, ElfData) pairs, and its + values are _Layout instances. + + """ + struct32 = struct.Struct(layout32) + struct64 = struct.Struct(layout64) + + # Check that the struct formats yield the right number of components. + for s in (struct32, struct64): + example = s.unpack(b' ' * s.size) + if len(example) != len(baseclass._fields): + raise ValueError('{!r} yields wrong field count: {} != {}'.format( + s.format, len(example), len(baseclass._fields))) + + # Check that field names in types are correct. 
+ if types is None: + types = () + for n in types: + if n not in baseclass._fields: + raise ValueError('{} does not have field {!r}'.format( + baseclass.__name__, n)) + + if fields32 is not None \ + and set(fields32) != set(baseclass._fields): + raise ValueError('{!r} is not a permutation of the fields {!r}'.format( + fields32, baseclass._fields)) + + def unique_name(name, used_names = (set((baseclass.__name__,)) + | set(baseclass._fields) + | {n.__name__ + for n in (types or {}).values()})): + """Find a name that is not used for a class or field name.""" + candidate = name + n = 0 + while candidate in used_names: + n += 1 + candidate = '{}{}'.format(name, n) + used_names.add(candidate) + return candidate + + blob_name = unique_name('blob') + struct_unpack_name = unique_name('struct_unpack') + comps_name = unique_name('comps') + + layouts = {} + for (bits, elfclass, layout, fields) in ( + (32, ElfClass.ELFCLASS32, layout32, fields32), + (64, ElfClass.ELFCLASS64, layout64, None), + ): + for (elfdata, structprefix, funcsuffix) in ( + (ElfData.ELFDATA2LSB, '<', 'LE'), + (ElfData.ELFDATA2MSB, '>', 'BE'), + ): + env = { + baseclass.__name__: baseclass, + struct_unpack_name: struct.unpack, + } + + # Add the type converters. + if types: + for cls in types.values(): + env[cls.__name__] = cls + + funcname = ''.join( + ('unpack_', baseclass.__name__, str(bits), funcsuffix)) + + code = ''' +def {funcname}({blob_name}): +'''.format(funcname=funcname, blob_name=blob_name) + + indent = ' ' * 4 + unpack_call = '{}({!r}, {})'.format( + struct_unpack_name, structprefix + layout, blob_name) + field_names = ', '.join(baseclass._fields) + if types is None and fields is None: + code += '{}return {}({})\n'.format( + indent, baseclass.__name__, unpack_call) + else: + # Destructuring tuple assignment. + if fields is None: + code += '{}{} = {}\n'.format( + indent, field_names, unpack_call) + else: + # Use custom field order. 
+ code += '{}{} = {}\n'.format( + indent, ', '.join(fields), unpack_call) + + # Perform the type conversions. + for n in baseclass._fields: + if n in types: + code += '{}{} = {}({})\n'.format( + indent, n, types[n].__name__, n) + # Create the named tuple. + code += '{}return {}({})\n'.format( + indent, baseclass.__name__, field_names) + + exec(code, env) + layouts[(elfclass, elfdata)] = _Layout( + env[funcname], struct.calcsize(layout)) + baseclass.layouts = layouts + + +# Corresponds to EI_* indices into Elf*_Ehdr.e_indent. +class Ident(collections.namedtuple('Ident', + 'ei_mag ei_class ei_data ei_version ei_osabi ei_abiversion ei_pad')): + + def __new__(cls, *args): + """Construct an object from a blob or its constituent fields.""" + if len(args) == 1: + return cls.unpack(args[0]) + return cls.__base__.__new__(cls, *args) + + @staticmethod + def unpack(blob: memoryview) -> 'Ident': + """Parse raws data into a tuple.""" + ei_mag, ei_class, ei_data, ei_version, ei_osabi, ei_abiversion, \ + ei_pad = struct.unpack('4s5B7s', blob) + return Ident(ei_mag, ElfClass(ei_class), ElfData(ei_data), + ei_version, ei_osabi, ei_abiversion, ei_pad) + size = 16 + +# Corresponds to Elf32_Ehdr and Elf64_Ehdr. +Ehdr = collections.namedtuple('Ehdr', + 'e_ident e_type e_machine e_version e_entry e_phoff e_shoff e_flags' + + ' e_ehsize e_phentsize e_phnum e_shentsize e_shnum e_shstrndx') +_define_layouts(Ehdr, + layout32='16s2H5I6H', + layout64='16s2HI3QI6H', + types=dict(e_ident=Ident, + e_machine=Machine, + e_type=Et, + e_shstrndx=Shn)) + +# Corresponds to Elf32_Phdr and Elf64_Pdhr. Order follows the latter. +Phdr = collections.namedtuple('Phdr', + 'p_type p_flags p_offset p_vaddr p_paddr p_filesz p_memsz p_align') +_define_layouts(Phdr, + layout32='8I', + fields32=('p_type', 'p_offset', 'p_vaddr', 'p_paddr', + 'p_filesz', 'p_memsz', 'p_flags', 'p_align'), + layout64='2I6Q', + types=dict(p_type=Pt, p_flags=Pf)) + + +# Corresponds to Elf32_Shdr and Elf64_Shdr. 
+class Shdr(collections.namedtuple('Shdr', + 'sh_name sh_type sh_flags sh_addr sh_offset sh_size sh_link sh_info' + ' sh_addralign sh_entsize')): + def resolve(self, strtab: 'StringTable') -> 'Shdr': + """Resolve sh_name using a string table.""" + return self.__class__(strtab.get(self[0]), *self[1:]) +_define_layouts(Shdr, + layout32='10I', + layout64='2I4Q2I2Q', + types=dict(sh_type=Sht, + sh_flags=Shf, + sh_link=Shn)) + +# Corresponds to Elf32_Dyn and Elf64_Dyn. The nesting through the +# d_un union is skipped, and d_ptr is missing (its representation in +# Python would be identical to d_val). +Dyn = collections.namedtuple('Dyn', 'd_tag d_val') +_define_layouts(Dyn, + layout32='2i', + layout64='2q', + types=dict(d_tag=Dt)) + +# Corresponds to Elf32_Sym and Elf64_Sym. +class Sym(collections.namedtuple('Sym', + 'st_name st_info st_other st_shndx st_value st_size')): + def resolve(self, strtab: 'StringTable') -> 'Sym': + """Resolve st_name using a string table.""" + return self.__class__(strtab.get(self[0]), *self[1:]) +_define_layouts(Sym, + layout32='3I2BH', + layout64='I2BH2Q', + fields32=('st_name', 'st_value', 'st_size', 'st_info', + 'st_other', 'st_shndx'), + types=dict(st_shndx=Shn, + st_info=StInfo)) + +# Corresponds to Elf32_Rel and Elf64_Rel. +Rel = collections.namedtuple('Rel', 'r_offset r_info') +_define_layouts(Rel, + layout32='2I', + layout64='2Q') + +# Corresponds to Elf32_Rela and Elf64_Rela. r_addend is signed. +Rela = collections.namedtuple('Rela', 'r_offset r_info r_addend') +_define_layouts(Rela, + layout32='2Ii', + layout64='2Qq') + +class StringTable: + """ELF string table.""" + def __init__(self, blob): + """Create a new string table backed by the data in the blob. 
+ + blob: a memoryview-like object + + """ + self.blob = blob + + def get(self, index) -> bytes: + """Returns the null-terminated byte string at the index.""" + blob = self.blob + endindex = index + while True: + if blob[endindex] == 0: + return bytes(blob[index:endindex]) + endindex += 1 + +class Image: + """ELF image parser.""" + def __init__(self, image): + """Create an ELF image from binary image data. + + image: a memoryview-like object that supports efficient range + subscripting. + + """ + self.image = image + ident = self.read(Ident, 0) + classdata = (ident.ei_class, ident.ei_data) + # Set self.Ehdr etc. to the subtypes with the right parsers. + for typ in (Ehdr, Phdr, Shdr, Dyn, Sym, Rel, Rela): + setattr(self, typ.__name__, typ.layouts.get(classdata, None)) + + if self.Ehdr is not None: + self.ehdr = self.read(self.Ehdr, 0) + self._shdr_num = self._compute_shdr_num() + else: + self.ehdr = None + self._shdr_num = 0 + + self._section = {} + self._stringtab = {} + + if self._shdr_num > 0: + self._shdr_strtab = self._find_shdr_strtab() + else: + self._shdr_strtab = None + + @staticmethod + def readfile(path: str) -> 'Image': + """Reads the ELF file at the specified path.""" + with open(path, 'rb') as inp: + return Image(memoryview(inp.read())) + + def _compute_shdr_num(self) -> int: + """Computes the actual number of section headers.""" + shnum = self.ehdr.e_shnum + if shnum == 0: + if self.ehdr.e_shoff == 0 or self.ehdr.e_shentsize == 0: + # No section headers. + return 0 + # Otherwise the extension mechanism is used (which may be + # needed because e_shnum is just 16 bits). 
+ return self.read(self.Shdr, self.ehdr.e_shoff).sh_size + return shnum + + def _find_shdr_strtab(self) -> StringTable: + """Finds the section header string table (maybe via extensions).""" + shstrndx = self.ehdr.e_shstrndx + if shstrndx == Shn.SHN_XINDEX: + shstrndx = self.read(self.Shdr, self.ehdr.e_shoff).sh_link + return self._find_stringtab(shstrndx) + + def read(self, typ: type, offset:int ): + """Reads an object at a specific offset. + + The type must have been enhanced using _define_variants. + + """ + return typ.unpack(self.image[offset: offset + typ.size]) + + def phdrs(self) -> Phdr: + """Generator iterating over the program headers.""" + if self.ehdr is None: + return + size = self.ehdr.e_phentsize + if size != self.Phdr.size: + raise ValueError('Unexpected Phdr size in ELF header: {} != {}' + .format(size, self.Phdr.size)) + + offset = self.ehdr.e_phoff + for _ in range(self.ehdr.e_phnum): + yield self.read(self.Phdr, offset) + offset += size + + def shdrs(self, resolve: bool=True) -> Shdr: + """Generator iterating over the section headers. + + If resolve, section names are automatically translated + using the section header string table. + + """ + if self._shdr_num == 0: + return + + size = self.ehdr.e_shentsize + if size != self.Shdr.size: + raise ValueError('Unexpected Shdr size in ELF header: {} != {}' + .format(size, self.Shdr.size)) + + offset = self.ehdr.e_shoff + for _ in range(self._shdr_num): + shdr = self.read(self.Shdr, offset) + if resolve: + shdr = shdr.resolve(self._shdr_strtab) + yield shdr + offset += size + + def dynamic(self) -> Dyn: + """Generator iterating over the dynamic segment.""" + for phdr in self.phdrs(): + if phdr.p_type == Pt.PT_DYNAMIC: + # Pick the first dynamic segment, like the loader. + if phdr.p_filesz == 0: + # Probably separated debuginfo. 
+ return + offset = phdr.p_offset + end = offset + phdr.p_memsz + size = self.Dyn.size + while True: + next_offset = offset + size + if next_offset > end: + raise ValueError( + 'Dynamic segment size {} is not a multiple of Dyn size {}'.format( + phdr.p_memsz, size)) + yield self.read(self.Dyn, offset) + if next_offset == end: + return + offset = next_offset + + def syms(self, shdr: Shdr, resolve: bool=True) -> Sym: + """A generator iterating over a symbol table. + + If resolve, symbol names are automatically translated using + the string table for the symbol table. + + """ + assert shdr.sh_type == Sht.SHT_SYMTAB + size = shdr.sh_entsize + if size != self.Sym.size: + raise ValueError('Invalid symbol table entry size {}'.format(size)) + offset = shdr.sh_offset + end = shdr.sh_offset + shdr.sh_size + if resolve: + strtab = self._find_stringtab(shdr.sh_link) + while offset < end: + sym = self.read(self.Sym, offset) + if resolve: + sym = sym.resolve(strtab) + yield sym + offset += size + if offset != end: + raise ValueError('Symbol table is not a multiple of entry size') + + def lookup_string(self, strtab_index: int, strtab_offset: int) -> bytes: + """Looks up a string in a string table identified by its link index.""" + try: + strtab = self._stringtab[strtab_index] + except KeyError: + strtab = self._find_stringtab(strtab_index) + return strtab.get(strtab_offset) + + def find_section(self, shndx: Shn) -> Shdr: + """Returns the section header for the indexed section. + + The section name is not resolved. 
+ """ + try: + return self._section[shndx] + except KeyError: + pass + if shndx in Shn: + raise ValueError('Reserved section index {}'.format(shndx)) + idx = shndx.value + if idx < 0 or idx >= self._shdr_num: + raise ValueError('Section index {} out of range [0, {})'.format( + idx, self._shdr_num)) + shdr = self.read( + self.Shdr, self.ehdr.e_shoff + idx * self.Shdr.size) + self._section[shndx] = shdr + return shdr + + def _find_stringtab(self, sh_link: int) -> StringTable: + if sh_link in self._stringtab: + return self._stringtab[sh_link] + if sh_link < 0 or sh_link >= self._shdr_num: + raise ValueError('Section index {} out of range [0, {})'.format( + sh_link, self._shdr_num)) + shdr = self.read( + self.Shdr, self.ehdr.e_shoff + sh_link * self.Shdr.size) + if shdr.sh_type != Sht.SHT_STRTAB: + raise ValueError( + 'Section {} is not a string table: {}'.format( + sh_link, shdr.sh_type)) + strtab = StringTable( + self.image[shdr.sh_offset:shdr.sh_offset + shdr.sh_size]) + # This could retain essentially arbitrary amounts of data, + # but caching string tables seems important for performance. + self._stringtab[sh_link] = strtab + return strtab + + +__all__ = [name for name in dir() if name[0].isupper()] diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py index a514179bb..0e19ce1e7 100644 --- a/scripts/tst-elf-edit.py +++ b/scripts/tst-elf-edit.py @@ -43,9 +43,11 @@ EI_DATA=5 ELFDATA2LSB=b'\x01' ELFDATA2MSB=b'\x02' +ET_EXEC=2 ET_DYN=3 PT_LOAD=1 +PT_TLS=7 def elf_types_fmts(e_ident): endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>' @@ -146,8 +148,15 @@ def elf_edit_align(phdr, align): else: phdr.p_align = int(align) +def elf_edit_maximize_tls_size(phdr, elfclass): + if elfclass == ELFCLASS32: + # It is possible that the kernel can allocate half of the + # address space, so use something larger. 
+ phdr.p_memsz = 0xfff00000 + else: + phdr.p_memsz = 1 << 63 -def elf_edit(f, align): +def elf_edit(f, opts): ei_nident_fmt = 'c' * EI_NIDENT ei_nident_len = struct.calcsize(ei_nident_fmt) @@ -172,24 +181,35 @@ def elf_edit(f, align): ehdr = Elf_Ehdr(e_ident) ehdr.read(f) - if ehdr.e_type != ET_DYN: - error('{}: not a shared library'.format(f.name)) + if ehdr.e_type not in (ET_EXEC, ET_DYN): + error('{}: not an executable or shared library'.format(f.name)) phdr = Elf_Phdr(e_ident) + maximize_tls_size_done = False for i in range(0, ehdr.e_phnum): f.seek(ehdr.e_phoff + i * phdr.len) phdr.read(f) - if phdr.p_type == PT_LOAD: - elf_edit_align(phdr, align) + if phdr.p_type == PT_LOAD and opts.align is not None: + elf_edit_align(phdr, opts.align) + f.seek(ehdr.e_phoff + i * phdr.len) + phdr.write(f) + break + if phdr.p_type == PT_TLS and opts.maximize_tls_size: + elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS]) f.seek(ehdr.e_phoff + i * phdr.len) phdr.write(f) + maximize_tls_size_done = True break + if opts.maximize_tls_size and not maximize_tls_size_done: + error('{}: TLS maximum size was not updated'.format(f.name)) def get_parser(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument('-a', dest='align', required=True, + parser.add_argument('-a', dest='align', help='How to set the LOAD alignment') + parser.add_argument('--maximize-tls-size', action='store_true', + help='Set maximum PT_TLS size') parser.add_argument('output', help='ELF file to edit') return parser @@ -199,7 +219,7 @@ def main(argv): parser = get_parser() opts = parser.parse_args(argv) with open(opts.output, 'r+b') as fout: - elf_edit(fout, opts.align) + elf_edit(fout, opts) if __name__ == '__main__': diff --git a/scripts/tst-ld-trace.py b/scripts/tst-ld-trace.py new file mode 100755 index 000000000..f5a402800 --- /dev/null +++ b/scripts/tst-ld-trace.py @@ -0,0 +1,108 @@ +#!/usr/bin/python3 +# Dump the output of LD_TRACE_LOADED_OBJECTS in architecture neutral format. 
+# Copyright (C) 2022 Free Software Foundation, Inc. +# Copyright The GNU Toolchain Authors. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . + +import argparse +import os +import subprocess +import sys + +try: + subprocess.run +except: + class _CompletedProcess: + def __init__(self, args, returncode, stdout=None, stderr=None): + self.args = args + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + + def _run(*popenargs, input=None, timeout=None, check=False, **kwargs): + assert(timeout is None) + with subprocess.Popen(*popenargs, **kwargs) as process: + try: + stdout, stderr = process.communicate(input) + except: + process.kill() + process.wait() + raise + returncode = process.poll() + if check and returncode: + raise subprocess.CalledProcessError(returncode, popenargs) + return _CompletedProcess(popenargs, returncode, stdout, stderr) + + subprocess.run = _run + +def is_vdso(lib): + return lib.startswith('linux-gate') or lib.startswith('linux-vdso') + + +def parse_trace(cmd, fref): + new_env = os.environ.copy() + new_env['LD_TRACE_LOADED_OBJECTS'] = '1' + trace_out = subprocess.run(cmd, stdout=subprocess.PIPE, check=True, + universal_newlines=True, env=new_env).stdout + trace = [] + for line in trace_out.splitlines(): + line = line.strip() + if is_vdso(line): + continue + fields = line.split('=>' if 
'=>' in line else ' ') + lib = os.path.basename(fields[0].strip()) + if lib.startswith('ld'): + lib = 'ld' + elif lib.startswith('libc'): + lib = 'libc' + found = 1 if fields[1].strip() != 'not found' else 0 + trace += ['{} {}'.format(lib, found)] + trace = sorted(trace) + + reference = sorted(line.replace('\n','') for line in fref.readlines()) + + ret = 0 if trace == reference else 1 + if ret != 0: + for i in reference: + if i not in trace: + print("Only in {}: {}".format(fref.name, i)) + for i in trace: + if i not in reference: + print("Only in trace: {}".format(i)) + + sys.exit(ret) + + +def get_parser(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('command', + help='comand to run') + parser.add_argument('reference', + help='reference file to compare') + return parser + + +def main(argv): + parser = get_parser() + opts = parser.parse_args(argv) + with open(opts.reference, 'r') as fref: + # Remove the initial 'env' command. + parse_trace(opts.command.split()[1:], fref) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/shlib-versions b/shlib-versions index df6603e69..b87ab50c5 100644 --- a/shlib-versions +++ b/shlib-versions @@ -47,11 +47,6 @@ libnss_ldap=2 libnss_hesiod=2 libnss_db=2 -# Tests for NSS. They must have the same NSS_SHLIB_REVISION number as -# the rest. -libnss_test1=2 -libnss_test2=2 - # Version for libnsl with YP and NIS+ functions. 
libnsl=1 diff --git a/socket/Makefile b/socket/Makefile index 156eec6c8..2bde78387 100644 --- a/socket/Makefile +++ b/socket/Makefile @@ -34,6 +34,7 @@ routines := accept bind connect getpeername getsockname getsockopt \ tests := \ tst-accept4 \ tst-sockopt \ + tst-cmsghdr \ # tests tests-internal := \ diff --git a/socket/sys/socket.h b/socket/sys/socket.h index 7d5b21a2c..0abfb5dd0 100644 --- a/socket/sys/socket.h +++ b/socket/sys/socket.h @@ -181,7 +181,7 @@ extern ssize_t __REDIRECT (sendmsg, (int __fd, const struct msghdr *__message, # else extern ssize_t __sendmsg64 (int __fd, const struct msghdr *__message, int __flags); -# defien sendmsg __sendmsg64 +# define sendmsg __sendmsg64 # endif #endif diff --git a/socket/tst-cmsghdr-skeleton.c b/socket/tst-cmsghdr-skeleton.c new file mode 100644 index 000000000..4c6898569 --- /dev/null +++ b/socket/tst-cmsghdr-skeleton.c @@ -0,0 +1,92 @@ +/* Test ancillary data header creation. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* We use the preprocessor to generate the function/macro tests instead of + using indirection because having all the macro expansions alongside + each other lets the compiler warn us about suspicious pointer + arithmetic across subsequent CMSG_{FIRST,NXT}HDR expansions. 
*/ + +#include + +#define RUN_TEST_CONCAT(suffix) run_test_##suffix +#define RUN_TEST_FUNCNAME(suffix) RUN_TEST_CONCAT (suffix) + +static void +RUN_TEST_FUNCNAME (CMSG_NXTHDR_IMPL) (void) +{ + struct msghdr m = {0}; + struct cmsghdr *cmsg; + char cmsgbuf[3 * CMSG_SPACE (sizeof (PAYLOAD))] = {0}; + + m.msg_control = cmsgbuf; + m.msg_controllen = sizeof (cmsgbuf); + + /* First header should point to the start of the buffer. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + + /* If the first header length consumes the entire buffer, there is no + space remaining for additional headers. */ + cmsg->cmsg_len = sizeof (cmsgbuf); + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg == NULL); + + /* The first header length is so big, using it would cause an overflow. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + cmsg->cmsg_len = SIZE_MAX; + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg == NULL); + + /* The first header leaves just enough space to hold another header. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + cmsg->cmsg_len = sizeof (cmsgbuf) - sizeof (struct cmsghdr); + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg != NULL); + + /* The first header leaves space but not enough for another header. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + cmsg->cmsg_len ++; + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg == NULL); + + /* The second header leaves just enough space to hold another header. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + cmsg->cmsg_len = CMSG_LEN (sizeof (PAYLOAD)); + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg != NULL); + cmsg->cmsg_len = sizeof (cmsgbuf) + - CMSG_SPACE (sizeof (PAYLOAD)) /* First header. 
*/ + - sizeof (struct cmsghdr); + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg != NULL); + + /* The second header leaves space but not enough for another header. */ + cmsg = CMSG_FIRSTHDR (&m); + TEST_VERIFY_EXIT ((char *) cmsg == cmsgbuf); + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg != NULL); + cmsg->cmsg_len ++; + cmsg = CMSG_NXTHDR_IMPL (&m, cmsg); + TEST_VERIFY_EXIT (cmsg == NULL); + + return; +} diff --git a/socket/tst-cmsghdr.c b/socket/tst-cmsghdr.c new file mode 100644 index 000000000..68c96d3c9 --- /dev/null +++ b/socket/tst-cmsghdr.c @@ -0,0 +1,56 @@ +/* Test ancillary data header creation. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +#define PAYLOAD "Hello, World!" + +/* CMSG_NXTHDR is a macro that calls an inline function defined in + bits/socket.h. In case the function cannot be inlined, libc.so carries + a copy. Both versions need to be tested. 
*/ + +#define CMSG_NXTHDR_IMPL CMSG_NXTHDR +#include "tst-cmsghdr-skeleton.c" +#undef CMSG_NXTHDR_IMPL + +static struct cmsghdr * (* cmsg_nxthdr) (struct msghdr *, struct cmsghdr *); + +#define CMSG_NXTHDR_IMPL cmsg_nxthdr +#include "tst-cmsghdr-skeleton.c" +#undef CMSG_NXTHDR_IMPL + +static int +do_test (void) +{ + static void *handle; + + run_test_CMSG_NXTHDR (); + + handle = xdlopen (LIBC_SO, RTLD_LAZY); + cmsg_nxthdr = (struct cmsghdr * (*) (struct msghdr *, struct cmsghdr *)) + xdlsym (handle, "__cmsg_nxthdr"); + + run_test_cmsg_nxthdr (); + + return 0; +} + +#include diff --git a/stdlib/Makefile b/stdlib/Makefile index 823674198..164dd8909 100644 --- a/stdlib/Makefile +++ b/stdlib/Makefile @@ -217,6 +217,9 @@ CFLAGS-tst-qsort.c += $(stack-align-test-flags) CFLAGS-tst-makecontext.c += -funwind-tables CFLAGS-tst-makecontext2.c += $(stack-align-test-flags) +CFLAGS-testmb.c += -D_FORTIFY_SOURCE=2 -Wall -Werror + + # Run a test on the header files we use. tests-special += $(objpfx)isomac.out diff --git a/stdlib/bits/stdlib.h b/stdlib/bits/stdlib.h index 277d099e2..de1c3b20f 100644 --- a/stdlib/bits/stdlib.h +++ b/stdlib/bits/stdlib.h @@ -96,6 +96,11 @@ extern size_t __mbstowcs_chk (wchar_t *__restrict __dst, const char *__restrict __src, size_t __len, size_t __dstlen) __THROW __attr_access ((__write_only__, 1, 3)) __attr_access ((__read_only__, 2)); +extern size_t __REDIRECT_NTH (__mbstowcs_nulldst, + (wchar_t *__restrict __dst, + const char *__restrict __src, + size_t __len), mbstowcs) + __attr_access ((__read_only__, 2)); extern size_t __REDIRECT_NTH (__mbstowcs_alias, (wchar_t *__restrict __dst, const char *__restrict __src, @@ -112,12 +117,13 @@ __fortify_function size_t __NTH (mbstowcs (wchar_t *__restrict __dst, const char *__restrict __src, size_t __len)) { - return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t), - __glibc_objsize (__dst), - __dst, __src, __len); + if (__builtin_constant_p (__dst == NULL) && __dst == NULL) + return __mbstowcs_nulldst 
(__dst, __src, __len); + else + return __glibc_fortify_n (mbstowcs, __len, sizeof (wchar_t), + __glibc_objsize (__dst), __dst, __src, __len); } - extern size_t __wcstombs_chk (char *__restrict __dst, const wchar_t *__restrict __src, size_t __len, size_t __dstlen) __THROW diff --git a/stdlib/testmb.c b/stdlib/testmb.c index 45dae7db6..6ac4dfd21 100644 --- a/stdlib/testmb.c +++ b/stdlib/testmb.c @@ -16,6 +16,13 @@ main (int argc, char *argv[]) lose = 1; } + i = mbstowcs (NULL, "bar", 4); + if (!(i == 3 && w[1] == 'a')) + { + puts ("mbstowcs FAILED2!"); + lose = 1; + } + mbstowcs (w, "blah", 5); i = wcstombs (c, w, 10); if (i != 4) diff --git a/string/bits/string_fortified.h b/string/bits/string_fortified.h index f4a5dfc2e..149ebbb08 100644 --- a/string/bits/string_fortified.h +++ b/string/bits/string_fortified.h @@ -107,7 +107,7 @@ __NTH (stpncpy (char *__dest, const char *__src, size_t __n)) # else extern char *__stpncpy_chk (char *__dest, const char *__src, size_t __n, size_t __destlen) __THROW - __fortified_attr_access ((__write_only__, 1, 3)) + __fortified_attr_access (__write_only__, 1, 3) __attr_access ((__read_only__, 2)); extern char *__REDIRECT_NTH (__stpncpy_alias, (char *__dest, const char *__src, size_t __n), stpncpy); diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c index cafb75298..703e8ec27 100644 --- a/string/test-rawmemchr.c +++ b/string/test-rawmemchr.c @@ -17,6 +17,7 @@ . */ #include +#include #define TEST_MAIN #define TEST_NAME "rawmemchr" @@ -50,13 +51,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res) } } +static void +do_test_bz29234 (void) +{ + size_t i, j; + char *ptr_start; + char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); + + memset (buf, -1, 8192); + + ptr_start = buf + 4096 - 8; + + /* Out of range matches before the start of a page. 
*/ + memset (ptr_start - 8, 0x1, 8); + + for (j = 0; j < 8; ++j) + { + for (i = 0; i < 128; ++i) + { + ptr_start[i + j] = 0x1; + + FOR_EACH_IMPL (impl, 0) + do_one_test (impl, (char *) (ptr_start + j), 0x1, + ptr_start + i + j); + + ptr_start[i + j] = 0xff; + } + } + + xmunmap (buf, 8192); +} + static void do_test (size_t align, size_t pos, size_t len, int seek_char) { size_t i; char *result; - align &= 7; + align &= getpagesize () - 1; if (align + len >= page_size) return; @@ -114,6 +147,13 @@ do_random_tests (void) } } + if (align) + { + p[align - 1] = seek_char; + if (align > 4) + p[align - 4] = seek_char; + } + assert (pos < len); size_t r = random (); if ((r & 31) == 0) @@ -129,6 +169,13 @@ do_random_tests (void) result, p); ret = 1; } + + if (align) + { + p[align - 1] = seek_char; + if (align > 4) + p[align - 4] = seek_char; + } } } @@ -150,14 +197,22 @@ test_main (void) do_test (i, 64, 256, 23); do_test (0, 16 << i, 2048, 0); do_test (i, 64, 256, 0); + + do_test (getpagesize () - i, 64, 256, 23); + do_test (getpagesize () - i, 64, 256, 0); } for (i = 1; i < 32; ++i) { do_test (0, i, i + 1, 23); do_test (0, i, i + 1, 0); + + do_test (getpagesize () - 7, i, i + 1, 23); + do_test (getpagesize () - i / 2, i, i + 1, 23); + do_test (getpagesize () - i, i, i + 1, 23); } do_random_tests (); + do_test_bz29234 (); return ret; } diff --git a/string/test-strncmp.c b/string/test-strncmp.c index e7d5edea3..358f40eb5 100644 --- a/string/test-strncmp.c +++ b/string/test-strncmp.c @@ -434,6 +434,28 @@ check3 (void) } } +static void +check4 (void) +{ + /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of + the end of the page. 2) For there to be no mismatch/null byte before the + first page cross. 3) For length (`n`) to be large enough for one string to + cross the page. And 4) for there to be either mismatch/null bytes before + the start of the strings. 
*/ + + size_t size = 10; + size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); + CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); + CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); + int exp_result; + + STRCPY (s1, L ("tst-tlsmod%")); + STRCPY (s2, L ("tst-tls-manydynamic73mod")); + exp_result = SIMPLE_STRNCMP (s1, s2, size); + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1, s2, size, exp_result); +} + int test_main (void) { @@ -444,6 +466,7 @@ test_main (void) check1 (); check2 (); check3 (); + check4 (); printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index 2ebe7901c..4a5e7b63d 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -1121,9 +1121,11 @@ extern void _dl_init (struct link_map *main_map, int argc, char **argv, initializer functions have completed. */ extern void _dl_fini (void) attribute_hidden; -/* Sort array MAPS according to dependencies of the contained objects. */ +/* Sort array MAPS according to dependencies of the contained objects. + If FORCE_FIRST, MAPS[0] keeps its place even if the dependencies + say otherwise. */ extern void _dl_sort_maps (struct link_map **maps, unsigned int nmaps, - unsigned int skip, bool for_fini) attribute_hidden; + bool force_first, bool for_fini) attribute_hidden; /* The dynamic linker calls this function before and having changing any shared object mappings. The `r_state' member of `struct r_debug' @@ -1254,6 +1256,11 @@ extern struct link_map * _dl_get_dl_main_map (void) # endif #endif +/* Perform early memory allocation, avoiding a TCB dependency. + Terminate the process if allocation fails. May attempt to use + brk. */ +void *_dl_early_allocate (size_t size) attribute_hidden; + /* Initialize the DSO sort algorithm to use. 
*/ #if !HAVE_TUNABLES static inline void diff --git a/sysdeps/generic/libc-lock-arch.h b/sysdeps/generic/libc-lock-arch.h new file mode 100644 index 000000000..4713b30a8 --- /dev/null +++ b/sysdeps/generic/libc-lock-arch.h @@ -0,0 +1,25 @@ +/* Private libc-internal arch-specific definitions. Generic version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <https://www.gnu.org/licenses/>. */ + +#ifndef _LIBC_LOCK_ARCH_H +#define _LIBC_LOCK_ARCH_H + +/* The default definition uses the natural alignment from the lock type. */ +#define __LIBC_LOCK_ALIGNMENT + +#endif diff --git a/sysdeps/generic/startup.h b/sysdeps/generic/startup.h index 99509404e..45979ab23 100644 --- a/sysdeps/generic/startup.h +++ b/sysdeps/generic/startup.h @@ -23,27 +23,3 @@ /* Use macro instead of inline function to avoid including . 
*/ #define _startup_fatal(message) __libc_fatal ((message)) - -static inline uid_t -startup_getuid (void) -{ - return __getuid (); -} - -static inline uid_t -startup_geteuid (void) -{ - return __geteuid (); -} - -static inline gid_t -startup_getgid (void) -{ - return __getgid (); -} - -static inline gid_t -startup_getegid (void) -{ - return __getegid (); -} diff --git a/sysdeps/hppa/dl-fptr.c b/sysdeps/hppa/dl-fptr.c index 2584557c4..9ed21602d 100644 --- a/sysdeps/hppa/dl-fptr.c +++ b/sysdeps/hppa/dl-fptr.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -351,21 +352,20 @@ _dl_lookup_address (const void *address) { ElfW(Addr) addr = (ElfW(Addr)) address; ElfW(Word) reloc_arg; - volatile unsigned int *desc; - unsigned int *gptr; + unsigned int *desc, *gptr; /* Return ADDR if the least-significant two bits of ADDR are not consistent with ADDR being a linker defined function pointer. The normal value for a code address in a backtrace is 3. */ - if (((unsigned int) addr & 3) != 2) + if (((uintptr_t) addr & 3) != 2) return addr; /* Handle special case where ADDR points to page 0. */ - if ((unsigned int) addr < 4096) + if ((uintptr_t) addr < 4096) return addr; /* Clear least-significant two bits from descriptor address. */ - desc = (unsigned int *) ((unsigned int) addr & ~3); + desc = (unsigned int *) ((uintptr_t) addr & ~3); if (!_dl_read_access_allowed (desc)) return addr; @@ -376,7 +376,7 @@ _dl_lookup_address (const void *address) /* Then load first word of candidate descriptor. It should be a pointer with word alignment and point to memory that can be read. */ gptr = (unsigned int *) desc[0]; - if (((unsigned int) gptr & 3) != 0 + if (((uintptr_t) gptr & 3) != 0 || !_dl_read_access_allowed (gptr)) return addr; @@ -400,10 +400,11 @@ _dl_lookup_address (const void *address) /* If gp has been resolved, we need to hunt for relocation offset. 
*/ if (!(reloc_arg & PA_GP_RELOC)) - reloc_arg = _dl_fix_reloc_arg (addr, l); + reloc_arg = _dl_fix_reloc_arg ((struct fdesc *) addr, l); _dl_fixup (l, reloc_arg); } return (ElfW(Addr)) desc[0]; } +rtld_hidden_def (_dl_lookup_address) diff --git a/sysdeps/hppa/dl-lookupcfg.h b/sysdeps/hppa/dl-lookupcfg.h index 8da2412fe..de0a3b78e 100644 --- a/sysdeps/hppa/dl-lookupcfg.h +++ b/sysdeps/hppa/dl-lookupcfg.h @@ -30,6 +30,7 @@ rtld_hidden_proto (_dl_symbol_address) #define DL_SYMBOL_ADDRESS(map, ref) _dl_symbol_address(map, ref) Elf32_Addr _dl_lookup_address (const void *address); +rtld_hidden_proto (_dl_lookup_address) #define DL_LOOKUP_ADDRESS(addr) _dl_lookup_address ((const void *) addr) @@ -79,7 +80,9 @@ void attribute_hidden _dl_unmap (struct link_map *map); /* Extract the code address from a fixup value */ #define DL_FIXUP_VALUE_CODE_ADDR(value) ((value).ip) #define DL_FIXUP_VALUE_ADDR(value) ((uintptr_t) &(value)) -#define DL_FIXUP_ADDR_VALUE(addr) (*(struct fdesc *) (addr)) +/* Clear the plabel bit to get the actual address of the descriptor. */ +#define DL_FIXUP_ADDR_VALUE(addr) \ + (*(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (addr) & ~2)) #define DL_FIXUP_BINDNOW_ADDR_VALUE(addr) (addr) -#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value) \ - (*value) = *(struct fdesc *) (st_value) +#define DL_FIXUP_BINDNOW_RELOC(value, new_value, st_value) \ + *(value) = *(DL_FIXUP_VALUE_TYPE *) ((uintptr_t) (new_value) & ~2) diff --git a/sysdeps/hppa/dl-machine.h b/sysdeps/hppa/dl-machine.h index da4d57d2e..7b647abfd 100644 --- a/sysdeps/hppa/dl-machine.h +++ b/sysdeps/hppa/dl-machine.h @@ -176,6 +176,15 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], Elf32_Addr i[2]; } sig = {{0x00,0xc0,0xff,0xee, 0xde,0xad,0xbe,0xef}}; + /* Initialize dp register for main executable. 
*/ + if (l->l_main_map) + { + register Elf32_Addr dp asm ("%r27"); + + dp = D_PTR (l, l_info[DT_PLTGOT]); + asm volatile ("" : : "r" (dp)); + } + /* If we don't have a PLT we can just skip all this... */ if (__builtin_expect (l->l_info[DT_JMPREL] == NULL,0)) return lazy; @@ -338,16 +347,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[], its return value is the user program's entry point. */ #define RTLD_START \ -/* Set up dp for any non-PIC lib constructors that may be called. */ \ -static struct link_map * __attribute__((used)) \ -set_dp (struct link_map *map) \ -{ \ - register Elf32_Addr dp asm ("%r27"); \ - dp = D_PTR (map, l_info[DT_PLTGOT]); \ - asm volatile ("" : : "r" (dp)); \ - return map; \ -} \ - \ asm ( \ " .text\n" \ " .globl _start\n" \ @@ -355,10 +354,6 @@ asm ( \ "_start:\n" \ /* The kernel does not give us an initial stack frame. */ \ " ldo 64(%sp),%sp\n" \ - /* Save the relevant arguments (yes, those are the correct \ - registers, the kernel is weird) in their stack slots. */ \ -" stw %r25,-40(%sp)\n" /* argc */ \ -" stw %r24,-44(%sp)\n" /* argv */ \ \ /* We need the LTP, and we need it now. \ $PIC_pcrel$0 points 8 bytes past the current instruction, \ @@ -416,12 +411,7 @@ asm ( \ So, obviously, we can't just pass %sp to _dl_start. That's \ okay, argv-4 will do just fine. \ \ - The pleasant part of this is that if we need to skip \ - arguments we can just decrement argc and move argv, because \ - the stack pointer is utterly unrelated to the location of \ - the environment and argument vectors. */ \ - \ - /* This is always within range so we'll be okay. */ \ + This is always within range so we'll be okay. */ \ " bl _dl_start,%rp\n" \ " ldo -4(%r24),%r26\n" \ \ @@ -431,30 +421,28 @@ asm ( \ /* Save the entry point in %r3. */ \ " copy %ret0,%r3\n" \ \ - /* See if we were called as a command with the executable file \ - name as an extra leading argument. 
*/ \ -" addil LT'_dl_skip_args,%r19\n" \ -" ldw RT'_dl_skip_args(%r1),%r20\n" \ -" ldw 0(%r20),%r20\n" \ - \ -" ldw -40(%sp),%r25\n" /* argc */ \ -" comib,= 0,%r20,.Lnofix\n" /* FIXME: Mispredicted branch */\ -" ldw -44(%sp),%r24\n" /* argv (delay slot) */ \ + /* The loader adjusts argc, argv, env, and the aux vectors \ + directly on the stack to remove any arguments used for \ + direct loader invocation. Thus, argc and argv must be \ + reloaded from _dl_argc and _dl_argv. */ \ \ -" sub %r25,%r20,%r25\n" \ + /* Load argc from _dl_argc. */ \ +" addil LT'_dl_argc,%r19\n" \ +" ldw RT'_dl_argc(%r1),%r20\n" \ +" ldw 0(%r20),%r25\n" \ " stw %r25,-40(%sp)\n" \ -" sh2add %r20,%r24,%r24\n" \ + \ + /* Same for argv with _dl_argv. */ \ +" addil LT'_dl_argv,%r19\n" \ +" ldw RT'_dl_argv(%r1),%r20\n" \ +" ldw 0(%r20),%r24\n" \ " stw %r24,-44(%sp)\n" \ \ -".Lnofix:\n" \ + /* Call _dl_init(main_map, argc, argv, envp). */ \ " addil LT'_rtld_local,%r19\n" \ " ldw RT'_rtld_local(%r1),%r26\n" \ -" bl set_dp, %r2\n" \ " ldw 0(%r26),%r26\n" \ \ - /* Call _dl_init(_dl_loaded, argc, argv, envp). */ \ -" copy %r28,%r26\n" \ - \ /* envp = argv + argc + 1 */ \ " sh2add %r25,%r24,%r23\n" \ " bl _dl_init,%r2\n" \ diff --git a/sysdeps/hppa/dl-runtime.c b/sysdeps/hppa/dl-runtime.c index 8b2ee58e3..192a6bee0 100644 --- a/sysdeps/hppa/dl-runtime.c +++ b/sysdeps/hppa/dl-runtime.c @@ -25,8 +25,7 @@ return that to the caller. The caller will continue on to call _dl_fixup with the relocation offset. 
*/ -ElfW(Word) -attribute_hidden __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE +ElfW(Word) __attribute ((noinline)) DL_ARCH_FIXUP_ATTRIBUTE _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l) { Elf32_Addr l_addr, iplt, jmprel, end_jmprel, r_type; @@ -52,3 +51,4 @@ _dl_fix_reloc_arg (struct fdesc *fptr, struct link_map *l) ABORT_INSTRUCTION; return 0; } +rtld_hidden_def (_dl_fix_reloc_arg) diff --git a/sysdeps/hppa/dl-runtime.h b/sysdeps/hppa/dl-runtime.h index d4da46079..5ced8e14e 100644 --- a/sysdeps/hppa/dl-runtime.h +++ b/sysdeps/hppa/dl-runtime.h @@ -17,6 +17,9 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +ElfW(Word) _dl_fix_reloc_arg (struct fdesc *, struct link_map *); +rtld_hidden_proto (_dl_fix_reloc_arg) + /* Clear PA_GP_RELOC bit in relocation offset. */ static inline uintptr_t reloc_offset (uintptr_t plt0, uintptr_t pltn) diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps index 760104911..84e6686eb 100644 --- a/sysdeps/i386/fpu/libm-test-ulps +++ b/sysdeps/i386/fpu/libm-test-ulps @@ -668,7 +668,7 @@ ldouble: 4 Function: Imaginary part of "clog10": double: 2 -float: 1 +float: 2 float128: 2 ldouble: 2 diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps index a39c89cec..cc21e6907 100644 --- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps +++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps @@ -668,7 +668,7 @@ ldouble: 4 Function: Imaginary part of "clog10": double: 2 -float: 1 +float: 2 float128: 2 ldouble: 2 diff --git a/sysdeps/m68k/dl-machine.h b/sysdeps/m68k/dl-machine.h index c44ab055a..bb51b4198 100644 --- a/sysdeps/m68k/dl-machine.h +++ b/sysdeps/m68k/dl-machine.h @@ -234,6 +234,11 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], switch (r_type) { + case R_68K_GLOB_DAT: + case R_68K_JMP_SLOT: + *reloc_addr = value; + break; +#ifndef RTLD_BOOTSTRAP case R_68K_COPY: if (sym == NULL) /* This 
can happen in trace mode if an object could not be @@ -252,10 +257,6 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], memcpy (reloc_addr_arg, (void *) value, MIN (sym->st_size, refsym->st_size)); break; - case R_68K_GLOB_DAT: - case R_68K_JMP_SLOT: - *reloc_addr = value; - break; case R_68K_8: *(char *) reloc_addr = value + reloc->r_addend; break; @@ -276,7 +277,6 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], case R_68K_PC32: *reloc_addr = value + reloc->r_addend - (Elf32_Addr) reloc_addr; break; -#ifndef RTLD_BOOTSTRAP case R_68K_TLS_DTPMOD32: /* Get the information from the link map returned by the resolv function. */ @@ -294,9 +294,9 @@ elf_machine_rela (struct link_map *map, struct r_scope_elem *scope[], *reloc_addr = TLS_TPREL_VALUE (sym_map, sym, reloc); } break; -#endif /* !RTLD_BOOTSTRAP */ case R_68K_NONE: /* Alright, Wilbur. */ break; +#endif /* !RTLD_BOOTSTRAP */ default: _dl_reloc_bad_type (map, r_type, 0); break; diff --git a/sysdeps/mach/hurd/bits/socket.h b/sysdeps/mach/hurd/bits/socket.h index 5b35ea81e..70fce4fb2 100644 --- a/sysdeps/mach/hurd/bits/socket.h +++ b/sysdeps/mach/hurd/bits/socket.h @@ -249,6 +249,12 @@ struct cmsghdr + CMSG_ALIGN (sizeof (struct cmsghdr))) #define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len)) +/* Given a length, return the additional padding necessary such that + len + __CMSG_PADDING(len) == CMSG_ALIGN (len). 
*/ +#define __CMSG_PADDING(len) ((sizeof (size_t) \ + - ((len) & (sizeof (size_t) - 1))) \ + & (sizeof (size_t) - 1)) + extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg) __THROW; #ifdef __USE_EXTERN_INLINES @@ -258,18 +264,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, _EXTERN_INLINE struct cmsghdr * __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg)) { + /* We may safely assume that __cmsg lies between __mhdr->msg_control and + __mhdr->msg_controllen because the user is required to obtain the first + cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs + via CMSG_NXTHDR, setting lengths along the way. However, we don't yet + trust the value of __cmsg->cmsg_len and therefore do not use it in any + pointer arithmetic until we check its value. */ + + unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control; + unsigned char * __cmsg_ptr = (unsigned char *) __cmsg; + + size_t __size_needed = sizeof (struct cmsghdr) + + __CMSG_PADDING (__cmsg->cmsg_len); + + /* The current header is malformed, too small to be a full header. */ if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr)) - /* The kernel header does this so there may be a reason. */ return (struct cmsghdr *) 0; + /* There isn't enough space between __cmsg and the end of the buffer to + hold the current cmsg *and* the next one. */ + if (((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr) + < __size_needed) + || ((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr + - __size_needed) + < __cmsg->cmsg_len)) + + return (struct cmsghdr *) 0; + + /* Now, we trust cmsg_len and can use it to find the next header. 
*/ __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)); - if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control - + __mhdr->msg_controllen) - || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len) - > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen))) - /* No more entries. */ - return (struct cmsghdr *) 0; return __cmsg; } #endif /* Use `extern inline'. */ diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c index 3cbe07561..8373962e6 100644 --- a/sysdeps/mach/hurd/dl-sysdep.c +++ b/sysdeps/mach/hurd/dl-sysdep.c @@ -76,6 +76,7 @@ _dl_sysdep_start (void **start_argptr, { void go (intptr_t *argdata) { + char *orig_argv0; char **p; /* Cache the information in various global variables. */ @@ -84,6 +85,8 @@ _dl_sysdep_start (void **start_argptr, _environ = &_dl_argv[_dl_argc + 1]; for (p = _environ; *p++;); /* Skip environ pointers and terminator. */ + orig_argv0 = _dl_argv[0]; + if ((void *) p == _dl_argv[0]) { static struct hurd_startup_data nodata; @@ -173,30 +176,23 @@ _dl_sysdep_start (void **start_argptr, /* The call above might screw a few things up. - First of all, if _dl_skip_args is nonzero, we are ignoring - the first few arguments. However, if we have no Hurd startup - data, it is the magical convention that ARGV[0] == P. The + P is the location after the terminating NULL of the list of + environment variables. It has to point to the Hurd startup + data or if that's missing then P == ARGV[0] must hold. The startup code in init-first.c will get confused if this is not the case, so we must rearrange things to make it so. We'll - overwrite the origional ARGV[0] at P with ARGV[_dl_skip_args]. + recompute P and move the Hurd data or the new ARGV[0] there. - Secondly, if we need to be secure, it removes some dangerous - environment variables. 
If we have no Hurd startup date this - changes P (since that's the location after the terminating - NULL in the list of environment variables). We do the same - thing as in the first case but make sure we recalculate P. - If we do have Hurd startup data, we have to move the data - such that it starts just after the terminating NULL in the - environment list. + Note: directly invoked ld.so can move arguments and env vars. We use memmove, since the locations might overlap. */ - if (__libc_enable_secure || _dl_skip_args) - { - char **newp; - for (newp = _environ; *newp++;); + char **newp; + for (newp = _environ; *newp++;); - if (_dl_argv[-_dl_skip_args] == (char *) p) + if (newp != p || _dl_argv[0] != orig_argv0) + { + if (orig_argv0 == (char *) p) { if ((char *) newp != _dl_argv[0]) { diff --git a/sysdeps/mach/hurd/i386/init-first.c b/sysdeps/mach/hurd/i386/init-first.c index 1229b5911..534a796e0 100644 --- a/sysdeps/mach/hurd/i386/init-first.c +++ b/sysdeps/mach/hurd/i386/init-first.c @@ -38,10 +38,6 @@ extern void __init_misc (int, char **, char **); unsigned long int __hurd_threadvar_stack_offset; unsigned long int __hurd_threadvar_stack_mask; -#ifndef SHARED -int __libc_enable_secure; -#endif - extern int __libc_argc attribute_hidden; extern char **__libc_argv attribute_hidden; extern char **_dl_argv; diff --git a/sysdeps/nios2/dl-machine.h b/sysdeps/nios2/dl-machine.h index 80de6fd04..9a35cf416 100644 --- a/sysdeps/nios2/dl-machine.h +++ b/sysdeps/nios2/dl-machine.h @@ -128,53 +128,23 @@ _start:\n\ ldw r8, %call(_dl_nios2_get_gp_value)(r22)\n\ callr r8\n\ mov gp, r2\n\ -\n\ - /* Find the number of arguments to skip. */\n\ - ldw r8, %got(_dl_skip_args)(r22)\n\ - ldw r8, 0(r8)\n\ \n\ /* Find the main_map from the GOT. */\n\ ldw r4, %got(_rtld_local)(r22)\n\ ldw r4, 0(r4)\n\ \n\ - /* Find argc. */\n\ - ldw r5, 0(sp)\n\ - sub r5, r5, r8\n\ - stw r5, 0(sp)\n\ -\n\ - /* Find the first unskipped argument. 
*/\n\ - slli r8, r8, 2\n\ - addi r6, sp, 4\n\ - add r9, r6, r8\n\ - mov r10, r6\n\ -\n\ - /* Shuffle argv down. */\n\ -3: ldw r11, 0(r9)\n\ - stw r11, 0(r10)\n\ - addi r9, r9, 4\n\ - addi r10, r10, 4\n\ - bne r11, zero, 3b\n\ + /* Load adjusted argc. */\n\ + ldw r2, %got(_dl_argc)(r22)\n\ + ldw r5, 0(r2)\n\ \n\ - /* Shuffle envp down. */\n\ - mov r7, r10\n\ -4: ldw r11, 0(r9)\n\ - stw r11, 0(r10)\n\ - addi r9, r9, 4\n\ - addi r10, r10, 4\n\ - bne r11, zero, 4b\n\ -\n\ - /* Shuffle auxv down. */\n\ -5: ldw r11, 4(r9)\n\ - stw r11, 4(r10)\n\ - ldw r11, 0(r9)\n\ - stw r11, 0(r10)\n\ - addi r9, r9, 8\n\ - addi r10, r10, 8\n\ - bne r11, zero, 5b\n\ -\n\ - /* Update _dl_argv. */\n\ + /* Load adjusted argv. */\n\ ldw r2, %got(_dl_argv)(r22)\n\ - stw r6, 0(r2)\n\ + ldw r6, 0(r2)\n\ +\n\ + /* envp = argv + argc + 1 */\n\ + addi r7, r5, 1\n\ + slli r7, r7, 2\n\ + add r7, r7, r6\n\ \n\ /* Call _dl_init through the PLT. */\n\ ldw r8, %call(_dl_init)(r22)\n\ diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c index 1294c9181..53fba774a 100644 --- a/sysdeps/nptl/dl-tls_init_tp.c +++ b/sysdeps/nptl/dl-tls_init_tp.c @@ -128,7 +128,4 @@ __tls_init_tp (void) It will be bigger than it actually is, but for unwind.c/pt-longjmp.c purposes this is good enough. */ THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end); - - THREAD_SETMEM (pd, cancelstate, PTHREAD_CANCEL_ENABLE); - THREAD_SETMEM (pd, canceltype, PTHREAD_CANCEL_DEFERRED); } diff --git a/sysdeps/nptl/libc-lock.h b/sysdeps/nptl/libc-lock.h index 5af476c48..63b3f3d75 100644 --- a/sysdeps/nptl/libc-lock.h +++ b/sysdeps/nptl/libc-lock.h @@ -22,6 +22,7 @@ #include #define __need_NULL #include +#include /* Mutex type. 
*/ @@ -29,7 +30,12 @@ # if (!IS_IN (libc) && !IS_IN (libpthread)) || !defined _LIBC typedef struct { pthread_mutex_t mutex; } __libc_lock_recursive_t; # else -typedef struct { int lock; int cnt; void *owner; } __libc_lock_recursive_t; +typedef struct +{ + int lock __LIBC_LOCK_ALIGNMENT; + int cnt; + void *owner; +} __libc_lock_recursive_t; # endif #else typedef struct __libc_lock_recursive_opaque__ __libc_lock_recursive_t; diff --git a/sysdeps/nptl/libc-lockP.h b/sysdeps/nptl/libc-lockP.h index d3a6837fd..425f514c5 100644 --- a/sysdeps/nptl/libc-lockP.h +++ b/sysdeps/nptl/libc-lockP.h @@ -32,9 +32,10 @@ ld.so might be used on old kernels with a different libc.so. */ #include #include +#include /* Mutex type. */ -typedef int __libc_lock_t; +typedef int __libc_lock_t __LIBC_LOCK_ALIGNMENT; typedef struct { pthread_mutex_t mutex; } __rtld_lock_recursive_t; typedef pthread_rwlock_t __libc_rwlock_t; diff --git a/sysdeps/nptl/pthreadP.h b/sysdeps/nptl/pthreadP.h index 708bd9246..601db4ff2 100644 --- a/sysdeps/nptl/pthreadP.h +++ b/sysdeps/nptl/pthreadP.h @@ -275,7 +275,7 @@ __do_cancel (void) struct pthread *self = THREAD_SELF; /* Make sure we get no more cancellations. */ - THREAD_ATOMIC_BIT_SET (self, cancelhandling, EXITING_BIT); + atomic_bit_set (&self->cancelhandling, EXITING_BIT); __pthread_unwind ((__pthread_unwind_buf_t *) THREAD_GETMEM (self, cleanup_jmp_buf)); diff --git a/sysdeps/posix/fpathconf.c b/sysdeps/posix/fpathconf.c index 216f2a9c8..4b215e060 100644 --- a/sysdeps/posix/fpathconf.c +++ b/sysdeps/posix/fpathconf.c @@ -131,9 +131,9 @@ __fpathconf (int fd, int name) #ifdef _POSIX_ASYNC_IO { /* AIO is only allowed on regular files and block devices. */ - struct stat64 st; + struct __stat64_t64 st; - if (__fstat64 (fd, &st) < 0 + if (__fstat64_time64 (fd, &st) < 0 || (! S_ISREG (st.st_mode) && ! 
S_ISBLK (st.st_mode))) return -1; else diff --git a/sysdeps/posix/isfdtype.c b/sysdeps/posix/isfdtype.c index 192c7f9be..d26c14259 100644 --- a/sysdeps/posix/isfdtype.c +++ b/sysdeps/posix/isfdtype.c @@ -24,12 +24,12 @@ int isfdtype (int fildes, int fdtype) { - struct stat64 st; + struct __stat64_t64 st; int result; { int save_error = errno; - result = __fstat64 (fildes, &st); + result = __fstat64_time64 (fildes, &st); __set_errno (save_error); } diff --git a/sysdeps/posix/posix_fallocate.c b/sysdeps/posix/posix_fallocate.c index 037d32864..9720e71cc 100644 --- a/sysdeps/posix/posix_fallocate.c +++ b/sysdeps/posix/posix_fallocate.c @@ -30,7 +30,7 @@ int posix_fallocate (int fd, __off_t offset, __off_t len) { - struct stat64 st; + struct __stat64_t64 st; if (offset < 0 || len < 0) return EINVAL; @@ -48,7 +48,7 @@ posix_fallocate (int fd, __off_t offset, __off_t len) } /* We have to make sure that this is really a regular file. */ - if (__fstat64 (fd, &st) != 0) + if (__fstat64_time64 (fd, &st) != 0) return EBADF; if (S_ISFIFO (st.st_mode)) return ESPIPE; diff --git a/sysdeps/posix/posix_fallocate64.c b/sysdeps/posix/posix_fallocate64.c index a670ee0a3..bf984f7f9 100644 --- a/sysdeps/posix/posix_fallocate64.c +++ b/sysdeps/posix/posix_fallocate64.c @@ -30,7 +30,7 @@ int __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len) { - struct stat64 st; + struct __stat64_t64 st; if (offset < 0 || len < 0) return EINVAL; @@ -48,7 +48,7 @@ __posix_fallocate64_l64 (int fd, __off64_t offset, __off64_t len) } /* We have to make sure that this is really a regular file. 
*/ - if (__fstat64 (fd, &st) != 0) + if (__fstat64_time64 (fd, &st) != 0) return EBADF; if (S_ISFIFO (st.st_mode)) return ESPIPE; diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S index ae2316131..deb94671c 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S @@ -352,7 +352,7 @@ L(zero_padding_loop): cmpldi cr6,r5,16 /* Check if length was reached. */ ble cr6,L(zero_padding_end) - stxv v18,0(r11) + stxv 32+v18,0(r11) addi r11,r11,16 addi r5,r5,-16 @@ -360,7 +360,7 @@ L(zero_padding_loop): L(zero_padding_end): sldi r10,r5,56 /* stxvl wants size in top 8 bits */ - stxvl v18,r11,r10 /* Partial store */ + stxvl 32+v18,r11,r10 /* Partial store */ blr .align 4 diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile index c972bd819..3a505c5f9 100644 --- a/sysdeps/pthread/Makefile +++ b/sysdeps/pthread/Makefile @@ -69,6 +69,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \ tst-cancel12 tst-cancel13 tst-cancel14 tst-cancel15 tst-cancel16 \ tst-cancel18 tst-cancel19 tst-cancel20 tst-cancel21 \ tst-cancel22 tst-cancel23 tst-cancel26 tst-cancel27 tst-cancel28 \ + tst-cancel29 \ tst-cleanup0 tst-cleanup1 tst-cleanup2 tst-cleanup3 \ tst-clock1 \ tst-cond-except \ @@ -125,6 +126,7 @@ tests += tst-cnd-basic tst-mtx-trylock tst-cnd-broadcast \ tst-pthread-raise-blocked-self \ tst-pthread_kill-exited \ tst-pthread_kill-exiting \ + tst-cancel30 \ # tests tests-time64 := \ @@ -153,16 +155,36 @@ tests += tst-cancelx2 tst-cancelx3 tst-cancelx6 tst-cancelx8 tst-cancelx9 \ tst-cleanupx0 tst-cleanupx1 tst-cleanupx2 tst-cleanupx3 ifeq ($(build-shared),yes) -tests += tst-atfork2 tst-pt-tls4 tst-_res1 tst-fini1 tst-create1 +tests += \ + tst-atfork2 \ + tst-pt-tls4 \ + tst-_res1 \ + tst-fini1 \ + tst-create1 \ + tst-atfork3 \ + tst-atfork4 \ +# tests + tests-nolibpthread += tst-fini1 endif -modules-names += tst-atfork2mod tst-tls4moda tst-tls4modb \ - 
tst-_res1mod1 tst-_res1mod2 tst-fini1mod \ - tst-create1mod +modules-names += \ + tst-atfork2mod \ + tst-tls4moda \ + tst-tls4modb \ + tst-_res1mod1 \ + tst-_res1mod2 \ + tst-fini1mod \ + tst-create1mod \ + tst-atfork3mod \ + tst-atfork4mod \ +# module-names + test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names))) tst-atfork2mod.so-no-z-defs = yes +tst-atfork3mod.so-no-z-defs = yes +tst-atfork4mod.so-no-z-defs = yes tst-create1mod.so-no-z-defs = yes ifeq ($(build-shared),yes) @@ -225,8 +247,18 @@ tst-atfork2-ENV = MALLOC_TRACE=$(objpfx)tst-atfork2.mtrace \ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so $(objpfx)tst-atfork2mod.so: $(shared-thread-library) +$(objpfx)tst-atfork3: $(shared-thread-library) +LDFLAGS-tst-atfork3 = -rdynamic +$(objpfx)tst-atfork3mod.so: $(shared-thread-library) + +$(objpfx)tst-atfork4: $(shared-thread-library) +LDFLAGS-tst-atfork4 = -rdynamic +$(objpfx)tst-atfork4mod.so: $(shared-thread-library) + ifeq ($(build-shared),yes) $(objpfx)tst-atfork2.out: $(objpfx)tst-atfork2mod.so +$(objpfx)tst-atfork3.out: $(objpfx)tst-atfork3mod.so +$(objpfx)tst-atfork4.out: $(objpfx)tst-atfork4mod.so endif ifeq ($(build-shared),yes) diff --git a/sysdeps/pthread/tst-atfork3.c b/sysdeps/pthread/tst-atfork3.c new file mode 100644 index 000000000..bb2250e43 --- /dev/null +++ b/sysdeps/pthread/tst-atfork3.c @@ -0,0 +1,118 @@ +/* Check if pthread_atfork handler can call dlclose (BZ#24595). + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Check if pthread_atfork handlers do not deadlock when calling a function + that might alter the internal fork handle list, such as dlclose. + + The test registers a callback set with pthread_atfork(), dlopen() a shared + library (nptl/tst-atfork3mod.c), calls an exported symbol from the library + (which in turn also registers atfork handlers), and calls fork to trigger + the callbacks. */ + +static void *handler; +static bool run_dlclose_prepare; +static bool run_dlclose_parent; +static bool run_dlclose_child; + +static void +prepare (void) +{ + if (run_dlclose_prepare) + xdlclose (handler); +} + +static void +parent (void) +{ + if (run_dlclose_parent) + xdlclose (handler); +} + +static void +child (void) +{ + if (run_dlclose_child) + xdlclose (handler); +} + +static void +proc_func (void *closure) +{ +} + +static void +do_test_generic (bool dlclose_prepare, bool dlclose_parent, bool dlclose_child) +{ + run_dlclose_prepare = dlclose_prepare; + run_dlclose_parent = dlclose_parent; + run_dlclose_child = dlclose_child; + + handler = xdlopen ("tst-atfork3mod.so", RTLD_NOW); + + int (*atfork3mod_func)(void); + atfork3mod_func = xdlsym (handler, "atfork3mod_func"); + + atfork3mod_func (); + + struct support_capture_subprocess proc + = support_capture_subprocess (proc_func, NULL); + support_capture_subprocess_check (&proc, "tst-atfork3", 0, sc_allow_none); + + handler = atfork3mod_func = NULL; + + support_capture_subprocess_free (&proc); +} + +static void * +thread_func (void *closure) +{ + return NULL; +} + +static int +do_test (void) +{ + { + /* Make the process acts as multithread. 
*/ + pthread_attr_t attr; + xpthread_attr_init (&attr); + xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); + xpthread_create (&attr, thread_func, NULL); + } + + TEST_COMPARE (pthread_atfork (prepare, parent, child), 0); + + do_test_generic (true /* prepare */, false /* parent */, false /* child */); + do_test_generic (false /* prepare */, true /* parent */, false /* child */); + do_test_generic (false /* prepare */, false /* parent */, true /* child */); + + return 0; +} + +#include diff --git a/sysdeps/pthread/tst-atfork3mod.c b/sysdeps/pthread/tst-atfork3mod.c new file mode 100644 index 000000000..6d0658cb9 --- /dev/null +++ b/sysdeps/pthread/tst-atfork3mod.c @@ -0,0 +1,44 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include + +#include + +static void +mod_prepare (void) +{ +} + +static void +mod_parent (void) +{ +} + +static void +mod_child (void) +{ +} + +int atfork3mod_func (void) +{ + TEST_COMPARE (pthread_atfork (mod_prepare, mod_parent, mod_child), 0); + + return 0; +} diff --git a/sysdeps/pthread/tst-atfork4.c b/sysdeps/pthread/tst-atfork4.c new file mode 100644 index 000000000..52dc87e73 --- /dev/null +++ b/sysdeps/pthread/tst-atfork4.c @@ -0,0 +1,128 @@ +/* pthread_atfork supports handlers that call pthread_atfork or dlclose. 
+ Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void * +thread_func (void *x) +{ + return NULL; +} + +static unsigned int second_atfork_handler_runcount = 0; + +static void +second_atfork_handler (void) +{ + second_atfork_handler_runcount++; +} + +static void *h = NULL; + +static unsigned int atfork_handler_runcount = 0; + +static void +prepare (void) +{ + /* These atfork handlers are registered while atfork handlers are being + executed and thus will not be executed during the corresponding + fork. */ + TEST_VERIFY_EXIT (pthread_atfork (second_atfork_handler, + second_atfork_handler, + second_atfork_handler) == 0); + + /* This will de-register the atfork handlers registered by the dlopen'd + library and so they will not be executed. */ + if (h != NULL) + { + xdlclose (h); + h = NULL; + } + + atfork_handler_runcount++; +} + +static void +after (void) +{ + atfork_handler_runcount++; +} + +static int +do_test (void) +{ + /* Make sure __libc_single_threaded is 0. 
*/ + pthread_attr_t attr; + xpthread_attr_init (&attr); + xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); + xpthread_create (&attr, thread_func, NULL); + + void (*reg_atfork_handlers) (void); + + h = xdlopen ("tst-atfork4mod.so", RTLD_LAZY); + + reg_atfork_handlers = xdlsym (h, "reg_atfork_handlers"); + + reg_atfork_handlers (); + + /* We register our atfork handlers *after* loading the module so that our + prepare handler is called first at fork, where we then dlclose the + module before its prepare handler has a chance to be called. */ + TEST_VERIFY_EXIT (pthread_atfork (prepare, after, after) == 0); + + pid_t pid = xfork (); + + /* Both the parent and the child processes should observe this. */ + TEST_VERIFY_EXIT (atfork_handler_runcount == 2); + TEST_VERIFY_EXIT (second_atfork_handler_runcount == 0); + + if (pid > 0) + { + int childstat; + + xwaitpid (-1, &childstat, 0); + TEST_VERIFY_EXIT (WIFEXITED (childstat) + && WEXITSTATUS (childstat) == 0); + + /* This time, the second set of atfork handlers should also be called + since the handlers are already in place before fork is called. */ + + pid = xfork (); + + TEST_VERIFY_EXIT (atfork_handler_runcount == 4); + TEST_VERIFY_EXIT (second_atfork_handler_runcount == 2); + + if (pid > 0) + { + xwaitpid (-1, &childstat, 0); + TEST_VERIFY_EXIT (WIFEXITED (childstat) + && WEXITSTATUS (childstat) == 0); + } + } + + return 0; +} + +#include diff --git a/sysdeps/pthread/tst-atfork4mod.c b/sysdeps/pthread/tst-atfork4mod.c new file mode 100644 index 000000000..e111efeb1 --- /dev/null +++ b/sysdeps/pthread/tst-atfork4mod.c @@ -0,0 +1,48 @@ +/* pthread_atfork supports handlers that call pthread_atfork or dlclose. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +/* This dynamically loaded library simply registers its atfork handlers when + asked to. The atfork handlers should never be executed because the + library is unloaded before fork is called by the test program. */ + +static void +prepare (void) +{ + abort (); +} + +static void +parent (void) +{ + abort (); +} + +static void +child (void) +{ + abort (); +} + +void +reg_atfork_handlers (void) +{ + pthread_atfork (prepare, parent, child); +} diff --git a/sysdeps/pthread/tst-cancel29.c b/sysdeps/pthread/tst-cancel29.c new file mode 100644 index 000000000..4f0d99e00 --- /dev/null +++ b/sysdeps/pthread/tst-cancel29.c @@ -0,0 +1,207 @@ +/* Check if a thread that disables cancellation and which call functions + that might be interrupted by a signal do not see the internal SIGCANCEL. + + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* On Linux some interfaces are never restarted after being interrupted by + a signal handler, regardless of the use of SA_RESTART. It means that + if asynchronous cancellation is not enabled, the pthread_cancel can not + set the internal SIGCANCEL otherwise the interface might see a spurious + EINTR failure. */ + +static pthread_barrier_t b; + +/* Cleanup handling test. */ +static int cl_called; +static void +cl (void *arg) +{ + ++cl_called; +} + +static void * +tf_sigtimedwait (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + sigset_t mask; + sigemptyset (&mask); + r = sigtimedwait (&mask, NULL, &(struct timespec) { 0, 250000000 }); + if (r != -1) + return (void*) -1; + if (errno != EAGAIN) + return (void*) -2; + + pthread_cleanup_pop (0); + return NULL; +} + +static void * +tf_poll (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + r = poll (NULL, 0, 250); + if (r != 0) + return (void*) -1; + + pthread_cleanup_pop (0); + return NULL; +} + +static void * +tf_ppoll (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + r = ppoll (NULL, 0, &(struct timespec) { 0, 250000000 }, NULL); + if (r != 0) + return (void*) -1; + + pthread_cleanup_pop (0); + return NULL; +} + 
+static void * +tf_select (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + r = select (0, NULL, NULL, NULL, &(struct timeval) { 0, 250000 }); + if (r != 0) + return (void*) -1; + + pthread_cleanup_pop (0); + return NULL; +} + +static void * +tf_pselect (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + r = pselect (0, NULL, NULL, NULL, &(struct timespec) { 0, 250000000 }, NULL); + if (r != 0) + return (void*) -1; + + pthread_cleanup_pop (0); + return NULL; +} + +static void * +tf_clock_nanosleep (void *arg) +{ + pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, NULL); + xpthread_barrier_wait (&b); + + int r; + pthread_cleanup_push (cl, NULL); + + r = clock_nanosleep (CLOCK_REALTIME, 0, &(struct timespec) { 0, 250000000 }, + NULL); + if (r != 0) + return (void*) -1; + + pthread_cleanup_pop (0); + return NULL; +} + +struct cancel_test_t +{ + const char *name; + void * (*cf) (void *); +} tests[] = +{ + { "sigtimedwait", tf_sigtimedwait, }, + { "poll", tf_poll, }, + { "ppoll", tf_ppoll, }, + { "select", tf_select, }, + { "pselect", tf_pselect , }, + { "clock_nanosleep", tf_clock_nanosleep, }, +}; + +static int +do_test (void) +{ + for (int i = 0; i < array_length (tests); i++) + { + xpthread_barrier_init (&b, NULL, 2); + + cl_called = 0; + + pthread_t th = xpthread_create (NULL, tests[i].cf, NULL); + + xpthread_barrier_wait (&b); + + struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; + while (nanosleep (&ts, &ts) != 0) + continue; + + xpthread_cancel (th); + + void *status = xpthread_join (th); + if (status != NULL) + printf ("test '%s' failed: %" PRIdPTR "\n", tests[i].name, + (intptr_t) status); + TEST_VERIFY (status == NULL); + + xpthread_barrier_destroy (&b); + + TEST_COMPARE (cl_called, 0); + + printf ("in-time cancel test of '%s' successful\n", 
tests[i].name); + } + + return 0; +} + +#include diff --git a/sysdeps/pthread/tst-cancel30.c b/sysdeps/pthread/tst-cancel30.c new file mode 100644 index 000000000..e08392f96 --- /dev/null +++ b/sysdeps/pthread/tst-cancel30.c @@ -0,0 +1,82 @@ +/* Check if printf like functions does not disable asynchronous cancellation + mode (BZ#29214). + + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +static pthread_barrier_t b; + +static void * +tf (void *arg) +{ + int old; + + TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL), 0); + + TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0); + TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS); + + /* Check if internal lock cleanup routines restore the cancellation type + correctly. */ + printf ("...\n"); + TEST_COMPARE (pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, &old), 0); + TEST_COMPARE (old, PTHREAD_CANCEL_ASYNCHRONOUS); + + xpthread_barrier_wait (&b); + + /* Wait indefinitely for cancellation, which only works if asynchronous + cancellation is enabled. 
*/ +#ifdef SYS_pause + syscall (SYS_pause); +#elif defined SYS_ppoll || defined SYS_ppoll_time64 +# ifndef SYS_ppoll_time64 +# define SYS_ppoll_time64 SYS_ppoll +# endif + syscall (SYS_ppoll_time64, NULL, 0, NULL, NULL); +#else + for (;;); +#endif + + return 0; +} + +static int +do_test (void) +{ + xpthread_barrier_init (&b, NULL, 2); + + pthread_t th = xpthread_create (NULL, tf, NULL); + + xpthread_barrier_wait (&b); + + xpthread_cancel (th); + + void *status = xpthread_join (th); + TEST_VERIFY (status == PTHREAD_CANCELED); + + return 0; +} + +/* There is no need to wait full TIMEOUT if asynchronous is not working. */ +#define TIMEOUT 3 +#include diff --git a/sysdeps/riscv/rv64/rvd/libm-test-ulps b/sysdeps/riscv/rv64/rvd/libm-test-ulps index e28b21169..308568082 100644 --- a/sysdeps/riscv/rv64/rvd/libm-test-ulps +++ b/sysdeps/riscv/rv64/rvd/libm-test-ulps @@ -1077,7 +1077,7 @@ ldouble: 9 Function: "j0_upward": double: 9 -float: 8 +float: 9 ldouble: 7 Function: "j1": diff --git a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c index 2cdb3c8b5..f142221a1 100644 --- a/sysdeps/s390/dl-procinfo.c +++ b/sysdeps/s390/dl-procinfo.c @@ -63,11 +63,12 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[23][9] #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_platforms #else -PROCINFO_CLASS const char _dl_s390_platforms[10][7] +PROCINFO_CLASS const char _dl_s390_platforms[11][7] #endif #ifndef PROCINFO_DECL = { - "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15" + "g5", "z900", "z990", "z9-109", "z10", "z196", "zEC12", "z13", "z14", "z15", + "z16" } #endif #if !defined SHARED || defined PROCINFO_DECL diff --git a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h index 03d7e9437..1f4e3875b 100644 --- a/sysdeps/s390/dl-procinfo.h +++ b/sysdeps/s390/dl-procinfo.h @@ -22,7 +22,7 @@ #define _DL_HWCAP_COUNT 23 -#define _DL_PLATFORMS_COUNT 10 +#define _DL_PLATFORMS_COUNT 11 /* The kernel provides up to 32 capability bits with elf_hwcap. 
*/ #define _DL_FIRST_PLATFORM 32 diff --git a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile index e5da26871..66ed844e6 100644 --- a/sysdeps/s390/s390-64/Makefile +++ b/sysdeps/s390/s390-64/Makefile @@ -7,8 +7,11 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused CFLAGS-dl-load.c += -Wno-unused CFLAGS-dl-reloc.c += -Wno-unused -$(objpfx)tst-glibc-hwcaps: $(objpfx)libmarkermod2-1.so \ - $(objpfx)libmarkermod3-1.so $(objpfx)libmarkermod4-1.so +$(objpfx)tst-glibc-hwcaps: \ + $(objpfx)libmarkermod2-1.so \ + $(objpfx)libmarkermod3-1.so \ + $(objpfx)libmarkermod4-1.so \ + $(objpfx)libmarkermod5-1.so $(objpfx)tst-glibc-hwcaps.out: \ $(objpfx)libmarkermod2.so \ $(objpfx)glibc-hwcaps/z13/libmarkermod2.so \ @@ -19,6 +22,11 @@ $(objpfx)tst-glibc-hwcaps.out: \ $(objpfx)glibc-hwcaps/z13/libmarkermod4.so \ $(objpfx)glibc-hwcaps/z14/libmarkermod4.so \ $(objpfx)glibc-hwcaps/z15/libmarkermod4.so \ + $(objpfx)libmarkermod5.so \ + $(objpfx)glibc-hwcaps/z13/libmarkermod5.so \ + $(objpfx)glibc-hwcaps/z14/libmarkermod5.so \ + $(objpfx)glibc-hwcaps/z15/libmarkermod5.so \ + $(objpfx)glibc-hwcaps/z16/libmarkermod5.so $(objpfx)glibc-hwcaps/z13/libmarkermod2.so: $(objpfx)libmarkermod2-2.so $(make-target-directory) @@ -38,6 +46,19 @@ $(objpfx)glibc-hwcaps/z14/libmarkermod4.so: $(objpfx)libmarkermod4-3.so $(objpfx)glibc-hwcaps/z15/libmarkermod4.so: $(objpfx)libmarkermod4-4.so $(make-target-directory) cp $< $@ +$(objpfx)glibc-hwcaps/z13/libmarkermod5.so: $(objpfx)libmarkermod5-2.so + $(make-target-directory) + cp $< $@ +$(objpfx)glibc-hwcaps/z14/libmarkermod5.so: $(objpfx)libmarkermod5-3.so + $(make-target-directory) + cp $< $@ +$(objpfx)glibc-hwcaps/z15/libmarkermod5.so: $(objpfx)libmarkermod5-4.so + $(make-target-directory) + cp $< $@ +$(objpfx)glibc-hwcaps/z16/libmarkermod5.so: $(objpfx)libmarkermod5-5.so + $(make-target-directory) + cp $< $@ + ifeq (no,$(build-hardcoded-path-in-tests)) # This is an ld.so.cache test, and RPATH/RUNPATH in the executable diff --git 
a/sysdeps/s390/s390-64/configure b/sysdeps/s390/s390-64/configure new file mode 100644 index 000000000..101c570d2 --- /dev/null +++ b/sysdeps/s390/s390-64/configure @@ -0,0 +1,122 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! + # Local configure fragment for sysdeps/s390/s390-64. + +# Minimal checking for static PIE support in ld. +# Compare to ld testcase/bugzilla: +# /ld/testsuite/ld-elf/pr22263-1.rd +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for s390-specific static PIE requirements" >&5 +$as_echo_n "checking for s390-specific static PIE requirements... " >&6; } +if { as_var=\ +libc_cv_s390x_staticpie_req; eval \${$as_var+:} false; }; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest1.c < conftest2.c <&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } \ + && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } \ + && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } \ + && { ac_try='! readelf -Wr conftest | grep R_390_TLS_TPOFF' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; }; } + then + libc_cv_s390x_staticpie_req=yes + fi + rm -rf conftest.* +fi +eval ac_res=\$\ +libc_cv_s390x_staticpie_req + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +if test $libc_cv_s390x_staticpie_req = yes; then + # Static PIE is supported only on 64bit. + # Ensure you also have those patches for: + # - binutils (ld) + # - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE" + # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9 + # (Tested by configure check above) + # Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to + # be processed in _dl_relocate_static_pie() as static TLS map is not setup. + # - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie" + # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e + # (We can't test it in configure as we are not able to link a static PIE + # executable if the system glibc lacks static PIE support) + # Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries + # and the IFUNC symbols are not processed, which leads to crashes. 
+ # + # - kernel (the mentioned links to the commits belong to 5.19 merge window): + # - "s390/mmap: increase stack/mmap gap to 128MB" + # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2 + # - "s390/vdso: move vdso mapping to its own function" + # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8 + # - "s390/vdso: map vdso above stack" + # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033 + # - "s390/vdso: add vdso randomization" + # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25 + # (We can't test the kernel of the target system) + # Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0), + # static PIE executables like ldconfig will crash. While startup sbrk is + # used to enlarge the HEAP. Unfortunately the underlying brk syscall fails + # as there is not enough space after the HEAP. Then the address of the TLS + # image is invalid and the following memcpy in __libc_setup_tls() leads + # to a segfault. + # If /proc/sys/kernel/randomize_va_space is activated (default: 2), there + # is enough space after HEAP. 
+ # + # - glibc + # - "Linux: Define MMAP_CALL_INTERNAL" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb + # - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402 + # - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff + # - "ia64: Always define IA64_USE_NEW_STUB as a flag macro" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64 + # - "Linux: Implement a useful version of _startup_fatal" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31 + # - "Linux: Introduce __brk_call for invoking the brk system call" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa + # - "csu: Implement and use _dl_early_allocate during static startup" + # https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7 + # The mentioned patch series by Florian Weimer avoids the mentioned failing + # sbrk syscall by falling back to mmap. + $as_echo "#define SUPPORT_STATIC_PIE 1" >>confdefs.h + +fi diff --git a/sysdeps/s390/s390-64/configure.ac b/sysdeps/s390/s390-64/configure.ac new file mode 100644 index 000000000..2583a4a33 --- /dev/null +++ b/sysdeps/s390/s390-64/configure.ac @@ -0,0 +1,92 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/s390/s390-64. + +# Minimal checking for static PIE support in ld. 
+# Compare to ld testcase/bugzilla: +# /ld/testsuite/ld-elf/pr22263-1.rd +AC_CACHE_CHECK([for s390-specific static PIE requirements], \ +[libc_cv_s390x_staticpie_req], [dnl + cat > conftest1.c < conftest2.c <= 13 +# if GCCMACRO__ARCH__ >= 14 + if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_PDE2)) + _dl_fatal_printf ("\ +Fatal glibc error: CPU lacks VXRS_PDE2 support (z16 or later required)\n"); +# elif GCCMACRO__ARCH__ >= 13 if (!(GLRO(dl_hwcap) & HWCAP_S390_VXRS_EXT2)) _dl_fatal_printf ("\ Fatal glibc error: CPU lacks VXRS_EXT2 support (z15 or later required)\n"); diff --git a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c index 9447a6cf4..39f494815 100644 --- a/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c +++ b/sysdeps/s390/s390-64/dl-hwcaps-subdirs.c @@ -19,8 +19,8 @@ #include #include -const char _dl_hwcaps_subdirs[] = "z15:z14:z13"; -enum { subdirs_count = 3 }; /* Number of components in _dl_hwcaps_subdirs. */ +const char _dl_hwcaps_subdirs[] = "z16:z15:z14:z13"; +enum { subdirs_count = 4 }; /* Number of components in _dl_hwcaps_subdirs. */ uint32_t _dl_hwcaps_subdirs_active (void) @@ -50,5 +50,12 @@ _dl_hwcaps_subdirs_active (void) return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active); ++active; + /* z16. + Note: We do not list HWCAP_S390_NNPA here as, according to the Principles of + Operation, those instructions may be replaced or removed in future. */ + if (!(GLRO (dl_hwcap) & HWCAP_S390_VXRS_PDE2)) + return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active); + ++active; + return _dl_hwcaps_subdirs_build_bitmask (subdirs_count, active); } diff --git a/sysdeps/s390/s390-64/start.S b/sysdeps/s390/s390-64/start.S index 33c130204..f4c3f5131 100644 --- a/sysdeps/s390/s390-64/start.S +++ b/sysdeps/s390/s390-64/start.S @@ -84,10 +84,25 @@ _start: /* Ok, now branch to the libc main routine. */ #ifdef PIC +# ifdef SHARED + /* Used for dynamic linked position independent executable. 
+ => Scrt1.o */ larl %r2,main@GOTENT # load pointer to main lg %r2,0(%r2) +# else + /* Used for dynamic linked position dependent executable. + => crt1.o (glibc configured without --disable-default-pie: + PIC is defined) + Or for static linked position independent executable. + => rcrt1.o (only available if glibc configured without + --disable-default-pie: PIC is defined) */ + larl %r2,__wrap_main +# endif brasl %r14,__libc_start_main@plt #else + /* Used for dynamic/static linked position dependent executable. + => crt1.o (glibc configured with --disable-default-pie: + PIC and SHARED are not defined) */ larl %r2,main # load pointer to main brasl %r14,__libc_start_main #endif @@ -97,6 +112,19 @@ _start: cfi_endproc +#if defined PIC && !defined SHARED + /* When main is not defined in the executable but in a shared library + then a wrapper is needed in crt1.o of the static-pie enabled libc, + because crt1.o and rcrt1.o share code and the later must avoid the + use of GOT relocations before __libc_start_main is called. */ +__wrap_main: + cfi_startproc + larl %r1,main@GOTENT # load pointer to main + lg %r1,0(%r1) + br %r1 + cfi_endproc +#endif + /* Define a symbol for the first piece of initialized data. */ .data .globl __data_start diff --git a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c index cf3b765b5..a29891bdc 100644 --- a/sysdeps/s390/s390-64/tst-glibc-hwcaps.c +++ b/sysdeps/s390/s390-64/tst-glibc-hwcaps.c @@ -25,6 +25,7 @@ extern int marker2 (void); extern int marker3 (void); extern int marker4 (void); +extern int marker5 (void); /* Return the arch level, 10 for the baseline libmarkermod*.so's. */ static int @@ -63,9 +64,11 @@ compute_level (void) return 12; if (strcmp (platform, "z15") == 0) return 13; + if (strcmp (platform, "z16") == 0) + return 14; printf ("warning: unrecognized AT_PLATFORM value: %s\n", platform); - /* Assume that the new platform supports z15. 
*/ - return 13; + /* Assume that the new platform supports z16. */ + return 14; } static int @@ -76,6 +79,7 @@ do_test (void) TEST_COMPARE (marker2 (), MIN (level - 9, 2)); TEST_COMPARE (marker3 (), MIN (level - 9, 3)); TEST_COMPARE (marker4 (), MIN (level - 9, 4)); + TEST_COMPARE (marker5 (), MIN (level - 9, 5)); return 0; } diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index 7122f5597..e897f55f3 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -126,6 +126,7 @@ tests += tst-clone tst-clone2 tst-clone3 tst-fanotify tst-personality \ tst-prctl \ tst-scm_rights \ tst-epoll \ + tst-getauxval \ # tests # Test for the symbol version of fcntl that was replaced in glibc 2.28. diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h index 9905ebedf..4fcb6da80 100644 --- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h @@ -236,6 +236,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h index 03d57b9af..a1cf59270 100644 --- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h @@ -75,3 +75,5 @@ #define HWCAP2_BTI (1 << 17) #define HWCAP2_MTE (1 << 18) #define HWCAP2_ECV (1 << 19) +#define HWCAP2_AFP (1 << 20) +#define HWCAP2_RPRES (1 << 21) diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h index ee8085be6..0cf74c1a9 100644 --- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h @@ -391,6 +391,7 @@ #define __NR_sendmsg 114 #define __NR_sendto 133 #define __NR_set_mempolicy 431 +#define 
__NR_set_mempolicy_home_node 560 #define __NR_set_robust_list 466 #define __NR_set_tid_address 411 #define __NR_setdomainname 166 diff --git a/sysdeps/unix/sysv/linux/alpha/brk_call.h b/sysdeps/unix/sysv/linux/alpha/brk_call.h new file mode 100644 index 000000000..0b851b6c8 --- /dev/null +++ b/sysdeps/unix/sysv/linux/alpha/brk_call.h @@ -0,0 +1,27 @@ +/* Invoke the brk system call. Alpha version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +static inline void * +__brk_call (void *addr) +{ + unsigned long int result = INTERNAL_SYSCALL_CALL (brk, addr); + if (result == -ENOMEM) + /* Mimic the generic error reporting behavior. 
*/ + result = INTERNAL_SYSCALL_CALL (brk, 0); + return (void *) result; +} diff --git a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h index 81d90da09..fcec74323 100644 --- a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +++ b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h @@ -20,16 +20,8 @@ extern long __libc_alpha_cache_shape[4]; -#define DL_PLATFORM_AUXV \ - case AT_L1I_CACHESHAPE: \ - __libc_alpha_cache_shape[0] = av->a_un.a_val; \ - break; \ - case AT_L1D_CACHESHAPE: \ - __libc_alpha_cache_shape[1] = av->a_un.a_val; \ - break; \ - case AT_L2_CACHESHAPE: \ - __libc_alpha_cache_shape[2] = av->a_un.a_val; \ - break; \ - case AT_L3_CACHESHAPE: \ - __libc_alpha_cache_shape[3] = av->a_un.a_val; \ - break; +#define DL_PLATFORM_AUXV \ + __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \ + __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \ + __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE]; \ + __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE]; diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h index 1b626d977..c1207aaa1 100644 --- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h @@ -238,6 +238,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h index 96ef8db93..e7ba04c10 100644 --- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h @@ -302,6 +302,7 @@ #define __NR_sendmsg 296 #define __NR_sendto 290 #define __NR_set_mempolicy 321 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 338 #define __NR_set_tid_address 256 #define __NR_set_tls 983045 diff --git 
a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h index 79da9e759..1dd7356a1 100644 --- a/sysdeps/unix/sysv/linux/bits/socket.h +++ b/sysdeps/unix/sysv/linux/bits/socket.h @@ -169,6 +169,8 @@ typedef __socklen_t socklen_t; #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 +#define SOL_MPTCP 284 +#define SOL_MCTP 285 /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 4096 @@ -304,6 +306,12 @@ struct cmsghdr + CMSG_ALIGN (sizeof (struct cmsghdr))) #define CMSG_LEN(len) (CMSG_ALIGN (sizeof (struct cmsghdr)) + (len)) +/* Given a length, return the additional padding necessary such that + len + __CMSG_PADDING(len) == CMSG_ALIGN (len). */ +#define __CMSG_PADDING(len) ((sizeof (size_t) \ + - ((len) & (sizeof (size_t) - 1))) \ + & (sizeof (size_t) - 1)) + extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg) __THROW; #ifdef __USE_EXTERN_INLINES @@ -313,18 +321,38 @@ extern struct cmsghdr *__cmsg_nxthdr (struct msghdr *__mhdr, _EXTERN_INLINE struct cmsghdr * __NTH (__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg)) { + /* We may safely assume that __cmsg lies between __mhdr->msg_control and + __mhdr->msg_controllen because the user is required to obtain the first + cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs + via CMSG_NXTHDR, setting lengths along the way. However, we don't yet + trust the value of __cmsg->cmsg_len and therefore do not use it in any + pointer arithmetic until we check its value. */ + + unsigned char * __msg_control_ptr = (unsigned char *) __mhdr->msg_control; + unsigned char * __cmsg_ptr = (unsigned char *) __cmsg; + + size_t __size_needed = sizeof (struct cmsghdr) + + __CMSG_PADDING (__cmsg->cmsg_len); + + /* The current header is malformed, too small to be a full header. */ if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr)) - /* The kernel header does this so there may be a reason. 
*/ return (struct cmsghdr *) 0; + /* There isn't enough space between __cmsg and the end of the buffer to + hold the current cmsg *and* the next one. */ + if (((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr) + < __size_needed) + || ((size_t) + (__msg_control_ptr + __mhdr->msg_controllen - __cmsg_ptr + - __size_needed) + < __cmsg->cmsg_len)) + + return (struct cmsghdr *) 0; + + /* Now, we trust cmsg_len and can use it to find the next header. */ __cmsg = (struct cmsghdr *) ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len)); - if ((unsigned char *) (__cmsg + 1) > ((unsigned char *) __mhdr->msg_control - + __mhdr->msg_controllen) - || ((unsigned char *) __cmsg + CMSG_ALIGN (__cmsg->cmsg_len) - > ((unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen))) - /* No more entries. */ - return (struct cmsghdr *) 0; return __cmsg; } #endif /* Use `extern inline'. */ diff --git a/sysdeps/unix/sysv/linux/brk.c b/sysdeps/unix/sysv/linux/brk.c index abbabc9e8..9264a5a4a 100644 --- a/sysdeps/unix/sysv/linux/brk.c +++ b/sysdeps/unix/sysv/linux/brk.c @@ -19,6 +19,7 @@ #include #include #include +#include /* This must be initialized data because commons can't have aliases. */ void *__curbrk = 0; @@ -33,7 +34,7 @@ weak_alias (__curbrk, ___brk_addr) int __brk (void *addr) { - __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr); + __curbrk = __brk_call (addr); if (__curbrk < addr) { __set_errno (ENOMEM); diff --git a/sysdeps/unix/sysv/linux/brk_call.h b/sysdeps/unix/sysv/linux/brk_call.h new file mode 100644 index 000000000..72370c25d --- /dev/null +++ b/sysdeps/unix/sysv/linux/brk_call.h @@ -0,0 +1,25 @@ +/* Invoke the brk system call. Generic Linux version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +static inline void * +__brk_call (void *addr) +{ + /* The default implementation reports errors through an unchanged + break. */ + return (void *) INTERNAL_SYSCALL_CALL (brk, addr); +} diff --git a/sysdeps/unix/sysv/linux/cmsg_nxthdr.c b/sysdeps/unix/sysv/linux/cmsg_nxthdr.c index 15b7a3a92..24f72b797 100644 --- a/sysdeps/unix/sysv/linux/cmsg_nxthdr.c +++ b/sysdeps/unix/sysv/linux/cmsg_nxthdr.c @@ -23,18 +23,38 @@ struct cmsghdr * __cmsg_nxthdr (struct msghdr *mhdr, struct cmsghdr *cmsg) { + /* We may safely assume that cmsg lies between mhdr->msg_control and + mhdr->msg_controllen because the user is required to obtain the first + cmsg via CMSG_FIRSTHDR, set its length, then obtain subsequent cmsgs + via CMSG_NXTHDR, setting lengths along the way. However, we don't yet + trust the value of cmsg->cmsg_len and therefore do not use it in any + pointer arithmetic until we check its value. */ + + unsigned char * msg_control_ptr = (unsigned char *) mhdr->msg_control; + unsigned char * cmsg_ptr = (unsigned char *) cmsg; + + size_t size_needed = sizeof (struct cmsghdr) + + __CMSG_PADDING (cmsg->cmsg_len); + + /* The current header is malformed, too small to be a full header. */ if ((size_t) cmsg->cmsg_len < sizeof (struct cmsghdr)) - /* The kernel header does this so there may be a reason. 
*/ - return NULL; + return (struct cmsghdr *) 0; + + /* There isn't enough space between cmsg and the end of the buffer to + hold the current cmsg *and* the next one. */ + if (((size_t) + (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr) + < size_needed) + || ((size_t) + (msg_control_ptr + mhdr->msg_controllen - cmsg_ptr + - size_needed) + < cmsg->cmsg_len)) + + return (struct cmsghdr *) 0; + /* Now, we trust cmsg_len and can use it to find the next header. */ cmsg = (struct cmsghdr *) ((unsigned char *) cmsg + CMSG_ALIGN (cmsg->cmsg_len)); - if ((unsigned char *) (cmsg + 1) > ((unsigned char *) mhdr->msg_control - + mhdr->msg_controllen) - || ((unsigned char *) cmsg + CMSG_ALIGN (cmsg->cmsg_len) - > ((unsigned char *) mhdr->msg_control + mhdr->msg_controllen))) - /* No more entries. */ - return NULL; return cmsg; } libc_hidden_def (__cmsg_nxthdr) diff --git a/sysdeps/unix/sysv/linux/convert_scm_timestamps.c b/sysdeps/unix/sysv/linux/convert_scm_timestamps.c index 82171bf32..dfc8c2bef 100644 --- a/sysdeps/unix/sysv/linux/convert_scm_timestamps.c +++ b/sysdeps/unix/sysv/linux/convert_scm_timestamps.c @@ -16,9 +16,9 @@ License along with the GNU C Library; if not, see . 
*/ -#include +#include -#ifndef __ASSUME_TIME64_SYSCALLS +#if __TIMESIZE != 64 # include # include # include diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h index 96910154e..dc9383758 100644 --- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h @@ -250,6 +250,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_thread_area 244 #define __NR_set_tid_address 96 diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c new file mode 100644 index 000000000..52c538e85 --- /dev/null +++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c @@ -0,0 +1,82 @@ +/* Early memory allocation for the dynamic loader. Generic version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* Mark symbols hidden in static PIE for early self relocation to work. */ +#if BUILD_PIE_DEFAULT +# pragma GCC visibility push(hidden) +#endif +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* Defined in brk.c. 
*/ +extern void *__curbrk; + +void * +_dl_early_allocate (size_t size) +{ + void *result; + + if (__curbrk != NULL) + /* If the break has been initialized, brk must have run before, + so just call it once more. */ + { + result = __sbrk (size); + if (result == (void *) -1) + result = NULL; + } + else + { + /* If brk has not been invoked, there is no need to update + __curbrk. The first call to brk will take care of that. */ + void *previous = __brk_call (0); + result = __brk_call (previous + size); + if (result == previous) + result = NULL; + else + result = previous; + } + + /* If brk fails, fall back to mmap. This can happen due to + unfortunate ASLR layout decisions and kernel bugs, particularly + for static PIE. */ + if (result == NULL) + { + long int ret; + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#ifdef __NR_mmap2 + ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0); +#else + ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0); +#endif + if (INTERNAL_SYSCALL_ERROR_P (ret)) + result = NULL; + else + result = (void *) ret; + } + + return result; +} diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h new file mode 100644 index 000000000..bf9374371 --- /dev/null +++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h @@ -0,0 +1,61 @@ +/* Parse the Linux auxiliary vector. + Copyright (C) 1995-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include + +typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1]; + +/* Copy the auxiliary vector into AUXV_VALUES and set up GLRO + variables. */ +static inline +void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) +{ + auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT; + auxv_values[AT_PAGESZ] = EXEC_PAGESIZE; + auxv_values[AT_FPUCW] = _FPU_DEFAULT; + + /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ + _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), + "CONSTANT_MINSIGSTKSZ is constant"); + auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ; + + for (; av->a_type != AT_NULL; av++) + if (av->a_type <= AT_MINSIGSTKSZ) + auxv_values[av->a_type] = av->a_un.a_val; + + GLRO(dl_pagesize) = auxv_values[AT_PAGESZ]; + __libc_enable_secure = auxv_values[AT_SECURE]; + GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM]; + GLRO(dl_hwcap) = auxv_values[AT_HWCAP]; + GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2]; + GLRO(dl_clktck) = auxv_values[AT_CLKTCK]; + GLRO(dl_fpu_control) = auxv_values[AT_FPUCW]; + _dl_random = (void *) auxv_values[AT_RANDOM]; + GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ]; + GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR]; +#ifdef NEED_DL_SYSINFO + if (GLRO(dl_sysinfo_dso) != NULL) + GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO]; +#endif + + DL_PLATFORM_AUXV +} diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c index 66ba489cd..c90f109b1 100644 --- a/sysdeps/unix/sysv/linux/dl-sysdep.c +++ b/sysdeps/unix/sysv/linux/dl-sysdep.c @@ -16,35 +16,247 @@ License along with the GNU C Library; if not, see . */ -/* Linux needs some special initialization, but otherwise uses - the generic dynamic linker system interface code. 
*/ - -#include +#include <_itoa.h> +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include #include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include #ifdef SHARED -# define DL_SYSDEP_INIT frob_brk () +extern char **_environ attribute_hidden; +extern char _end[] attribute_hidden; + +/* Protect SUID program against misuse of file descriptors. */ +extern void __libc_check_standard_fds (void); -static inline void -frob_brk (void) +int __libc_enable_secure attribute_relro = 0; +rtld_hidden_data_def (__libc_enable_secure) +/* This variable contains the lowest stack address ever used. */ +void *__libc_stack_end attribute_relro = NULL; +rtld_hidden_data_def(__libc_stack_end) +void *_dl_random attribute_relro = NULL; + +#ifndef DL_STACK_END +# define DL_STACK_END(cookie) ((void *) (cookie)) +#endif + +/* Arguments passed to dl_main. */ +struct dl_main_arguments { - __brk (0); /* Initialize the break. */ + const ElfW(Phdr) *phdr; + ElfW(Word) phnum; + ElfW(Addr) user_entry; +}; + +/* Separate function, so that dl_main can be called without the large + array on the stack. */ +static void +_dl_sysdep_parse_arguments (void **start_argptr, + struct dl_main_arguments *args) +{ + _dl_argc = (intptr_t) *start_argptr; + _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */ + _environ = _dl_argv + _dl_argc + 1; + for (char **tmp = _environ; ; ++tmp) + if (*tmp == NULL) + { + /* Another necessary aliasing violation. 
*/ + GLRO(dl_auxv) = (ElfW(auxv_t) *) (tmp + 1); + break; + } + + dl_parse_auxv_t auxv_values = { 0, }; + _dl_parse_auxv (GLRO(dl_auxv), auxv_values); + + args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR]; + args->phnum = auxv_values[AT_PHNUM]; + args->user_entry = auxv_values[AT_ENTRY]; } -# include +ElfW(Addr) +_dl_sysdep_start (void **start_argptr, + void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, + ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) +{ + __libc_stack_end = DL_STACK_END (start_argptr); + + struct dl_main_arguments dl_main_args; + _dl_sysdep_parse_arguments (start_argptr, &dl_main_args); + + dl_hwcap_check (); + + __tunables_init (_environ); + + /* Initialize DSO sorting algorithm after tunables. */ + _dl_sort_maps_init (); + + __brk (0); /* Initialize the break. */ + +#ifdef DL_PLATFORM_INIT + DL_PLATFORM_INIT; #endif + /* Determine the length of the platform name. */ + if (GLRO(dl_platform) != NULL) + GLRO(dl_platformlen) = strlen (GLRO(dl_platform)); + + if (__sbrk (0) == _end) + /* The dynamic linker was run as a program, and so the initial break + starts just after our bss, at &_end. The malloc in dl-minimal.c + will consume the rest of this page, so tell the kernel to move the + break up that far. When the user program examines its break, it + will see this new value and not clobber our data. */ + __sbrk (GLRO(dl_pagesize) + - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1))); + + /* If this is a SUID program we make sure that FDs 0, 1, and 2 are + allocated. If necessary we are doing it ourself. If it is not + possible we stop the program. */ + if (__builtin_expect (__libc_enable_secure, 0)) + __libc_check_standard_fds (); + + (*dl_main) (dl_main_args.phdr, dl_main_args.phnum, + &dl_main_args.user_entry, GLRO(dl_auxv)); + return dl_main_args.user_entry; +} + +void +_dl_sysdep_start_cleanup (void) +{ +} + +void +_dl_show_auxv (void) +{ + char buf[64]; + ElfW(auxv_t) *av; + + /* Terminate string. 
*/ + buf[63] = '\0'; + + /* The following code assumes that the AT_* values are encoded + starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values + close by (otherwise the array will be too large). In case we have + to support a platform where these requirements are not fulfilled + some alternative implementation has to be used. */ + for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av) + { + static const struct + { + const char label[22]; + enum { unknown = 0, dec, hex, str, ignore } form : 8; + } auxvars[] = + { + [AT_EXECFD - 2] = { "EXECFD: ", dec }, + [AT_EXECFN - 2] = { "EXECFN: ", str }, + [AT_PHDR - 2] = { "PHDR: 0x", hex }, + [AT_PHENT - 2] = { "PHENT: ", dec }, + [AT_PHNUM - 2] = { "PHNUM: ", dec }, + [AT_PAGESZ - 2] = { "PAGESZ: ", dec }, + [AT_BASE - 2] = { "BASE: 0x", hex }, + [AT_FLAGS - 2] = { "FLAGS: 0x", hex }, + [AT_ENTRY - 2] = { "ENTRY: 0x", hex }, + [AT_NOTELF - 2] = { "NOTELF: ", hex }, + [AT_UID - 2] = { "UID: ", dec }, + [AT_EUID - 2] = { "EUID: ", dec }, + [AT_GID - 2] = { "GID: ", dec }, + [AT_EGID - 2] = { "EGID: ", dec }, + [AT_PLATFORM - 2] = { "PLATFORM: ", str }, + [AT_HWCAP - 2] = { "HWCAP: ", hex }, + [AT_CLKTCK - 2] = { "CLKTCK: ", dec }, + [AT_FPUCW - 2] = { "FPUCW: ", hex }, + [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex }, + [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex }, + [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex }, + [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore }, + [AT_SECURE - 2] = { "SECURE: ", dec }, + [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str }, + [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex }, + [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex }, + [AT_RANDOM - 2] = { "RANDOM: 0x", hex }, + [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex }, + [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec }, + [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec }, + [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex }, + [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec }, + [AT_L1D_CACHEGEOMETRY - 2] = { 
"L1D_CACHEGEOMETRY: 0x", hex }, + [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec }, + [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex }, + [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec }, + [AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex }, + }; + unsigned int idx = (unsigned int) (av->a_type - 2); + + if ((unsigned int) av->a_type < 2u + || (idx < sizeof (auxvars) / sizeof (auxvars[0]) + && auxvars[idx].form == ignore)) + continue; + + assert (AT_NULL == 0); + assert (AT_IGNORE == 1); + + /* Some entries are handled in a special way per platform. */ + if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0) + continue; + + if (idx < sizeof (auxvars) / sizeof (auxvars[0]) + && auxvars[idx].form != unknown) + { + const char *val = (char *) av->a_un.a_val; + + if (__builtin_expect (auxvars[idx].form, dec) == dec) + val = _itoa ((unsigned long int) av->a_un.a_val, + buf + sizeof buf - 1, 10, 0); + else if (__builtin_expect (auxvars[idx].form, hex) == hex) + val = _itoa ((unsigned long int) av->a_un.a_val, + buf + sizeof buf - 1, 16, 0); + + _dl_printf ("AT_%s%s\n", auxvars[idx].label, val); + + continue; + } + + /* Unknown value: print a generic line. */ + char buf2[17]; + buf2[sizeof (buf2) - 1] = '\0'; + const char *val2 = _itoa ((unsigned long int) av->a_un.a_val, + buf2 + sizeof buf2 - 1, 16, 0); + const char *val = _itoa ((unsigned long int) av->a_type, + buf + sizeof buf - 1, 16, 0); + _dl_printf ("AT_??? 
(0x%s): 0x%s\n", val, val2); + } +} + +#endif /* SHARED */ + int attribute_hidden _dl_discover_osversion (void) { -#if defined NEED_DL_SYSINFO_DSO && defined SHARED +#ifdef SHARED if (GLRO(dl_sysinfo_map) != NULL) { /* If the kernel-supplied DSO contains a note indicating the kernel's @@ -75,7 +287,7 @@ _dl_discover_osversion (void) } } } -#endif +#endif /* SHARED */ char bufmem[64]; char *buf = bufmem; diff --git a/sysdeps/unix/sysv/linux/faccessat.c b/sysdeps/unix/sysv/linux/faccessat.c index 59ee4b6f8..1378bb2db 100644 --- a/sysdeps/unix/sysv/linux/faccessat.c +++ b/sysdeps/unix/sysv/linux/faccessat.c @@ -39,8 +39,8 @@ __faccessat (int fd, const char *file, int mode, int flag) if ((flag == 0 || ((flag & ~AT_EACCESS) == 0 && ! __libc_enable_secure))) return INLINE_SYSCALL (faccessat, 3, fd, file, mode); - struct stat64 stats; - if (__fstatat64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW)) + struct __stat64_t64 stats; + if (__fstatat64_time64 (fd, file, &stats, flag & AT_SYMLINK_NOFOLLOW)) return -1; mode &= (X_OK | W_OK | R_OK); /* Clear any bogus bits. */ diff --git a/sysdeps/unix/sysv/linux/fchmodat.c b/sysdeps/unix/sysv/linux/fchmodat.c index 7aa073bf3..7aae02148 100644 --- a/sysdeps/unix/sysv/linux/fchmodat.c +++ b/sysdeps/unix/sysv/linux/fchmodat.c @@ -48,8 +48,8 @@ fchmodat (int fd, const char *file, mode_t mode, int flag) /* Use fstatat because fstat does not work on O_PATH descriptors before Linux 3.6. 
*/ - struct stat64 st; - if (__fstatat64 (pathfd, "", &st, AT_EMPTY_PATH) != 0) + struct __stat64_t64 st; + if (__fstatat64_time64 (pathfd, "", &st, AT_EMPTY_PATH) != 0) { __close_nocancel (pathfd); return -1; diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c index 4798cc337..d1ea074f0 100644 --- a/sysdeps/unix/sysv/linux/getsysstats.c +++ b/sysdeps/unix/sysv/linux/getsysstats.c @@ -44,15 +44,14 @@ __get_nprocs_sched (void) int r = INTERNAL_SYSCALL_CALL (sched_getaffinity, 0, cpu_bits_size, cpu_bits); if (r > 0) - return CPU_COUNT_S (cpu_bits_size, (cpu_set_t*) cpu_bits); + return CPU_COUNT_S (r, (cpu_set_t*) cpu_bits); else if (r == -EINVAL) /* The input buffer is still not enough to store the number of cpus. This is an arbitrary values assuming such systems should be rare and there is no offline cpus. */ return max_num_cpus; - /* Some other error. 2 is conservative (not a uniprocessor system, so - atomics are needed). */ - return 2; + /* Some other error. */ + return 0; } static char * @@ -108,22 +107,19 @@ next_line (int fd, char *const buffer, char **cp, char **re, } static int -get_nproc_stat (char *buffer, size_t buffer_size) +get_nproc_stat (void) { + enum { buffer_size = 1024 }; + char buffer[buffer_size]; char *buffer_end = buffer + buffer_size; char *cp = buffer_end; char *re = buffer_end; - - /* Default to an SMP system in case we cannot obtain an accurate - number. 
*/ - int result = 2; + int result = 0; const int flags = O_RDONLY | O_CLOEXEC; int fd = __open_nocancel ("/proc/stat", flags); if (fd != -1) { - result = 0; - char *l; while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL) /* The current format of /proc/stat has all the cpu* entries @@ -139,8 +135,8 @@ get_nproc_stat (char *buffer, size_t buffer_size) return result; } -int -__get_nprocs (void) +static int +get_nprocs_cpu_online (void) { enum { buffer_size = 1024 }; char buffer[buffer_size]; @@ -179,7 +175,8 @@ __get_nprocs (void) } } - result += m - n + 1; + if (m >= n) + result += m - n + 1; l = endp; if (l < re && *l == ',') @@ -188,28 +185,18 @@ __get_nprocs (void) while (l < re && *l != '\n'); __close_nocancel_nostatus (fd); - - if (result > 0) - return result; } - return get_nproc_stat (buffer, buffer_size); + return result; } -libc_hidden_def (__get_nprocs) -weak_alias (__get_nprocs, get_nprocs) - -/* On some architectures it is possible to distinguish between configured - and active cpus. */ -int -__get_nprocs_conf (void) +static int +get_nprocs_cpu (void) { - /* Try to use the sysfs filesystem. It has actual information about - online processors. */ + int count = 0; DIR *dir = __opendir ("/sys/devices/system/cpu"); if (dir != NULL) { - int count = 0; struct dirent64 *d; while ((d = __readdir64 (dir)) != NULL) @@ -224,12 +211,57 @@ __get_nprocs_conf (void) __closedir (dir); - return count; } + return count; +} - enum { buffer_size = 1024 }; - char buffer[buffer_size]; - return get_nproc_stat (buffer, buffer_size); +static int +get_nprocs_fallback (void) +{ + int result; + + /* Try /proc/stat first. */ + result = get_nproc_stat (); + if (result != 0) + return result; + + /* Try sched_getaffinity. */ + result = __get_nprocs_sched (); + if (result != 0) + return result; + + /* We failed to obtain an accurate number. Be conservative: return + the smallest number meaning that this is not a uniprocessor system, + so atomics are needed. 
*/ + return 2; +} + +int +__get_nprocs (void) +{ + /* Try /sys/devices/system/cpu/online first. */ + int result = get_nprocs_cpu_online (); + if (result != 0) + return result; + + /* Fall back to /proc/stat and sched_getaffinity. */ + return get_nprocs_fallback (); +} +libc_hidden_def (__get_nprocs) +weak_alias (__get_nprocs, get_nprocs) + +/* On some architectures it is possible to distinguish between configured + and active cpus. */ +int +__get_nprocs_conf (void) +{ + /* Try /sys/devices/system/cpu/ first. */ + int result = get_nprocs_cpu (); + if (result != 0) + return result; + + /* Fall back to /proc/stat and sched_getaffinity. */ + return get_nprocs_fallback (); } libc_hidden_def (__get_nprocs_conf) weak_alias (__get_nprocs_conf, get_nprocs_conf) diff --git a/sysdeps/unix/sysv/linux/glob64-time64.c b/sysdeps/unix/sysv/linux/glob64-time64.c index 32b4929a1..1d485dc80 100644 --- a/sysdeps/unix/sysv/linux/glob64-time64.c +++ b/sysdeps/unix/sysv/linux/glob64-time64.c @@ -37,6 +37,7 @@ # define GLOB_LSTAT gl_lstat # define GLOB_STAT64 __stat64_time64 # define GLOB_LSTAT64 __lstat64_time64 +# define GLOB_FSTATAT64 __fstatat64_time64 # define COMPILE_GLOB64 1 diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h index 36675fd48..767f1287a 100644 --- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h @@ -289,6 +289,7 @@ #define __NR_sendmsg 183 #define __NR_sendto 82 #define __NR_set_mempolicy 262 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 289 #define __NR_set_tid_address 237 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/hppa/getcontext.S b/sysdeps/unix/sysv/linux/hppa/getcontext.S index 1e73587f1..dcdf986f2 100644 --- a/sysdeps/unix/sysv/linux/hppa/getcontext.S +++ b/sysdeps/unix/sysv/linux/hppa/getcontext.S @@ -21,22 +21,28 @@ #include "ucontext_i.h" - /* Trampoline function. Non-standard calling ABI. 
*/ + /* Trampoline function. Non-standard calling ABI. */ /* Can not use ENTRY(__getcontext_ret) here. */ .type __getcontext_ret, @function .hidden __getcontext_ret __getcontext_ret: .proc .callinfo FRAME=0,NO_CALLS - /* r26-r23 contain original r3-r6, but because setcontext - does not reload r3-r6 (it's using them as temporaries) - we must save them elsewhere and swap them back in. */ - copy %r23, %r3 - copy %r24, %r4 - copy %r25, %r5 - copy %r26, %r6 - /* r20 contains original return pointer. */ - bv 0(%r20) + /* Because setcontext does not reload r3-r6 (it's using them + as temporaries), we must load them ourself. */ + ldw oR3(%r26), %r3 + ldw oR4(%r26), %r4 + ldw oR5(%r26), %r5 + ldw oR6(%r26), %r6 + + /* Also reload registers clobbered by $$dyncall. */ + ldw oR21(%r26), %r21 + ldw oR22(%r26), %r22 + ldw oR31(%r26), %r31 + + /* oR0 contains original return pointer. */ + ldw oR0(%r26), %rp + bv 0(%rp) copy %r0, %ret0 .procend .size __getcontext_ret, .-__getcontext_ret @@ -64,13 +70,13 @@ ENTRY(__getcontext) stw %r17, oR17(%r26) stw %r18, oR18(%r26) stw %r19, oR19(%r26) - /* stw %r20, oR20(%r26) - used for trampoline. */ + stw %r20, oR20(%r26) stw %r21, oR21(%r26) stw %r22, oR22(%r26) - /* stw %r23, oR23(%r26) - used for trampoline. */ - /* stw %r24, oR24(%r26) - used for trampoline. */ - /* stw %r25, oR25(%r26) - used for trampoline. */ - /* stw %r26, oR26(%r26) - used for trampoline. */ + stw %r23, oR23(%r26) + stw %r24, oR24(%r26) + stw %r25, oR25(%r26) + stw %r26, oR26(%r26) stw %r27, oR27(%r26) stw %r28, oR28(%r26) stw %r29, oR29(%r26) @@ -89,7 +95,10 @@ ENTRY(__getcontext) stw %r0, oIASQ1(%r26) stw %r0, oIAOQ0(%r26) stw %r0, oIAOQ1(%r26) - stw %r0, oSAR(%r26) /* used as flag in swapcontext(). */ + + /* Save SAR register. */ + mfctl %sar, %r1 + stw %r1, oSAR(%r26) /* MSB used as flag in swapcontext(). */ /* Store floating-point regs. 
*/ @@ -137,15 +146,12 @@ ENTRY(__getcontext) stw %r19, -32(%sp) .cfi_offset 19, 32 #endif + stw %ret1, -60(%sp) + .cfi_offset 29, 4 /* Set up the trampoline registers. - r20, r23, r24, r25, r26 and r2 are clobbered - by call to getcontext() anyway. Reuse them. */ - stw %r2, oR20(%r26) - stw %r3, oR23(%r26) - stw %r4, oR24(%r26) - stw %r5, oR25(%r26) - stw %r6, oR26(%r26) + Use oR0 context slot to save return value. */ + stw %r2, oR0(%r26) #ifdef PIC addil LT%__getcontext_ret, %r19 ldw RT%__getcontext_ret(%r1), %r1 @@ -167,6 +173,7 @@ ENTRY(__getcontext) #ifdef PIC ldw -32(%sp), %r19 #endif + ldw -60(%sp), %ret1 bv %r0(%r2) ldwm -64(%sp), %r4 END(__getcontext) diff --git a/sysdeps/unix/sysv/linux/hppa/setcontext.S b/sysdeps/unix/sysv/linux/hppa/setcontext.S index bc4872c8e..dfa794ad5 100644 --- a/sysdeps/unix/sysv/linux/hppa/setcontext.S +++ b/sysdeps/unix/sysv/linux/hppa/setcontext.S @@ -33,6 +33,8 @@ ENTRY(__setcontext) stw %r19, -32(%sp) .cfi_offset 19, 32 #endif + stw %ret1, -60(%sp) + .cfi_offset 29, 4 /* Save ucp. */ copy %r26, %r3 @@ -73,7 +75,7 @@ ENTRY(__setcontext) ldw oR18(%r3), %r18 ldw oR19(%r3), %r19 ldw oR20(%r3), %r20 - ldw oR21(%r3), %r21 + ldw oR21(%r3), %r21 /* maybe clobbered by dyncall */ /* ldw oR22(%r3), %r22 - dyncall arg. */ ldw oR23(%r3), %r23 ldw oR24(%r3), %r24 @@ -85,6 +87,10 @@ ENTRY(__setcontext) ldw oR30(%r3), %sp /* ldw oR31(%r3), %r31 - dyncall scratch register */ + /* Restore SAR register. */ + ldw oSAR(%r3), %r22 + mtsar %r22 + /* Restore floating-point registers. */ ldo oFPREGS31(%r3), %r22 fldds 0(%r22), %fr31 @@ -154,6 +160,7 @@ ENTRY(__setcontext) #ifdef PIC ldw -32(%r30), %r19 #endif + ldw -60(%r30), %ret1 bv %r0(%r2) ldwm -64(%r30), %r3 L(pseudo_end): diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.S b/sysdeps/unix/sysv/linux/hppa/swapcontext.S new file mode 100644 index 000000000..fbc22586d --- /dev/null +++ b/sysdeps/unix/sysv/linux/hppa/swapcontext.S @@ -0,0 +1,72 @@ +/* Swap to new context. 
+ Copyright (C) 2008-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include +#include "ucontext_i.h" + + .text +ENTRY(__swapcontext) + + /* Copy rp to ret0 (r28). */ + copy %rp,%ret0 + + /* Create a frame. */ + ldo 64(%sp),%sp + .cfi_def_cfa_offset -64 + + /* Save the current machine context to oucp. */ + bl __getcontext,%rp + + /* Copy oucp to register ret1 (r29). __getcontext saves and + restores it on a normal return. It is restored from oR29 + on reactivation. */ + copy %r26,%ret1 + + /* Pop frame. */ + ldo -64(%sp),%sp + .cfi_def_cfa_offset 0 + + /* Load return pointer from oR28. */ + ldw oR28(%ret1),%rp + + /* Return if error. */ + or,= %r0,%ret0,%r0 + bv,n %r0(%rp) + + /* Load sc_sar flag. */ + ldb oSAR(%ret1),%r20 + + /* Return if oucp context has been reactivated. */ + or,= %r0,%r20,%r0 + bv,n %r0(%rp) + + /* Mark sc_sar flag. */ + ldi 1,%r20 + stb %r20,oSAR(%ret1) + + /* Activate the machine context in ucp. */ + bl __setcontext,%rp + ldw oR25(%ret1),%r26 + + /* Load return pointer. 
*/ + ldw oR28(%ret1),%rp + bv,n %r0(%rp) + +END(__swapcontext) + +weak_alias (__swapcontext, swapcontext) diff --git a/sysdeps/unix/sysv/linux/hppa/swapcontext.c b/sysdeps/unix/sysv/linux/hppa/swapcontext.c deleted file mode 100644 index 5cbe00f1e..000000000 --- a/sysdeps/unix/sysv/linux/hppa/swapcontext.c +++ /dev/null @@ -1,41 +0,0 @@ -/* Swap to new context. - Copyright (C) 2008-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#include - -extern int __getcontext (ucontext_t *ucp); -extern int __setcontext (const ucontext_t *ucp); - -int -__swapcontext (ucontext_t *oucp, const ucontext_t *ucp) -{ - /* Save the current machine context to oucp. */ - __getcontext (oucp); - - /* mark sc_sar flag to skip the setcontext call on reactivation. */ - if (oucp->uc_mcontext.sc_sar == 0) { - oucp->uc_mcontext.sc_sar++; - - /* Restore the machine context in ucp. 
*/ - __setcontext (ucp); - } - - return 0; -} - -weak_alias (__swapcontext, swapcontext) diff --git a/sysdeps/unix/sysv/linux/i386/Makefile b/sysdeps/unix/sysv/linux/i386/Makefile index abd0009d5..e379a2e76 100644 --- a/sysdeps/unix/sysv/linux/i386/Makefile +++ b/sysdeps/unix/sysv/linux/i386/Makefile @@ -14,7 +14,7 @@ install-bin += lddlibc4 endif ifeq ($(subdir),io) -sysdep_routines += libc-do-syscall +sysdep_routines += libc-do-syscall libc-do-syscall-int80 endif ifeq ($(subdir),stdlib) diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h index c86ccbda4..1998f0d76 100644 --- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h @@ -323,6 +323,7 @@ #define __NR_sendmsg 370 #define __NR_sendto 369 #define __NR_set_mempolicy 276 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 311 #define __NR_set_thread_area 243 #define __NR_set_tid_address 258 diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S new file mode 100644 index 000000000..2c472f255 --- /dev/null +++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S @@ -0,0 +1,25 @@ +/* Out-of-line syscall stub for six-argument syscalls from C. For static PIE. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef SHARED +# define I386_USE_SYSENTER 0 +# include + +# define __libc_do_syscall __libc_do_syscall_int80 +# include "libc-do-syscall.S" +#endif diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S index 04154f43e..3eea5f3a5 100644 --- a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S +++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S @@ -18,8 +18,6 @@ #include -#ifndef OPTIMIZE_FOR_GCC_5 - /* %eax, %ecx, %edx and %esi contain the values expected by the kernel. %edi points to a structure with the values of %ebx, %edi and %ebp. */ @@ -50,4 +48,3 @@ ENTRY (__libc_do_syscall) cfi_restore (ebx) ret END (__libc_do_syscall) -#endif diff --git a/sysdeps/unix/sysv/linux/i386/startup.h b/sysdeps/unix/sysv/linux/i386/startup.h index aab8e26ca..213805d7d 100644 --- a/sysdeps/unix/sysv/linux/i386/startup.h +++ b/sysdeps/unix/sysv/linux/i386/startup.h @@ -1,5 +1,5 @@ /* Linux/i386 definitions of functions used by static libc main startup. - Copyright (C) 2017-2022 Free Software Foundation, Inc. + Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,46 +16,7 @@ License along with the GNU C Library; if not, see . */ -#if BUILD_PIE_DEFAULT -/* Can't use "call *%gs:SYSINFO_OFFSET" during statup in static PIE. */ -# define I386_USE_SYSENTER 0 +/* Can't use "call *%gs:SYSINFO_OFFSET" during startup. */ +#define I386_USE_SYSENTER 0 -# include -# include - -__attribute__ ((__noreturn__)) -static inline void -_startup_fatal (const char *message __attribute__ ((unused))) -{ - /* This is only called very early during startup in static PIE. - FIXME: How can it be improved? 
*/ - ABORT_INSTRUCTION; - __builtin_unreachable (); -} - -static inline uid_t -startup_getuid (void) -{ - return (uid_t) INTERNAL_SYSCALL_CALL (getuid32); -} - -static inline uid_t -startup_geteuid (void) -{ - return (uid_t) INTERNAL_SYSCALL_CALL (geteuid32); -} - -static inline gid_t -startup_getgid (void) -{ - return (gid_t) INTERNAL_SYSCALL_CALL (getgid32); -} - -static inline gid_t -startup_getegid (void) -{ - return (gid_t) INTERNAL_SYSCALL_CALL (getegid32); -} -#else -# include_next -#endif +#include_next diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h index 4558ab66c..7085f7e19 100644 --- a/sysdeps/unix/sysv/linux/i386/sysdep.h +++ b/sysdeps/unix/sysv/linux/i386/sysdep.h @@ -42,6 +42,15 @@ # endif #endif +#if !I386_USE_SYSENTER && IS_IN (libc) && !defined SHARED +/* Inside static libc, we have two versions. For compilation units + with !I386_USE_SYSENTER, the vDSO entry mechanism cannot be + used. */ +# define I386_DO_SYSCALL_STRING "__libc_do_syscall_int80" +#else +# define I386_DO_SYSCALL_STRING "__libc_do_syscall" +#endif + #ifdef __ASSEMBLER__ /* Linux uses a negative return value to indicate syscall errors, @@ -301,7 +310,7 @@ struct libc_do_syscall_args }; \ asm volatile ( \ "movl %1, %%eax\n\t" \ - "call __libc_do_syscall" \ + "call " I386_DO_SYSCALL_STRING \ : "=a" (resultvar) \ : "i" (__NR_##name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \ : "memory", "cc") @@ -315,7 +324,7 @@ struct libc_do_syscall_args }; \ asm volatile ( \ "movl %1, %%eax\n\t" \ - "call __libc_do_syscall" \ + "call " I386_DO_SYSCALL_STRING \ : "=a" (resultvar) \ : "a" (name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \ : "memory", "cc") diff --git a/sysdeps/unix/sysv/linux/ia64/Makefile b/sysdeps/unix/sysv/linux/ia64/Makefile index da85ba43e..c5cc41b36 100644 --- a/sysdeps/unix/sysv/linux/ia64/Makefile +++ b/sysdeps/unix/sysv/linux/ia64/Makefile @@ -1,3 +1,9 @@ +ifeq ($(subdir),elf) +# ia64 does not support PT_GNU_RELRO. 
+test-xfail-tst-relro-ldso = yes +test-xfail-tst-relro-libc = yes +endif + ifeq ($(subdir),misc) sysdep_headers += sys/rse.h endif diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h index d898bce40..b2eab1b93 100644 --- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h @@ -272,6 +272,7 @@ #define __NR_sendmsg 1205 #define __NR_sendto 1199 #define __NR_set_mempolicy 1261 +#define __NR_set_mempolicy_home_node 1474 #define __NR_set_robust_list 1298 #define __NR_set_tid_address 1233 #define __NR_setdomainname 1129 diff --git a/sysdeps/unix/sysv/linux/ia64/brk.c b/sysdeps/unix/sysv/linux/ia64/brk.c index 65142aeae..d2135b74f 100644 --- a/sysdeps/unix/sysv/linux/ia64/brk.c +++ b/sysdeps/unix/sysv/linux/ia64/brk.c @@ -16,7 +16,6 @@ License along with the GNU C Library; if not, see . */ -#include -/* brk is used by statup before TCB is properly set. */ -#undef USE_DL_SYSINFO +/* brk is used by startup before TCB is properly set up. */ +#define IA64_USE_NEW_STUB 0 #include diff --git a/sysdeps/unix/sysv/linux/ia64/startup.h b/sysdeps/unix/sysv/linux/ia64/startup.h new file mode 100644 index 000000000..77f29f15a --- /dev/null +++ b/sysdeps/unix/sysv/linux/ia64/startup.h @@ -0,0 +1,22 @@ +/* Linux/ia64 definitions of functions used by static libc main startup. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* This code is used before the TCB is set up. */ +#define IA64_USE_NEW_STUB 0 + +#include_next diff --git a/sysdeps/unix/sysv/linux/ia64/sysdep.h b/sysdeps/unix/sysv/linux/ia64/sysdep.h index 193ecee02..14adbdf4f 100644 --- a/sysdeps/unix/sysv/linux/ia64/sysdep.h +++ b/sysdeps/unix/sysv/linux/ia64/sysdep.h @@ -44,12 +44,15 @@ #undef SYS_ify #define SYS_ify(syscall_name) __NR_##syscall_name -#if defined USE_DL_SYSINFO \ - && (IS_IN (libc) \ - || IS_IN (libpthread) || IS_IN (librt)) -# define IA64_USE_NEW_STUB -#else -# undef IA64_USE_NEW_STUB +#ifndef IA64_USE_NEW_STUB +# if defined USE_DL_SYSINFO && IS_IN (libc) +# define IA64_USE_NEW_STUB 1 +# else +# define IA64_USE_NEW_STUB 0 +# endif +#endif +#if IA64_USE_NEW_STUB && !USE_DL_SYSINFO +# error IA64_USE_NEW_STUB needs USE_DL_SYSINFO #endif #ifdef __ASSEMBLER__ @@ -101,7 +104,7 @@ mov r15=num; \ break __IA64_BREAK_SYSCALL -#ifdef IA64_USE_NEW_STUB +#if IA64_USE_NEW_STUB # ifdef SHARED # define DO_CALL(num) \ .prologue; \ @@ -185,7 +188,7 @@ (non-negative) errno on error or the return value on success. */ -#ifdef IA64_USE_NEW_STUB +#if IA64_USE_NEW_STUB # define INTERNAL_SYSCALL_NCS(name, nr, args...) \ ({ \ @@ -277,7 +280,7 @@ #define ASM_OUTARGS_5 ASM_OUTARGS_4, "=r" (_out4) #define ASM_OUTARGS_6 ASM_OUTARGS_5, "=r" (_out5) -#ifdef IA64_USE_NEW_STUB +#if IA64_USE_NEW_STUB #define ASM_ARGS_0 #define ASM_ARGS_1 ASM_ARGS_0, "4" (_out0) #define ASM_ARGS_2 ASM_ARGS_1, "5" (_out1) @@ -313,7 +316,7 @@ /* Branch registers. 
*/ \ "b6" -#ifdef IA64_USE_NEW_STUB +#if IA64_USE_NEW_STUB # define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON #else # define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON , "b7" diff --git a/sysdeps/unix/sysv/linux/ldsodefs.h b/sysdeps/unix/sysv/linux/ldsodefs.h index 011756ddc..af108991f 100644 --- a/sysdeps/unix/sysv/linux/ldsodefs.h +++ b/sysdeps/unix/sysv/linux/ldsodefs.h @@ -24,16 +24,4 @@ /* Get the real definitions. */ #include_next -/* We can assume that the kernel always provides the AT_UID, AT_EUID, - AT_GID, and AT_EGID values in the auxiliary vector from 2.4.0 or so on. */ -#define HAVE_AUX_XID - -/* We can assume that the kernel always provides the AT_SECURE value - in the auxiliary vector from 2.5.74 or so on. */ -#define HAVE_AUX_SECURE - -/* Starting with one of the 2.4.0 pre-releases the Linux kernel passes - up the page size information. */ -#define HAVE_AUX_PAGESIZE - #endif /* ldsodefs.h */ diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h index fe721b809..5fc372377 100644 --- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h @@ -310,6 +310,7 @@ #define __NR_sendmsg 367 #define __NR_sendto 366 #define __NR_set_mempolicy 270 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 304 #define __NR_set_thread_area 334 #define __NR_set_tid_address 253 diff --git a/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h b/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h new file mode 100644 index 000000000..1844bbaf6 --- /dev/null +++ b/sysdeps/unix/sysv/linux/m68k/libc-lock-arch.h @@ -0,0 +1,25 @@ +/* Private libc-internal arch-specific definitions. m68k version. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see . */ + +#ifndef _LIBC_LOCK_ARCH_H +#define _LIBC_LOCK_ARCH_H + +/* Linux enforces 4-bytes alignment on futex inputs. */ +#define __LIBC_LOCK_ALIGNMENT __attribute__ ((__aligned__ (4))) + +#endif diff --git a/sysdeps/unix/sysv/linux/m68k/sysdep.h b/sysdeps/unix/sysv/linux/m68k/sysdep.h index 628e1be83..d87892a37 100644 --- a/sysdeps/unix/sysv/linux/m68k/sysdep.h +++ b/sysdeps/unix/sysv/linux/m68k/sysdep.h @@ -299,8 +299,6 @@ SYSCALL_ERROR_LABEL: \ #define PTR_MANGLE(var) (void) (var) #define PTR_DEMANGLE(var) (void) (var) -#if defined NEED_DL_SYSINFO || defined NEED_DL_SYSINFO_DSO /* M68K needs system-supplied DSO to access TLS helpers even when statically linked. 
*/ -# define NEED_STATIC_SYSINFO_DSO 1 -#endif +#define NEED_STATIC_SYSINFO_DSO 1 diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h index 6e10c3661..b6e9b007e 100644 --- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h @@ -326,6 +326,7 @@ #define __NR_sendmsg 360 #define __NR_sendto 353 #define __NR_set_mempolicy 276 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 311 #define __NR_set_thread_area 243 #define __NR_set_tid_address 258 diff --git a/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h b/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h index 7747b3e47..71594e456 100644 --- a/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h +++ b/sysdeps/unix/sysv/linux/mips/bits/struct_stat.h @@ -131,27 +131,30 @@ struct stat64 struct stat { +# ifdef __USE_TIME_BITS64 +# include +# else __dev_t st_dev; int st_pad1[3]; /* Reserved for st_dev expansion */ -# ifndef __USE_FILE_OFFSET64 +# ifndef __USE_FILE_OFFSET64 __ino_t st_ino; -# else +# else __ino64_t st_ino; -# endif +# endif __mode_t st_mode; __nlink_t st_nlink; __uid_t st_uid; __gid_t st_gid; __dev_t st_rdev; -# if !defined __USE_FILE_OFFSET64 +# if !defined __USE_FILE_OFFSET64 unsigned int st_pad2[2]; /* Reserved for st_rdev expansion */ __off_t st_size; int st_pad3; -# else +# else unsigned int st_pad2[3]; /* Reserved for st_rdev expansion */ __off64_t st_size; -# endif -# ifdef __USE_XOPEN2K8 +# endif +# ifdef __USE_XOPEN2K8 /* Nanosecond resolution timestamps are stored in a format equivalent to 'struct timespec'. This is the type used whenever possible but the Unix namespace rules do not allow the @@ -161,30 +164,34 @@ struct stat struct timespec st_atim; /* Time of last access. */ struct timespec st_mtim; /* Time of last modification. */ struct timespec st_ctim; /* Time of last status change. */ -# define st_atime st_atim.tv_sec /* Backward compatibility. 
*/ -# define st_mtime st_mtim.tv_sec -# define st_ctime st_ctim.tv_sec -# else +# define st_atime st_atim.tv_sec /* Backward compatibility. */ +# define st_mtime st_mtim.tv_sec +# define st_ctime st_ctim.tv_sec +# else __time_t st_atime; /* Time of last access. */ unsigned long int st_atimensec; /* Nscecs of last access. */ __time_t st_mtime; /* Time of last modification. */ unsigned long int st_mtimensec; /* Nsecs of last modification. */ __time_t st_ctime; /* Time of last status change. */ unsigned long int st_ctimensec; /* Nsecs of last status change. */ -# endif +# endif __blksize_t st_blksize; unsigned int st_pad4; -# ifndef __USE_FILE_OFFSET64 +# ifndef __USE_FILE_OFFSET64 __blkcnt_t st_blocks; -# else +# else __blkcnt64_t st_blocks; -# endif +# endif int st_pad5[14]; +# endif }; #ifdef __USE_LARGEFILE64 struct stat64 { +# ifdef __USE_TIME_BITS64 +# include +# else __dev_t st_dev; unsigned int st_pad1[3]; /* Reserved for st_dev expansion */ __ino64_t st_ino; @@ -217,6 +224,7 @@ struct stat64 unsigned int st_pad3; __blkcnt64_t st_blocks; int st_pad4[14]; +# endif /* __USE_TIME_BITS64 */ }; #endif diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h index 26a6d594a..b3a3871f8 100644 --- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h @@ -308,6 +308,7 @@ #define __NR_sendmsg 4179 #define __NR_sendto 4180 #define __NR_set_mempolicy 4270 +#define __NR_set_mempolicy_home_node 4450 #define __NR_set_robust_list 4309 #define __NR_set_thread_area 4283 #define __NR_set_tid_address 4252 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h index 83e0d49c5..b46218272 100644 --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h @@ -288,6 +288,7 @@ #define __NR_sendmsg 6045 #define __NR_sendto 6043 #define 
__NR_set_mempolicy 6233 +#define __NR_set_mempolicy_home_node 6450 #define __NR_set_robust_list 6272 #define __NR_set_thread_area 6246 #define __NR_set_tid_address 6213 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h index d6747c542..a9d6b9457 100644 --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h @@ -270,6 +270,7 @@ #define __NR_sendmsg 5045 #define __NR_sendto 5043 #define __NR_set_mempolicy 5229 +#define __NR_set_mempolicy_home_node 5450 #define __NR_set_robust_list 5268 #define __NR_set_thread_area 5242 #define __NR_set_tid_address 5212 diff --git a/sysdeps/unix/sysv/linux/mmap_call.h b/sysdeps/unix/sysv/linux/mmap_call.h new file mode 100644 index 000000000..3547c99e1 --- /dev/null +++ b/sysdeps/unix/sysv/linux/mmap_call.h @@ -0,0 +1,22 @@ +/* Generic definition of MMAP_CALL and MMAP_CALL_INTERNAL. + Copyright (C) 2017-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ + INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) +#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ + INTERNAL_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) diff --git a/sysdeps/unix/sysv/linux/mmap_internal.h b/sysdeps/unix/sysv/linux/mmap_internal.h index 841b73139..aebf97d06 100644 --- a/sysdeps/unix/sysv/linux/mmap_internal.h +++ b/sysdeps/unix/sysv/linux/mmap_internal.h @@ -42,10 +42,6 @@ static uint64_t page_unit; /* Do not accept offset not multiple of page size. */ #define MMAP_OFF_LOW_MASK (MMAP2_PAGE_UNIT - 1) -/* An architecture may override this. */ -#ifndef MMAP_CALL -# define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ - INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) -#endif +#include #endif /* MMAP_INTERNAL_LINUX_H */ diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c index 834cd7a48..5bf1e0a83 100644 --- a/sysdeps/unix/sysv/linux/mq_timedreceive.c +++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c @@ -41,7 +41,7 @@ ___mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len { int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len, msg_prio, abs_timeout); - if (r == 0 || errno != ENOSYS) + if (r >= 0 || errno != ENOSYS) return r; __set_errno (EOVERFLOW); return -1; diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h index 4ee209bc4..809a219ef 100644 --- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h @@ -250,6 +250,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git 
a/sysdeps/unix/sysv/linux/or1k/arch-syscall.h b/sysdeps/unix/sysv/linux/or1k/arch-syscall.h index ddeaeceeb..1364f4cbc 100644 --- a/sysdeps/unix/sysv/linux/or1k/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/or1k/arch-syscall.h @@ -251,6 +251,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git a/sysdeps/unix/sysv/linux/pathconf.c b/sysdeps/unix/sysv/linux/pathconf.c index 107cf9878..dc6864852 100644 --- a/sysdeps/unix/sysv/linux/pathconf.c +++ b/sysdeps/unix/sysv/linux/pathconf.c @@ -110,8 +110,8 @@ distinguish_extX (const struct statfs *fsbuf, const char *file, int fd) && strcmp (mntbuf.mnt_type, "ext4") != 0) continue; - struct stat64 fsst; - if (__stat64 (mntbuf.mnt_dir, &fsst) >= 0 + struct __stat64_t64 fsst; + if (__stat64_time64 (mntbuf.mnt_dir, &fsst) >= 0 && st.st_dev == fsst.st_dev) { if (strcmp (mntbuf.mnt_type, "ext4") == 0) diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h index 594371940..ce2281cf1 100644 --- a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +++ b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h @@ -16,15 +16,5 @@ License along with the GNU C Library; if not, see . */ -#include - -#if IS_IN (libc) && !defined SHARED -int GLRO(dl_cache_line_size); -#endif - -/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it - to dl_cache_line_size. 
*/ -#define DL_PLATFORM_AUXV \ - case AT_DCACHEBSIZE: \ - GLRO(dl_cache_line_size) = av->a_un.a_val; \ - break; +#define DL_PLATFORM_AUXV \ + GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE]; diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-support.c b/sysdeps/unix/sysv/linux/powerpc/dl-support.c new file mode 100644 index 000000000..abe68a704 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/dl-support.c @@ -0,0 +1,4 @@ +#include + +/* Populated from the auxiliary vector. */ +int _dl_cache_line_size; diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h index 497299fbc..627831eba 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h @@ -319,6 +319,7 @@ #define __NR_sendmsg 341 #define __NR_sendto 335 #define __NR_set_mempolicy 261 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 300 #define __NR_set_tid_address 232 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h index e840279f1..bae597199 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h @@ -298,6 +298,7 @@ #define __NR_sendmsg 341 #define __NR_sendto 335 #define __NR_set_mempolicy 261 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 300 #define __NR_set_tid_address 232 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h index 73ef74c00..202520ee2 100644 --- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h @@ -122,6 +122,7 @@ #define __NR_mbind 235 #define __NR_membarrier 283 #define __NR_memfd_create 279 +#define __NR_memfd_secret 447 #define __NR_migrate_pages 238 
#define __NR_mincore 232 #define __NR_mkdirat 34 @@ -228,6 +229,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h index 919a79ee9..4e65f337d 100644 --- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h @@ -127,6 +127,7 @@ #define __NR_mbind 235 #define __NR_membarrier 283 #define __NR_memfd_create 279 +#define __NR_memfd_secret 447 #define __NR_migrate_pages 238 #define __NR_mincore 232 #define __NR_mkdirat 34 @@ -235,6 +236,7 @@ #define __NR_sendmsg 211 #define __NR_sendto 206 #define __NR_set_mempolicy 237 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 99 #define __NR_set_tid_address 96 #define __NR_setdomainname 162 diff --git a/sysdeps/unix/sysv/linux/s390/mmap_internal.h b/sysdeps/unix/sysv/linux/s390/mmap_call.h similarity index 78% rename from sysdeps/unix/sysv/linux/s390/mmap_internal.h rename to sysdeps/unix/sysv/linux/s390/mmap_call.h index cc76ac973..f169b8bab 100644 --- a/sysdeps/unix/sysv/linux/s390/mmap_internal.h +++ b/sysdeps/unix/sysv/linux/s390/mmap_call.h @@ -16,9 +16,6 @@ License along with the GNU C Library; if not, see . 
*/ -#ifndef MMAP_S390_INTERNAL_H -# define MMAP_S390_INTERNAL_H - #define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ ({ \ long int __args[6] = { (long int) (__addr), (long int) (__len), \ @@ -26,7 +23,10 @@ (long int) (__fd), (long int) (__offset) }; \ INLINE_SYSCALL_CALL (__nr, __args); \ }) - -#include_next - -#endif +#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ + ({ \ + long int __args[6] = { (long int) (__addr), (long int) (__len), \ + (long int) (__prot), (long int) (__flags), \ + (long int) (__fd), (long int) (__offset) }; \ + INTERNAL_SYSCALL_CALL (__nr, __args); \ + }) diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h index 005c0ada7..57025107e 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h @@ -311,6 +311,7 @@ #define __NR_sendmsg 370 #define __NR_sendto 369 #define __NR_set_mempolicy 270 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 304 #define __NR_set_tid_address 252 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h index 9131fddcc..72e19c6d5 100644 --- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h @@ -278,6 +278,7 @@ #define __NR_sendmsg 370 #define __NR_sendto 369 #define __NR_set_mempolicy 270 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 304 #define __NR_set_tid_address 252 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h index d8fb04156..d52b522d9 100644 --- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h @@ -303,6 +303,7 @@ #define __NR_sendmsg 355 #define __NR_sendto 349 #define __NR_set_mempolicy 276 +#define 
__NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 311 #define __NR_set_tid_address 258 #define __NR_setdomainname 121 diff --git a/sysdeps/unix/sysv/linux/sparc/brk.c b/sysdeps/unix/sysv/linux/sparc/brk.c deleted file mode 100644 index c5c1ee028..000000000 --- a/sysdeps/unix/sysv/linux/sparc/brk.c +++ /dev/null @@ -1,58 +0,0 @@ -/* Change data segment. Linux SPARC version. - Copyright (C) 2021-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#include -#include -#include - -/* This must be initialized data because commons can't have aliases. */ -void *__curbrk = 0; - -#if HAVE_INTERNAL_BRK_ADDR_SYMBOL -/* Old braindamage in GCC's crtstuff.c requires this symbol in an attempt - to work around different old braindamage in the old Linux ELF dynamic - linker. 
*/ -weak_alias (__curbrk, ___brk_addr) -#endif - -#ifdef __arch64__ -# define SYSCALL_NUM "0x6d" -#else -# define SYSCALL_NUM "0x10" -#endif - -int -__brk (void *addr) -{ - register long int g1 asm ("g1") = __NR_brk; - register long int o0 asm ("o0") = (long int) addr; - asm volatile ("ta " SYSCALL_NUM - : "=r"(o0) - : "r"(g1), "0"(o0) - : "cc"); - __curbrk = (void *) o0; - - if (__curbrk < addr) - { - __set_errno (ENOMEM); - return -1; - } - - return 0; -} -weak_alias (__brk, brk) diff --git a/sysdeps/unix/sysv/linux/alpha/brk.c b/sysdeps/unix/sysv/linux/sparc/brk_call.h similarity index 61% rename from sysdeps/unix/sysv/linux/alpha/brk.c rename to sysdeps/unix/sysv/linux/sparc/brk_call.h index 32082a4fa..59ce52166 100644 --- a/sysdeps/unix/sysv/linux/alpha/brk.c +++ b/sysdeps/unix/sysv/linux/sparc/brk_call.h @@ -1,5 +1,5 @@ -/* Change data segment size. Linux/Alpha. - Copyright (C) 2020-2022 Free Software Foundation, Inc. +/* Invoke the brk system call. Sparc version. + Copyright (C) 2022 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,23 +16,20 @@ License along with the GNU C Library. If not, see . */ -#include -#include -#include +#ifdef __arch64__ +# define SYSCALL_NUM "0x6d" +#else +# define SYSCALL_NUM "0x10" +#endif -void *__curbrk = 0; - -int -__brk (void *addr) +static inline void * +__brk_call (void *addr) { - /* Alpha brk returns -ENOMEM in case of failure. 
*/ - __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr); - if ((unsigned long) __curbrk == -ENOMEM) - { - __set_errno (ENOMEM); - return -1; - } - - return 0; + register long int g1 asm ("g1") = __NR_brk; + register long int o0 asm ("o0") = (long int) addr; + asm volatile ("ta " SYSCALL_NUM + : "=r"(o0) + : "r"(g1), "0"(o0) + : "cc"); + return (void *) o0; } -weak_alias (__brk, brk) diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h index 2bc014fe6..d3f4d8aa3 100644 --- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h @@ -310,6 +310,7 @@ #define __NR_sendmsg 114 #define __NR_sendto 133 #define __NR_set_mempolicy 305 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 300 #define __NR_set_tid_address 166 #define __NR_setdomainname 163 diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h index 76dbbe595..2cc03d7a2 100644 --- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h @@ -286,6 +286,7 @@ #define __NR_sendmsg 114 #define __NR_sendto 133 #define __NR_set_mempolicy 305 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 300 #define __NR_set_tid_address 166 #define __NR_setdomainname 163 diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c index d703485e3..d6f5ca89c 100644 --- a/sysdeps/unix/sysv/linux/spawni.c +++ b/sysdeps/unix/sysv/linux/spawni.c @@ -409,7 +409,7 @@ __spawnix (pid_t * pid, const char *file, __waitpid (new_pid, NULL, 0); } else - ec = -new_pid; + ec = errno; __munmap (stack, stack_size); diff --git a/sysdeps/unix/sysv/linux/startup.h b/sysdeps/unix/sysv/linux/startup.h new file mode 100644 index 000000000..39859b404 --- /dev/null +++ b/sysdeps/unix/sysv/linux/startup.h @@ -0,0 +1,39 @@ +/* Linux definitions of 
functions used by static libc main startup. + Copyright (C) 2017-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef SHARED +# include_next +#else +# include + +/* Avoid a run-time invocation of strlen. */ +#define _startup_fatal(message) \ + do \ + { \ + size_t __message_length = __builtin_strlen (message); \ + if (! __builtin_constant_p (__message_length)) \ + { \ + extern void _startup_fatal_not_constant (void); \ + _startup_fatal_not_constant (); \ + } \ + INTERNAL_SYSCALL_CALL (write, STDERR_FILENO, (message), \ + __message_length); \ + INTERNAL_SYSCALL_CALL (exit_group, 127); \ + } \ + while (0) +#endif /* !SHARED */ diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list index 642180611..028ad3107 100644 --- a/sysdeps/unix/sysv/linux/syscall-names.list +++ b/sysdeps/unix/sysv/linux/syscall-names.list @@ -21,8 +21,8 @@ # This file can list all potential system calls. The names are only # used if the installed kernel headers also provide them. -# The list of system calls is current as of Linux 5.16. -kernel 5.16 +# The list of system calls is current as of Linux 5.19. 
+kernel 5.19 FAST_atomic_update FAST_cmpxchg @@ -524,6 +524,7 @@ sendmmsg sendmsg sendto set_mempolicy +set_mempolicy_home_node set_robust_list set_thread_area set_tid_address diff --git a/sysdeps/unix/sysv/linux/tst-getauxval.c b/sysdeps/unix/sysv/linux/tst-getauxval.c new file mode 100644 index 000000000..c4b619574 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-getauxval.c @@ -0,0 +1,74 @@ +/* Basic test for getauxval. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +static int missing; +static int mismatch; + +static void +check_nonzero (unsigned long t, const char *s) +{ + unsigned long v = getauxval (t); + printf ("%s: %lu (0x%lx)\n", s, v, v); + if (v == 0) + missing++; +} + +static void +check_eq (unsigned long t, const char *s, unsigned long want) +{ + unsigned long v = getauxval (t); + printf ("%s: %lu want: %lu\n", s, v, want); + if (v != want) + mismatch++; +} + +#define NZ(x) check_nonzero (x, #x) +#define EQ(x, want) check_eq (x, #x, want) + +static int +do_test (void) +{ + /* These auxv entries should be non-zero on Linux. 
*/ + NZ (AT_PHDR); + NZ (AT_PHENT); + NZ (AT_PHNUM); + NZ (AT_PAGESZ); + NZ (AT_ENTRY); + NZ (AT_CLKTCK); + NZ (AT_RANDOM); + NZ (AT_EXECFN); + if (missing) + FAIL_EXIT1 ("Found %d missing auxv entries.\n", missing); + + /* Check against syscalls. */ + EQ (AT_UID, getuid ()); + EQ (AT_EUID, geteuid ()); + EQ (AT_GID, getgid ()); + EQ (AT_EGID, getegid ()); + if (mismatch) + FAIL_EXIT1 ("Found %d mismatching auxv entries.\n", mismatch); + + return 0; +} + +#include diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py index fc0a8ff4a..fdc5b2bc0 100644 --- a/sysdeps/unix/sysv/linux/tst-mman-consts.py +++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py @@ -33,7 +33,7 @@ def main(): help='C compiler (including options) to use') args = parser.parse_args() linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) - linux_version_glibc = (5, 15) + linux_version_glibc = (5, 17) sys.exit(glibcextract.compare_macro_consts( '#define _GNU_SOURCE 1\n' '#include \n', diff --git a/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c b/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c index 0ff1a214e..2b1feb476 100644 --- a/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c +++ b/sysdeps/unix/sysv/linux/tst-socket-timestamp-compat.c @@ -22,6 +22,7 @@ #include #include #include +#include /* AF_INET socket and address used to receive data. */ static int srv; @@ -88,7 +89,7 @@ do_test_large_buffer (bool mc) /* Enable 32 bit timeval precision and check if no 64 bit timeval stamp is created. 
*/ { - int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1}, + int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1}, sizeof (int)); TEST_VERIFY_EXIT (r != -1); @@ -103,10 +104,10 @@ do_test_large_buffer (bool mc) if (cmsg->cmsg_level != SOL_SOCKET) continue; - if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW) + if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW) found_timestamp = true; else - TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW); + TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW); } TEST_COMPARE (found_timestamp, sizeof (time_t) > 4); @@ -114,7 +115,7 @@ do_test_large_buffer (bool mc) /* Same as before, but for timespec. */ { - int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1}, + int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1}, sizeof (int)); TEST_VERIFY_EXIT (r != -1); @@ -129,10 +130,10 @@ do_test_large_buffer (bool mc) if (cmsg->cmsg_level != SOL_SOCKET) continue; - if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) + if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW) found_timestamp = true; else - TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW); + TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW); } TEST_COMPARE (found_timestamp, sizeof (time_t) > 4); @@ -151,7 +152,7 @@ do_test_small_buffer (bool mc) /* Enable 32 bit timeval precision and check if no 64 bit timeval stamp is created. 
*/ { - int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMP_OLD, &(int){1}, + int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMP_OLD, &(int){1}, sizeof (int)); TEST_VERIFY_EXIT (r != -1); @@ -172,10 +173,10 @@ do_test_small_buffer (bool mc) if (cmsg->cmsg_level != SOL_SOCKET) continue; - if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMP_NEW) + if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMP_NEW) found_timestamp = true; else - TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMP_NEW); + TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMP_NEW); } if (sizeof (time_t) > 4) @@ -192,7 +193,7 @@ do_test_small_buffer (bool mc) /* Same as before, but for timespec. */ { - int r = setsockopt (srv, SOL_SOCKET, SO_TIMESTAMPNS_OLD, &(int){1}, + int r = setsockopt (srv, SOL_SOCKET, COMPAT_SO_TIMESTAMPNS_OLD, &(int){1}, sizeof (int)); TEST_VERIFY_EXIT (r != -1); @@ -213,10 +214,10 @@ do_test_small_buffer (bool mc) if (cmsg->cmsg_level != SOL_SOCKET) continue; - if (sizeof (time_t) > 4 && cmsg->cmsg_type == SO_TIMESTAMPNS_NEW) + if (sizeof (time_t) > 4 && cmsg->cmsg_type == COMPAT_SO_TIMESTAMPNS_NEW) found_timestamp = true; else - TEST_VERIFY (cmsg->cmsg_type != SO_TIMESTAMPNS_NEW); + TEST_VERIFY (cmsg->cmsg_type != COMPAT_SO_TIMESTAMPNS_NEW); } if (sizeof (time_t) > 4) diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h index 28558279b..b4ab892ec 100644 --- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h +++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h @@ -278,6 +278,7 @@ #define __NR_sendmsg 46 #define __NR_sendto 44 #define __NR_set_mempolicy 238 +#define __NR_set_mempolicy_home_node 450 #define __NR_set_robust_list 273 #define __NR_set_thread_area 205 #define __NR_set_tid_address 218 diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h index c1ab8ec45..772559c87 100644 --- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h 
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h @@ -270,6 +270,7 @@ #define __NR_sendmsg 1073742342 #define __NR_sendto 1073741868 #define __NR_set_mempolicy 1073742062 +#define __NR_set_mempolicy_home_node 1073742274 #define __NR_set_robust_list 1073742354 #define __NR_set_thread_area 1073742029 #define __NR_set_tid_address 1073742042 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index 6cf708335..c6bee981f 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -99,7 +99,9 @@ tests += \ tst-strcpy-rtm \ tst-strlen-rtm \ tst-strncmp-rtm \ - tst-strrchr-rtm + tst-strrchr-rtm \ + tst-wcsncmp-rtm \ +# tests CFLAGS-tst-memchr-rtm.c += -mrtm CFLAGS-tst-memcmp-rtm.c += -mrtm @@ -109,8 +111,9 @@ CFLAGS-tst-memset-rtm.c += -mrtm CFLAGS-tst-strchr-rtm.c += -mrtm CFLAGS-tst-strcpy-rtm.c += -mrtm CFLAGS-tst-strlen-rtm.c += -mrtm -CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error CFLAGS-tst-strrchr-rtm.c += -mrtm +CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error endif ifneq ($(enable-cet),no) diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index f64a2fb0b..e9f338210 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; - unsigned long int rep_movsb_stop_threshold; - /* ERMS feature is implemented from AMD Zen3 architecture and it is - performing poorly for data above L2 cache size. Henceforth, adding - an upper bound threshold parameter to limit the usage of Enhanced - REP MOVSB operations and setting its value to L2 cache size. */ - if (cpu_features->basic.kind == arch_kind_amd) - rep_movsb_stop_threshold = core; - /* Setting the upper bound of ERMS to the computed value of - non-temporal threshold for architectures other than AMD. 
*/ - else - rep_movsb_stop_threshold = non_temporal_threshold; - /* The default threshold to use Enhanced REP STOSB. */ unsigned long int rep_stosb_threshold = 2048; @@ -943,14 +931,32 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); + /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of + 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best + if that operation cannot overflow. Minimum of 0x4040 (16448) because the + L(large_memset_4x) loops need 64-byte to cache align and enough space for + at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are + reflected in the manual. */ TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, - 0, SIZE_MAX); + 0x4040, SIZE_MAX >> 4); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, SIZE_MAX); #endif + unsigned long int rep_movsb_stop_threshold; + /* ERMS feature is implemented from AMD Zen3 architecture and it is + performing poorly for data above L2 cache size. Henceforth, adding + an upper bound threshold parameter to limit the usage of Enhanced + REP MOVSB operations and setting its value to L2 cache size. */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_stop_threshold = core; + /* Setting the upper bound of ERMS to the computed value of + non-temporal threshold for architectures other than AMD. 
*/ + else + rep_movsb_stop_threshold = non_temporal_threshold; + cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c index a6cb32b1b..09cd72ab2 100644 --- a/sysdeps/x86/isa-level.c +++ b/sysdeps/x86/isa-level.c @@ -47,7 +47,8 @@ # endif # if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \ - && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE + && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \ + && defined __BMI__ && defined __BMI2__ /* NB: ISAs in x86-64 ISA level v3 are used. */ # define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3 # else diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index 61f1255bf..007a1eb13 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -111,7 +111,8 @@ enum cf_protection_level /* Local label name for asm code. */ #ifndef L /* ELF-like local names start with `.L'. */ -# define L(name) .L##name +# define LOCAL_LABEL(name) .L##name +# define L(name) LOCAL_LABEL(name) #endif #define atom_text_section .section ".text.atom", "ax" diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c index 09ed6fa0d..a3b14e72f 100644 --- a/sysdeps/x86/tst-strncmp-rtm.c +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -16,20 +16,35 @@ License along with the GNU C Library; if not, see . 
*/ +#include #include +#ifdef WIDE +# define CHAR wchar_t +# define MEMSET wmemset +# define STRNCMP wcsncmp +# define TEST_NAME "wcsncmp" +#else /* !WIDE */ +# define CHAR char +# define MEMSET memset +# define STRNCMP strncmp +# define TEST_NAME "strncmp" +#endif /* !WIDE */ + + + #define LOOP 3000 #define STRING_SIZE 1024 -char string1[STRING_SIZE]; -char string2[STRING_SIZE]; +CHAR string1[STRING_SIZE]; +CHAR string2[STRING_SIZE]; __attribute__ ((noinline, noclone)) static int prepare (void) { - memset (string1, 'a', STRING_SIZE - 1); - memset (string2, 'a', STRING_SIZE - 1); - if (strncmp (string1, string2, STRING_SIZE) == 0) + MEMSET (string1, 'a', STRING_SIZE - 1); + MEMSET (string2, 'a', STRING_SIZE - 1); + if (STRNCMP (string1, string2, STRING_SIZE) == 0) return EXIT_SUCCESS; else return EXIT_FAILURE; @@ -39,7 +54,27 @@ __attribute__ ((noinline, noclone)) static int function (void) { - if (strncmp (string1, string2, STRING_SIZE) == 0) + if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +} + +__attribute__ ((noinline, noclone)) +static int +function_overflow (void) +{ + if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +} + +__attribute__ ((noinline, noclone)) +static int +function_overflow2 (void) +{ + if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) return 0; else return 1; @@ -48,5 +83,14 @@ function (void) static int do_test (void) { - return do_test_1 ("strncmp", LOOP, prepare, function); + int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); + if (status != EXIT_SUCCESS) + return status; + return status; } diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c new file mode 100644 index 000000000..bad3b8637 --- /dev/null +++ 
b/sysdeps/x86/tst-wcsncmp-rtm.c @@ -0,0 +1,21 @@ +/* Test case for wcsncmp inside a transactionally executing RTM region. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define WIDE 1 +#include +#include "tst-strncmp-rtm.c" diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S deleted file mode 100644 index f96d567fd..000000000 --- a/sysdeps/x86_64/bzero.S +++ /dev/null @@ -1 +0,0 @@ -/* Implemented in memset.S. */ diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index 155ca36bd..4f7b0a546 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -339,11 +339,13 @@ and creates an unsatisfiable circular dependency.\n", # endif /* Set to symbol size plus addend. */ value = sym->st_size; + *reloc_addr = value + reloc->r_addend; + break; # endif - /* Fall through. 
*/ + case R_X86_64_GLOB_DAT: case R_X86_64_JUMP_SLOT: - *reloc_addr = value + reloc->r_addend; + *reloc_addr = value; break; # ifndef RESOLVE_CONFLICT_FIND_MAP diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index e02a53ea1..5718a7da8 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -18,395 +18,561 @@ #include +#ifdef USE_AS_WMEMCMP +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define SIZE_OFFSET (0) +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 +#endif + +#ifdef USE_AS_MEMCMPEQ +# define SIZE_OFFSET (0) +# define CHECK_CMP(x, y) subl x, y +#else +# ifndef SIZE_OFFSET +# define SIZE_OFFSET (CHAR_PER_VEC * 2) +# endif +# define CHECK_CMP(x, y) cmpl x, y +#endif + +#define VEC_SIZE 16 +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + .text -ENTRY (memcmp) -#ifdef __ILP32__ +ENTRY(MEMCMP) +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %edx, %edx +# endif +#ifdef USE_AS_WMEMCMP + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store + in ecx for code size. This is preferable to using `incw` as + it avoids partial register stalls on older hardware (pre + SnB). */ + movl $0xffff, %ecx #endif - test %RDX_LP, %RDX_LP - jz L(finz) - cmpq $1, %rdx - jbe L(finr1b) - subq %rdi, %rsi - movq %rdx, %r10 - cmpq $32, %r10 - jae L(gt32) - /* Handle small chunks and last block of less than 32 bytes. */ -L(small): - testq $1, %r10 - jz L(s2b) - movzbl (%rdi), %eax - movzbl (%rdi, %rsi), %edx - subq $1, %r10 - je L(finz1) - addq $1, %rdi - subl %edx, %eax - jnz L(exit) -L(s2b): - testq $2, %r10 - jz L(s4b) - movzwl (%rdi), %eax - movzwl (%rdi, %rsi), %edx - subq $2, %r10 -#ifdef USE_AS_MEMCMPEQ - je L(finz1) + cmpq $CHAR_PER_VEC, %rdx + ja L(more_1x_vec) + +#ifdef USE_AS_WMEMCMP + /* saves a byte of code keeping the fall through path n = [2, 4] + in the initial cache line. 
*/ + decl %edx + jle L(cmp_0_1) + + movq (%rsi), %xmm0 + movq (%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_0) + + movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 + movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_end_0_adj) #else - je L(fin2_7) + cmpl $8, %edx + ja L(cmp_9_16) + + cmpl $4, %edx + jb L(cmp_0_3) + +# ifdef USE_AS_MEMCMPEQ + movl (%rsi), %eax + subl (%rdi), %eax + + movl -4(%rsi, %rdx), %esi + subl -4(%rdi, %rdx), %esi + + orl %esi, %eax + ret +# else + /* Combine comparisons for lo and hi 4-byte comparisons. */ + movl -4(%rsi, %rdx), %ecx + movl -4(%rdi, %rdx), %eax + shlq $32, %rcx + shlq $32, %rax + movl (%rsi), %esi + movl (%rdi), %edi + orq %rsi, %rcx + orq %rdi, %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + xorl %eax, %eax + ret +# endif + + .p2align 4,, 10 +L(cmp_9_16): +# ifdef USE_AS_MEMCMPEQ + movq (%rsi), %rax + subq (%rdi), %rax + + movq -8(%rsi, %rdx), %rcx + subq -8(%rdi, %rdx), %rcx + orq %rcx, %rax + /* Convert 64 bit -> 32 bit boolean (we should have made the ABI + return long). */ + setnz %cl + movzbl %cl, %eax +# else + movq (%rsi), %rcx + movq (%rdi), %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + + movq -8(%rsi, %rdx, CHAR_SIZE), %rcx + movq -8(%rdi, %rdx, CHAR_SIZE), %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + xorl %eax, %eax +# endif #endif - addq $2, %rdi - cmpl %edx, %eax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + ret + + .p2align 4,, 8 +L(cmp_0_1): + /* Flag set by earlier comparison against 1. 
*/ + jne L(cmp_0_0) +#ifdef USE_AS_WMEMCMP + movl (%rdi), %ecx + xorl %edx, %edx + cmpl (%rsi), %ecx + je L(cmp_0_0) + setg %dl + leal -1(%rdx, %rdx), %eax #else - jnz L(fin2_7) + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax #endif -L(s4b): - testq $4, %r10 - jz L(s8b) - movl (%rdi), %eax - movl (%rdi, %rsi), %edx - subq $4, %r10 -#ifdef USE_AS_MEMCMPEQ - je L(finz1) + ret + + /* Fits in aligning bytes. */ +L(cmp_0_0): + xorl %eax, %eax + ret + +#ifdef USE_AS_WMEMCMP + .p2align 4 +L(ret_nonzero_vec_start_0): + bsfl %eax, %eax + movl (%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + ret #else - je L(fin2_7) + +# ifndef USE_AS_MEMCMPEQ + .p2align 4,, 14 +L(ret_nonzero): + /* Need to bswap to get proper return without branch. */ + bswapq %rcx + bswapq %rax + subq %rcx, %rax + sbbl %eax, %eax + orl $1, %eax + ret +# endif + + .p2align 4 +L(cmp_0_3): +# ifdef USE_AS_MEMCMPEQ + /* No reason to add to dependency chain on rdx. Saving a the + bytes here doesn't change number of fetch blocks. */ + cmpl $1, %edx + jbe L(cmp_0_1) +# else + /* We need the code size to prevent taking an extra fetch block. + */ + decl %edx + jle L(cmp_0_1) +# endif + movzwl (%rsi), %ecx + movzwl (%rdi), %eax + +# ifdef USE_AS_MEMCMPEQ + subl %ecx, %eax + + movzbl -1(%rsi, %rdx), %esi + movzbl -1(%rdi, %rdx), %edi + subl %edi, %esi + orl %esi, %eax +# else + bswapl %ecx + bswapl %eax + + /* Implicit right shift by one. We just need to displace the + sign bits. */ + shrl %ecx + shrl %eax + + /* Eat a partial register stall here. Saves code stopping + L(cmp_0_3) from bleeding into the next fetch block and saves + an ALU. 
*/ + movb (%rsi, %rdx), %cl + movzbl (%rdi, %rdx), %edi + orl %edi, %eax + subl %ecx, %eax +# endif + ret #endif - addq $4, %rdi - cmpl %edx, %eax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + + .p2align 5 +L(more_1x_vec): +#ifndef USE_AS_WMEMCMP + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store + in ecx for code size. This is preferable to using `incw` as + it avoids partial register stalls on older hardware (pre + SnB). */ + movl $0xffff, %ecx +#endif + movups (%rsi), %xmm0 + movups (%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_0) +#if SIZE_OFFSET == 0 + cmpq $(CHAR_PER_VEC * 2), %rdx #else - jnz L(fin2_7) + /* Offset rdx. Saves just enough code size to keep the + L(last_2x_vec) case and the non-zero return in a single + cache line. */ + subq $(CHAR_PER_VEC * 2), %rdx #endif -L(s8b): - testq $8, %r10 - jz L(s16b) - movq (%rdi), %rax - movq (%rdi, %rsi), %rdx - subq $8, %r10 -#ifdef USE_AS_MEMCMPEQ - je L(sub_return8) + ja L(more_2x_vec) + + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax +#ifndef USE_AS_MEMCMPEQ + /* Don't use `incw ax` as machines this code runs on are liable + to have partial register stall. */ + jnz L(ret_nonzero_vec_end_0) #else - je L(fin2_7) + /* Various return targets for memcmpeq. Will always be hot in + Icache and get short encoding. 
*/ +L(ret_nonzero_vec_start_1): +L(ret_nonzero_vec_start_0): +L(ret_nonzero_vec_end_0): #endif - addq $8, %rdi - cmpq %rdx, %rax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + ret + +#ifndef USE_AS_MEMCMPEQ +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(ret_nonzero_vec_end_0_adj): + addl $3, %edx +# else + .p2align 4,, 8 +# endif +L(ret_nonzero_vec_end_0): + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + leal (%rax, %rdx, CHAR_SIZE), %eax + movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + addl %edx, %eax + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret +# ifndef USE_AS_WMEMCMP + .p2align 4,, 10 +L(ret_nonzero_vec_start_0): + bsfl %eax, %eax + movzbl (%rsi, %rax), %ecx + movzbl (%rdi, %rax), %eax + subl %ecx, %eax + ret +# endif #else - jnz L(fin2_7) #endif -L(s16b): - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 + + .p2align 5 +L(more_2x_vec): + movups (VEC_SIZE * 1)(%rsi), %xmm0 + movups (VEC_SIZE * 1)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_1) + + cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx + jbe L(last_2x_vec) + + cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx + ja L(more_8x_vec) + + /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. + This can harm performance if non-zero return in [65, 80] or + [97, 112] but helps performance otherwise. Generally zero- + return is hotter. 
*/ + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 2)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 3)(%rsi), %xmm2 + movups (VEC_SIZE * 3)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) + jnz L(ret_nonzero_vec_start_2_3) + + cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx + jbe L(last_2x_vec) + + movups (VEC_SIZE * 4)(%rsi), %xmm0 + movups (VEC_SIZE * 4)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 5)(%rsi), %xmm2 + movups (VEC_SIZE * 5)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) #ifdef USE_AS_MEMCMPEQ - pmovmskb %xmm1, %eax - subl $0xffff, %eax + jz L(last_2x_vec) ret #else - pmovmskb %xmm1, %edx - xorl %eax, %eax - subl $0xffff, %edx - jz L(finz) - bsfl %edx, %ecx - leaq (%rdi, %rcx), %rcx - movzbl (%rcx), %eax - movzbl (%rsi, %rcx), %edx - jmp L(finz1) + jnz L(ret_nonzero_vec_start_4_5) #endif - .p2align 4,, 4 -L(finr1b): - movzbl (%rdi), %eax - movzbl (%rsi), %edx -L(finz1): - subl %edx, %eax -L(exit): - ret + .p2align 4 +L(last_2x_vec): + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + pmovmskb %xmm3, %eax + subl %ecx, %eax #ifdef USE_AS_MEMCMPEQ - .p2align 4,, 4 -L(sub_return8): - subq %rdx, %rax - movl %eax, %edx - shrq $32, %rax - orl %edx, %eax + /* Various return targets for memcmpeq. Will always be hot in + Icache and get short encoding. 
*/ +L(ret_nonzero_vec_start_2_3): +L(ret_nonzero_vec_start_4_5): ret #else - .p2align 4,, 4 -L(fin2_7): - cmpq %rdx, %rax - jz L(finz) - movq %rax, %r11 - subq %rdx, %r11 - bsfq %r11, %rcx - sarq $3, %rcx - salq $3, %rcx - sarq %cl, %rax - movzbl %al, %eax - sarq %cl, %rdx - movzbl %dl, %edx - subl %edx, %eax + jnz L(ret_nonzero_vec_end_1) ret -#endif - .p2align 4,, 4 -L(finz): - xorl %eax, %eax + + .p2align 4,, 8 +L(ret_nonzero_vec_end_1): + pmovmskb %xmm1, %ecx + /* High 16 bits of eax guaranteed to be all ones. Rotate them in + so we can do `or + not` with just `xor`. */ + rorl $16, %eax + xorl %ecx, %eax + /* Partial register stall. */ + + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + leal (%rax, %rdx, CHAR_SIZE), %eax + movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + addl %edx, %eax + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret -#ifdef USE_AS_MEMCMPEQ - .p2align 4,, 4 -L(neq_early): - movl $1, %eax + + .p2align 4 +L(ret_nonzero_vec_start_4_5): + pmovmskb %xmm1, %edx + sall $16, %eax + leal 1(%rax, %rdx), %eax + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 4)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret + + .p2align 4,, 8 +L(ret_nonzero_vec_start_1): + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 1)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above.
*/ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret #endif - /* For blocks bigger than 32 bytes - 1. Advance one of the addr pointer to be 16B aligned. - 2. Treat the case of both addr pointers aligned to 16B - separately to avoid movdqu. - 3. Handle any blocks of greater than 64 consecutive bytes with - unrolling to reduce branches. - 4. At least one addr pointer is 16B aligned, use memory version - of pcmbeqb. - */ - .p2align 4,, 4 -L(gt32): - movq %rdx, %r11 - addq %rdi, %r11 - movq %rdi, %r8 - - andq $15, %r8 - jz L(16am) - /* Both pointers may be misaligned. */ - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - subl $0xffff, %edx - jnz L(neq) - neg %r8 - leaq 16(%rdi, %r8), %rdi -L(16am): - /* Handle two 16B aligned pointers separately. */ - testq $15, %rsi - jz L(ATR) - testq $16, %rdi - jz L(A32) - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi -L(A32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - /* Pre-unroll to be ready for unrolled 64B loop. 
*/ - testq $32, %rdi - jz L(A64) - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(A64): - movq %r11, %r10 - andq $-64, %r10 - cmpq %r10, %rdi - jae L(mt32) - -L(A64main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A64main) - -L(mt32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - -L(A32main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A32main) -L(mt16): - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - - .p2align 4,, 4 -L(neq): -#ifdef USE_AS_MEMCMPEQ - movl $1, %eax - ret -#else - bsfl %edx, %ecx - movzbl (%rdi, %rcx), %eax - addq %rdi, %rsi - movzbl (%rsi,%rcx), %edx - jmp L(finz1) + + .p2align 4 +L(more_8x_vec): + subq %rdi, %rsi + leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx + andq $(VEC_SIZE * -1), %rdi + addq %rdi, %rsi + .p2align 4 +L(loop_4x): + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 3)(%rsi), %xmm1 + + PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 + PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 + + movups (VEC_SIZE * 4)(%rsi), %xmm2 + movups (VEC_SIZE * 5)(%rsi), %xmm3 + + 
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 + PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 + + pand %xmm0, %xmm1 + pand %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + subl %ecx, %eax + jnz L(ret_nonzero_loop) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + cmpq %rdi, %rdx + ja L(loop_4x) + /* Get remaining length in edx. */ + subl %edi, %edx + /* Restore offset so we can reuse L(last_2x_vec). */ + addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx +#ifdef USE_AS_WMEMCMP + shrl $2, %edx #endif + cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx + jbe L(last_2x_vec) + - .p2align 4,, 4 -L(ATR): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - testq $16, %rdi - jz L(ATR32) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - je L(mt16) - -L(ATR32): - movq %r11, %r10 - andq $-64, %r10 - testq $32, %rdi - jz L(ATR64) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(ATR64): - cmpq %rdi, %r10 - je L(mt32) - -L(ATR64main): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - jne L(ATR64main) - - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - -L(ATR32res): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq 
$16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %r10, %rdi - jne L(ATR32res) - - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - /* Align to 16byte to improve instruction fetch. */ - .p2align 4,, 4 -END(memcmp) + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 2)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 3)(%rsi), %xmm2 + movups (VEC_SIZE * 3)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) + jz L(last_2x_vec) #ifdef USE_AS_MEMCMPEQ -libc_hidden_def (memcmp) +L(ret_nonzero_loop): + ret #else -# undef bcmp -weak_alias (memcmp, bcmp) -libc_hidden_builtin_def (memcmp) + + .p2align 4 +L(ret_nonzero_vec_start_2_3): + pmovmskb %xmm1, %edx + sall $16, %eax + leal 1(%rax, %rdx), %eax + + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret + + .p2align 4 +L(ret_nonzero_loop): + pmovmskb %xmm0, %ecx + pmovmskb %xmm1, %edx + sall $(VEC_SIZE * 1), %edx + leal 1(%rcx, %rdx), %edx + pmovmskb %xmm2, %ecx + /* High 16 bits of eax guaranteed to be all ones. Rotate them in + so we can do `or + not` with just `xor`. */ + rorl $16, %eax + xorl %ecx, %eax + + salq $32, %rax + orq %rdx, %rax + + bsfq %rax, %rax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above.
*/ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret +#endif +END(MEMCMP) + +#ifndef USE_AS_WMEMCMP +# ifdef USE_AS_MEMCMPEQ +libc_hidden_def (MEMCMP) +# else +# undef bcmp +weak_alias (MEMCMP, bcmp) +libc_hidden_builtin_def (MEMCMP) +# endif #endif diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S index 2cee881fe..80c5e912a 100644 --- a/sysdeps/x86_64/memcmpeq.S +++ b/sysdeps/x86_64/memcmpeq.S @@ -16,6 +16,6 @@ License along with the GNU C Library; if not, see . */ -#define memcmp __memcmpeq +#define MEMCMP __memcmpeq #define USE_AS_MEMCMPEQ 1 #include "multiarch/memcmp-sse2.S" diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f4791..b0dffd2ae 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ . */ #include +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - 
pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without crossing a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard guarantees any pointer has + at least one valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr.
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistently save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops.
*/ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise losing the 2-byte encoding of the jump to L(page_cross) + causes the hot path (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aligning bytes.
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, 
%eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. */ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. 
*/ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If eax is non-zero + then CHAR is in VEC3 and bsrl will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Used in L(last_4x_vec). In the same cache line. This just uses spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. */ +END(__memrchr) weak_alias (__memrchr, memrchr) diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 65c09bd0a..a6eea61a4 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -1,4 +1,4 @@ -/* memset/bzero -- set memory area to CH/0 +/* memset -- set memory area to CH/0 Optimized version for x86-64. Copyright (C) 2002-2022 Free Software Foundation, Inc. This file is part of the GNU C Library.
@@ -28,17 +28,23 @@ #define VMOVU movups #define VMOVA movaps -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ movq r, %rax; \ punpcklbw %xmm0, %xmm0; \ punpcklwd %xmm0, %xmm0; \ pshufd $0, %xmm0, %xmm0 -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ movd d, %xmm0; \ - movq r, %rax; \ - pshufd $0, %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() #define SECTION(p) p diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 044778585..3cc1a7e0d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -1,127 +1,199 @@ ifeq ($(subdir),string) -sysdep_routines += strncat-c stpncpy-c strncpy-c \ - strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ - strcmp-sse4_2 strcmp-avx2 \ - strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ - memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ - memrchr-sse2 memrchr-avx2 \ - memcmp-sse2 \ - memcmpeq-sse2 \ - memcmp-avx2-movbe \ - memcmpeq-avx2 \ - memcmp-sse4 memcpy-ssse3 \ - memmove-ssse3 \ - memcpy-ssse3-back \ - memmove-ssse3-back \ - memmove-avx512-no-vzeroupper \ - strcasecmp_l-sse2 strcasecmp_l-ssse3 \ - strcasecmp_l-sse4_2 strcasecmp_l-avx \ - strncase_l-sse2 strncase_l-ssse3 \ - strncase_l-sse4_2 strncase_l-avx \ - strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ - strrchr-sse2 strrchr-avx2 \ - strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ - strcat-avx2 strncat-avx2 \ - strcat-ssse3 strncat-ssse3\ - strcpy-avx2 strncpy-avx2 \ - strcpy-sse2 stpcpy-sse2 \ - strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ - strcpy-sse2-unaligned strncpy-sse2-unaligned \ - stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ - stpcpy-avx2 stpncpy-avx2 \ - strcat-sse2 \ - 
strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ - strcspn-sse2 strpbrk-sse2 strspn-sse2 \ - strcspn-c strpbrk-c strspn-c varshift \ - memset-avx512-no-vzeroupper \ - memmove-sse2-unaligned-erms \ - memmove-avx-unaligned-erms \ - memmove-avx512-unaligned-erms \ - memset-sse2-unaligned-erms \ - memset-avx2-unaligned-erms \ - memset-avx512-unaligned-erms \ - memchr-avx2-rtm \ - memcmp-avx2-movbe-rtm \ - memcmpeq-avx2-rtm \ - memmove-avx-unaligned-erms-rtm \ - memrchr-avx2-rtm \ - memset-avx2-unaligned-erms-rtm \ - rawmemchr-avx2-rtm \ - strchr-avx2-rtm \ - strcmp-avx2-rtm \ - strchrnul-avx2-rtm \ - stpcpy-avx2-rtm \ - stpncpy-avx2-rtm \ - strcat-avx2-rtm \ - strcpy-avx2-rtm \ - strlen-avx2-rtm \ - strncat-avx2-rtm \ - strncmp-avx2-rtm \ - strncpy-avx2-rtm \ - strnlen-avx2-rtm \ - strrchr-avx2-rtm \ - memchr-evex \ - memcmp-evex-movbe \ - memcmpeq-evex \ - memmove-evex-unaligned-erms \ - memrchr-evex \ - memset-evex-unaligned-erms \ - rawmemchr-evex \ - stpcpy-evex \ - stpncpy-evex \ - strcat-evex \ - strchr-evex \ - strchrnul-evex \ - strcmp-evex \ - strcpy-evex \ - strlen-evex \ - strncat-evex \ - strncmp-evex \ - strncpy-evex \ - strnlen-evex \ - strrchr-evex \ - memchr-evex-rtm \ - rawmemchr-evex-rtm +sysdep_routines += \ + memchr-avx2 \ + memchr-avx2-rtm \ + memchr-evex \ + memchr-evex-rtm \ + memchr-sse2 \ + memcmp-avx2-movbe \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ + memcmp-ssse3 \ + memcmpeq-avx2 \ + memcmpeq-avx2-rtm \ + memcmpeq-evex \ + memcmpeq-sse2 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ + memmove-avx-unaligned-erms \ + memmove-avx-unaligned-erms-rtm \ + memmove-avx512-no-vzeroupper \ + memmove-avx512-unaligned-erms \ + memmove-erms \ + memmove-evex-unaligned-erms \ + memmove-sse2-unaligned-erms \ + memmove-ssse3 \ + memmove-ssse3-back \ + memrchr-avx2 \ + memrchr-avx2-rtm \ + memrchr-evex \ + memrchr-sse2 \ + memset-avx2-unaligned-erms \ + memset-avx2-unaligned-erms-rtm \ + 
memset-avx512-no-vzeroupper \ + memset-avx512-unaligned-erms \ + memset-erms \ + memset-evex-unaligned-erms \ + memset-sse2-unaligned-erms \ + rawmemchr-avx2 \ + rawmemchr-avx2-rtm \ + rawmemchr-evex \ + rawmemchr-evex-rtm \ + rawmemchr-sse2 \ + stpcpy-avx2 \ + stpcpy-avx2-rtm \ + stpcpy-evex \ + stpcpy-sse2 \ + stpcpy-sse2-unaligned \ + stpcpy-ssse3 \ + stpncpy-avx2 \ + stpncpy-avx2-rtm \ + stpncpy-c \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ + strcat-avx2 \ + strcat-avx2-rtm \ + strcat-evex \ + strcat-sse2 \ + strcat-sse2-unaligned \ + strcat-ssse3 \ + strchr-avx2 \ + strchr-avx2-rtm \ + strchr-evex \ + strchr-sse2 \ + strchr-sse2-no-bsf \ + strchrnul-avx2 \ + strchrnul-avx2-rtm \ + strchrnul-evex \ + strchrnul-sse2 \ + strcmp-avx2 \ + strcmp-avx2-rtm \ + strcmp-evex \ + strcmp-sse2 \ + strcmp-sse2-unaligned \ + strcmp-sse4_2 \ + strcmp-ssse3 \ + strcpy-avx2 \ + strcpy-avx2-rtm \ + strcpy-evex \ + strcpy-sse2 \ + strcpy-sse2-unaligned \ + strcpy-ssse3 \ + strcspn-c \ + strcspn-sse2 \ + strlen-avx2 \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-evex512 \ + strlen-sse2 \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ + strncat-avx2 \ + strncat-avx2-rtm \ + strncat-c \ + strncat-evex \ + strncat-sse2-unaligned \ + strncat-ssse3 \ + strncmp-avx2 \ + strncmp-avx2-rtm \ + strncmp-evex \ + strncmp-sse2 \ + strncmp-sse4_2 \ + strncmp-ssse3 \ + strncpy-avx2 \ + strncpy-avx2-rtm \ + strncpy-c \ + strncpy-evex \ + strncpy-sse2-unaligned \ + strncpy-ssse3 \ + strnlen-avx2 \ + strnlen-avx2-rtm \ + strnlen-evex \ + strnlen-evex512 \ + strnlen-sse2 \ + strpbrk-c \ + strpbrk-sse2 \ + strrchr-avx2 \ + strrchr-avx2-rtm \ + strrchr-evex \ + strrchr-sse2 \ + strspn-c \ + strspn-sse2 \ + strstr-avx512 \ + strstr-sse2-unaligned \ + varshift \ +# 
sysdep_routines CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 endif ifeq ($(subdir),wcsmbs) -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ - wmemcmp-avx2-movbe \ - wmemchr-sse2 wmemchr-avx2 \ - wcscmp-sse2 wcscmp-avx2 \ - wcsncmp-sse2 wcsncmp-avx2 \ - wcscpy-ssse3 wcscpy-c \ - wcschr-sse2 wcschr-avx2 \ - wcsrchr-sse2 wcsrchr-avx2 \ - wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ - wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ - wcschr-avx2-rtm \ - wcscmp-avx2-rtm \ - wcslen-avx2-rtm \ - wcsncmp-avx2-rtm \ - wcsnlen-avx2-rtm \ - wcsrchr-avx2-rtm \ - wmemchr-avx2-rtm \ - wmemcmp-avx2-movbe-rtm \ - wcschr-evex \ - wcscmp-evex \ - wcslen-evex \ - wcsncmp-evex \ - wcsnlen-evex \ - wcsrchr-evex \ - wmemchr-evex \ - wmemcmp-evex-movbe \ - wmemchr-evex-rtm +sysdep_routines += \ + wcschr-avx2 \ + wcschr-avx2-rtm \ + wcschr-evex \ + wcschr-sse2 \ + wcscmp-avx2 \ + wcscmp-avx2-rtm \ + wcscmp-evex \ + wcscmp-sse2 \ + wcscpy-c \ + wcscpy-ssse3 \ + wcslen-avx2 \ + wcslen-avx2-rtm \ + wcslen-evex \ + wcslen-evex512 \ + wcslen-sse2 \ + wcslen-sse4_1 \ + wcsncmp-avx2 \ + wcsncmp-avx2-rtm \ + wcsncmp-evex \ + wcsncmp-sse2 \ + wcsnlen-avx2 \ + wcsnlen-avx2-rtm \ + wcsnlen-c \ + wcsnlen-evex \ + wcsnlen-evex512 \ + wcsnlen-sse4_1 \ + wcsrchr-avx2 \ + wcsrchr-avx2-rtm \ + wcsrchr-evex \ + wcsrchr-sse2 \ + wmemchr-avx2 \ + wmemchr-avx2-rtm \ + wmemchr-evex \ + wmemchr-evex-rtm \ + wmemchr-sse2 \ + wmemcmp-avx2-movbe \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-evex-movbe \ + wmemcmp-sse2 \ + wmemcmp-ssse3 \ +# sysdep_routines endif ifeq ($(subdir),debug) -sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ - memmove_chk-nonshared memset_chk-nonshared \ - wmemset_chk-nonshared +sysdep_routines += \ + memcpy_chk-nonshared \ + memmove_chk-nonshared \ + mempcpy_chk-nonshared \ + memset_chk-nonshared \ + wmemset_chk-nonshared \ +# 
sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 000000000..6ca9f5e6b --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,35 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 000000000..89680f5db --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S deleted file mode 100644 index 639f02bde..000000000 --- a/sysdeps/x86_64/multiarch/bcopy.S +++ /dev/null @@ -1,7 +0,0 @@ -#include - - .text -ENTRY(bcopy) - xchg %rdi, %rsi - jmp __libc_memmove /* Branch to IFUNC memmove. */ -END(bcopy) diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 000000000..99806ebcd --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 000000000..222ba46dc --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 000000000..d1784d536 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" 
+#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 68a56797d..e97218f62 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_evex_movbe) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), - __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) @@ -337,6 +335,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex512) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) /* Support sysdeps/x86_64/multiarch/strnlen.c. */ @@ -355,6 +358,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex512) IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ @@ -395,8 +403,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ IFUNC_IMPL (i, name, strcasecmp, IFUNC_IMPL_ADD (array, i, strcasecmp, - CPU_FEATURE_USABLE (AVX), - __strcasecmp_avx) + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) + IFUNC_IMPL_ADD (array, i, strcasecmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcasecmp, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_sse42) @@ -407,9 +423,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ IFUNC_IMPL (i, name, strcasecmp_l, - IFUNC_IMPL_ADD (array, i, strcasecmp_l, - CPU_FEATURE_USABLE (AVX), - __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) + IFUNC_IMPL_ADD (array, i, strcasecmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_l_sse42) @@ -535,8 +559,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ IFUNC_IMPL (i, name, strncasecmp, IFUNC_IMPL_ADD (array, i, strncasecmp, - CPU_FEATURE_USABLE (AVX), - __strncasecmp_avx) + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) + IFUNC_IMPL_ADD (array, i, strncasecmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncasecmp, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_sse42) @@ -548,9 +580,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ IFUNC_IMPL (i, name, strncasecmp_l, - IFUNC_IMPL_ADD (array, i, strncasecmp_l, - CPU_FEATURE_USABLE (AVX), - __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), + __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) + IFUNC_IMPL_ADD (array, i, strncasecmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_l_sse42) @@ -611,6 +651,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strstr.c. 
*/ IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (AVX512DQ) + && CPU_FEATURE_USABLE (BMI2)), + __strstr_avx512) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) @@ -702,6 +748,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex512) IFUNC_IMPL_ADD (array, i, wcslen, CPU_FEATURE_USABLE (SSE4_1), __wcslen_sse4_1) @@ -723,6 +774,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_evex512) IFUNC_IMPL_ADD (array, i, wcsnlen, CPU_FEATURE_USABLE (SSE4_1), __wcsnlen_sse4_1) @@ -768,8 +824,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_evex_movbe) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), - __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) @@ -996,6 +1050,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __wmemset_chk, CPU_FEATURE_USABLE (AVX2), __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemset_chk_avx2_unaligned_rtm) IFUNC_IMPL_ADD (array, i, __wmemset_chk, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) 
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index cd1261369..4518b0f98 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -21,7 +21,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; @@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) return OPTIMIZE (avx2_movbe); } - if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) - return OPTIMIZE (sse4_1); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) return OPTIMIZE (ssse3); diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h index 9e3cc61ac..766539c24 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -22,15 +22,28 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { const struct cpu_features* cpu_features = __get_cpu_features (); - if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) - return OPTIMIZE (avx); + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P 
(cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c..c4d71938c 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e..c5a256eb3 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). 
This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. 
*/ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr + and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907..0fd11b763 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. 
*/ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. */ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index a34ea1645..210c9925b 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -429,22 +429,21 @@ L(page_cross_less_vec): # ifndef USE_AS_WMEMCMP cmpl $8, %edx jae L(between_8_15) + /* Fall through for [4, 7]. */ cmpl $4, %edx - jae L(between_4_7) + jb L(between_2_3) - /* Load as big endian to avoid branches. */ - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - shll $8, %eax - shll $8, %ecx - bswap %eax - bswap %ecx - movzbl -1(%rdi, %rdx), %edi - movzbl -1(%rsi, %rdx), %esi - orl %edi, %eax - orl %esi, %ecx - /* Subtraction is okay because the upper 8 bits are zero. */ - subl %ecx, %eax + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) /* No ymm register was touched. */ ret @@ -457,9 +456,33 @@ L(one_or_less): /* No ymm register was touched. 
*/ ret + .p2align 4,, 5 +L(ret_nonzero): + sbbl %eax, %eax + orl $1, %eax + /* No ymm register was touched. */ + ret + + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + /* No ymm register was touched. */ + ret + .p2align 4 L(between_8_15): -# endif + movbe (%rdi), %rax + movbe (%rsi), %rcx + subq %rcx, %rax + jnz L(ret_nonzero) + movbe -8(%rdi, %rdx), %rax + movbe -8(%rsi, %rdx), %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret +# else /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ vmovq (%rdi), %xmm1 vmovq (%rsi), %xmm2 @@ -475,16 +498,13 @@ L(between_8_15): VPCMPEQ %xmm1, %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret +# endif - .p2align 4 -L(zero): - xorl %eax, %eax - ret - - .p2align 4 + .p2align 4,, 10 L(between_16_31): /* From 16 to 31 bytes. No branch when size == 16. */ vmovdqu (%rsi), %xmm2 @@ -501,11 +521,17 @@ L(between_16_31): VPCMPEQ (%rdi), %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret # ifdef USE_AS_WMEMCMP + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + ret + .p2align 4 L(one_or_less): jb L(zero) @@ -520,22 +546,20 @@ L(one_or_less): # else .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. - */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - jz L(zero_4_7) - sbbl %eax, %eax - orl $1, %eax -L(zero_4_7): +L(between_2_3): + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + bswap %eax + bswap %ecx + shrl %eax + shrl %ecx + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx + /* Subtraction is okay because the upper bit is zero. */ + subl %ecx, %eax /* No ymm register was touched. */ ret # endif diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S index e10555638..4080fc187 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse2.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S @@ -17,8 +17,8 @@ . */ #if IS_IN (libc) -# ifndef memcmp -# define memcmp __memcmp_sse2 +# ifndef MEMCMP +# define MEMCMP __memcmp_sse2 # endif # ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S deleted file mode 100644 index cd57c1e2c..000000000 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ /dev/null @@ -1,803 +0,0 @@ -/* memcmp with SSE4.1, wmemcmp with SSE4.1 - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -# include - -# ifndef MEMCMP -# define MEMCMP __memcmp_sse4_1 -# endif - -#ifdef USE_AS_WMEMCMP -# define CMPEQ pcmpeqd -# define CHAR_SIZE 4 -#else -# define CMPEQ pcmpeqb -# define CHAR_SIZE 1 -#endif - - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. 
- memcmp has to use UNSIGNED comparison for elemnts. -*/ - - .section .text.sse4.1,"ax",@progbits -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -# endif - cmp $79, %RDX_LP - ja L(79bytesormore) - - cmp $CHAR_SIZE, %RDX_LP - jbe L(firstbyte) - - /* N in (CHAR_SIZE, 79) bytes. */ - cmpl $32, %edx - ja L(more_32_bytes) - - cmpl $16, %edx - jae L(16_to_32_bytes) - -# ifndef USE_AS_WMEMCMP - cmpl $8, %edx - jae L(8_to_16_bytes) - - cmpl $4, %edx - jb L(2_to_3_bytes) - - movl (%rdi), %eax - movl (%rsi), %ecx - - bswap %eax - bswap %ecx - - shlq $32, %rax - shlq $32, %rcx - - movl -4(%rdi, %rdx), %edi - movl -4(%rsi, %rdx), %esi - - bswap %edi - bswap %esi - - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - cmovne %edx, %eax - sbbl %ecx, %ecx - orl %ecx, %eax - ret - - .p2align 4,, 8 -L(2_to_3_bytes): - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - shll $8, %eax - shll $8, %ecx - bswap %eax - bswap %ecx - movzbl -1(%rdi, %rdx), %edi - movzbl -1(%rsi, %rdx), %esi - orl %edi, %eax - orl %esi, %ecx - subl %ecx, %eax - ret - - .p2align 4,, 8 -L(8_to_16_bytes): - movq (%rdi), %rax - movq (%rsi), %rcx - - bswap %rax - bswap %rcx - - subq %rcx, %rax - jne L(8_to_16_bytes_done) - - movq -8(%rdi, %rdx), %rax - movq -8(%rsi, %rdx), %rcx - - bswap %rax - bswap %rcx - - subq %rcx, %rax - -L(8_to_16_bytes_done): - cmovne %edx, %eax - sbbl %ecx, %ecx - orl %ecx, %eax - ret -# else - xorl %eax, %eax - movl (%rdi), %ecx - cmpl (%rsi), %ecx - jne L(8_to_16_bytes_done) - movl 4(%rdi), %ecx - cmpl 4(%rsi), %ecx - jne L(8_to_16_bytes_done) - movl -4(%rdi, %rdx), %ecx - cmpl -4(%rsi, %rdx), %ecx - jne L(8_to_16_bytes_done) - ret -# endif - - .p2align 4,, 3 -L(ret_zero): - xorl %eax, %eax -L(zero): - ret - - .p2align 4,, 8 -L(firstbyte): - jb L(ret_zero) -# ifdef USE_AS_WMEMCMP - xorl %eax, %eax - movl (%rdi), %ecx - cmpl (%rsi), %ecx - je L(zero) -L(8_to_16_bytes_done): - setg %al - leal -1(%rax, %rax), 
%eax -# else - movzbl (%rdi), %eax - movzbl (%rsi), %ecx - sub %ecx, %eax -# endif - ret - - .p2align 4 -L(vec_return_begin_48): - addq $16, %rdi - addq $16, %rsi -L(vec_return_begin_32): - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - movl 32(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl 32(%rsi, %rax), %ecx - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl 32(%rsi, %rax), %ecx - movzbl 32(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4 -L(vec_return_begin_16): - addq $16, %rdi - addq $16, %rsi -L(vec_return_begin): - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - movl (%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (%rsi, %rax), %ecx - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (%rsi, %rax), %ecx - movzbl (%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4 -L(vec_return_end_16): - subl $16, %edx -L(vec_return_end): - bsfl %eax, %eax - addl %edx, %eax -# ifdef USE_AS_WMEMCMP - movl -16(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl -16(%rsi, %rax), %ecx - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl -16(%rsi, %rax), %ecx - movzbl -16(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4,, 8 -L(more_32_bytes): - movdqu (%rdi), %xmm0 - movdqu (%rsi), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu 16(%rdi), %xmm0 - movdqu 16(%rsi), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - cmpl $64, %edx - jbe L(32_to_64_bytes) - movdqu 32(%rdi), %xmm0 - movdqu 32(%rsi), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - .p2align 4,, 6 -L(32_to_64_bytes): - movdqu -32(%rdi, %rdx), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end_16) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - .p2align 4 -L(16_to_32_bytes): - movdqu 
(%rdi), %xmm0 - movdqu (%rsi), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - - .p2align 4 -L(79bytesormore): - movdqu (%rdi), %xmm0 - movdqu (%rsi), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - - mov %rsi, %rcx - and $-16, %rsi - add $16, %rsi - sub %rsi, %rcx - - sub %rcx, %rdi - add %rcx, %rdx - test $0xf, %rdi - jz L(2aligned) - - cmp $128, %rdx - ja L(128bytesormore) - - .p2align 4,, 6 -L(less128bytes): - movdqu (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqu 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqu 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - cmp $96, %rdx - jb L(32_to_64_bytes) - - addq $64, %rdi - addq $64, %rsi - subq $64, %rdx - - .p2align 4,, 6 -L(last_64_bytes): - movdqu (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqu -32(%rdi, %rdx), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end_16) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - .p2align 4 -L(128bytesormore): - cmp $256, %rdx - ja L(unaligned_loop) -L(less256bytes): - movdqu (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 
- pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqu 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqu 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - addq $64, %rdi - addq $64, %rsi - - movdqu (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqu 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqu 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqu 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - addq $-128, %rdx - subq $-64, %rsi - subq $-64, %rdi - - cmp $64, %rdx - ja L(less128bytes) - - cmp $32, %rdx - ja L(last_64_bytes) - - movdqu -32(%rdi, %rdx), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end_16) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - .p2align 4 -L(unaligned_loop): -# ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %R8_LP -# else - mov __x86_data_cache_size_half(%rip), %R8_LP -# endif - movq %r8, %r9 - addq %r8, %r8 - addq %r9, %r8 - cmpq %r8, %rdx - ja L(L2_L3_cache_unaligned) - sub $64, %rdx - .p2align 4 -L(64bytesormore_loop): - movdqu (%rdi), %xmm0 - movdqu 16(%rdi), %xmm1 - movdqu 32(%rdi), %xmm2 - movdqu 48(%rdi), %xmm3 - - CMPEQ (%rsi), %xmm0 - CMPEQ 16(%rsi), %xmm1 - CMPEQ 32(%rsi), %xmm2 - CMPEQ 48(%rsi), %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - incw %ax - jnz L(64bytesormore_loop_end) - - add $64, %rsi - add $64, %rdi - sub $64, %rdx - ja L(64bytesormore_loop) - - .p2align 4,, 6 -L(loop_tail): - addq %rdx, %rdi - movdqu (%rdi), %xmm0 
- movdqu 16(%rdi), %xmm1 - movdqu 32(%rdi), %xmm2 - movdqu 48(%rdi), %xmm3 - - addq %rdx, %rsi - movdqu (%rsi), %xmm4 - movdqu 16(%rsi), %xmm5 - movdqu 32(%rsi), %xmm6 - movdqu 48(%rsi), %xmm7 - - CMPEQ %xmm4, %xmm0 - CMPEQ %xmm5, %xmm1 - CMPEQ %xmm6, %xmm2 - CMPEQ %xmm7, %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - incw %ax - jnz L(64bytesormore_loop_end) - ret - -L(L2_L3_cache_unaligned): - subq $64, %rdx - .p2align 4 -L(L2_L3_unaligned_128bytes_loop): - prefetchnta 0x1c0(%rdi) - prefetchnta 0x1c0(%rsi) - - movdqu (%rdi), %xmm0 - movdqu 16(%rdi), %xmm1 - movdqu 32(%rdi), %xmm2 - movdqu 48(%rdi), %xmm3 - - CMPEQ (%rsi), %xmm0 - CMPEQ 16(%rsi), %xmm1 - CMPEQ 32(%rsi), %xmm2 - CMPEQ 48(%rsi), %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - incw %ax - jnz L(64bytesormore_loop_end) - - add $64, %rsi - add $64, %rdi - sub $64, %rdx - ja L(L2_L3_unaligned_128bytes_loop) - jmp L(loop_tail) - - - /* This case is for machines which are sensitive for unaligned - * instructions. 
*/ - .p2align 4 -L(2aligned): - cmp $128, %rdx - ja L(128bytesormorein2aligned) -L(less128bytesin2aligned): - movdqa (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqa 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqa 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqa 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - cmp $96, %rdx - jb L(32_to_64_bytes) - - addq $64, %rdi - addq $64, %rsi - subq $64, %rdx - - .p2align 4,, 6 -L(aligned_last_64_bytes): - movdqa (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqa 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqu -32(%rdi, %rdx), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end_16) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - .p2align 4 -L(128bytesormorein2aligned): - cmp $256, %rdx - ja L(aligned_loop) -L(less256bytesin2alinged): - movdqa (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqa 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_16) - - movdqa 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqa 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - addq $64, %rdi - addq $64, %rsi - - movdqa (%rdi), %xmm1 - CMPEQ (%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin) - - movdqa 16(%rdi), %xmm1 - CMPEQ 16(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax 
- jnz L(vec_return_begin_16) - - movdqa 32(%rdi), %xmm1 - CMPEQ 32(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_32) - - movdqa 48(%rdi), %xmm1 - CMPEQ 48(%rsi), %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_begin_48) - - addq $-128, %rdx - subq $-64, %rsi - subq $-64, %rdi - - cmp $64, %rdx - ja L(less128bytesin2aligned) - - cmp $32, %rdx - ja L(aligned_last_64_bytes) - - movdqu -32(%rdi, %rdx), %xmm0 - movdqu -32(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end_16) - - movdqu -16(%rdi, %rdx), %xmm0 - movdqu -16(%rsi, %rdx), %xmm1 - CMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - incw %ax - jnz L(vec_return_end) - ret - - .p2align 4 -L(aligned_loop): -# ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %R8_LP -# else - mov __x86_data_cache_size_half(%rip), %R8_LP -# endif - movq %r8, %r9 - addq %r8, %r8 - addq %r9, %r8 - cmpq %r8, %rdx - ja L(L2_L3_cache_aligned) - - sub $64, %rdx - .p2align 4 -L(64bytesormore_loopin2aligned): - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm1 - movdqa 32(%rdi), %xmm2 - movdqa 48(%rdi), %xmm3 - - CMPEQ (%rsi), %xmm0 - CMPEQ 16(%rsi), %xmm1 - CMPEQ 32(%rsi), %xmm2 - CMPEQ 48(%rsi), %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - incw %ax - jnz L(64bytesormore_loop_end) - add $64, %rsi - add $64, %rdi - sub $64, %rdx - ja L(64bytesormore_loopin2aligned) - jmp L(loop_tail) - -L(L2_L3_cache_aligned): - subq $64, %rdx - .p2align 4 -L(L2_L3_aligned_128bytes_loop): - prefetchnta 0x1c0(%rdi) - prefetchnta 0x1c0(%rsi) - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm1 - movdqa 32(%rdi), %xmm2 - movdqa 48(%rdi), %xmm3 - - CMPEQ (%rsi), %xmm0 - CMPEQ 16(%rsi), %xmm1 - CMPEQ 32(%rsi), %xmm2 - CMPEQ 48(%rsi), %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - incw %ax - jnz L(64bytesormore_loop_end) - - addq $64, %rsi - addq $64, %rdi - subq $64, %rdx - ja 
L(L2_L3_aligned_128bytes_loop) - jmp L(loop_tail) - - .p2align 4 -L(64bytesormore_loop_end): - pmovmskb %xmm0, %ecx - incw %cx - jnz L(loop_end_ret) - - pmovmskb %xmm1, %ecx - notw %cx - sall $16, %ecx - jnz L(loop_end_ret) - - pmovmskb %xmm2, %ecx - notw %cx - shlq $32, %rcx - jnz L(loop_end_ret) - - addq $48, %rdi - addq $48, %rsi - movq %rax, %rcx - - .p2align 4,, 6 -L(loop_end_ret): - bsfq %rcx, %rcx -# ifdef USE_AS_WMEMCMP - movl (%rdi, %rcx), %eax - xorl %edx, %edx - cmpl (%rsi, %rcx), %eax - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (%rdi, %rcx), %eax - movzbl (%rsi, %rcx), %ecx - subl %ecx, %eax -# endif - ret -END (MEMCMP) -#endif diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S index b80a29d4b..9d991e5c7 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S @@ -16,8 +16,10 @@ License along with the GNU C Library; if not, see . */ -#ifndef memcmp -# define memcmp __memcmpeq_sse2 +#if IS_IN (libc) +# define MEMCMP __memcmpeq_sse2 +#else +# define MEMCMP __memcmpeq #endif #define USE_AS_MEMCMPEQ 1 #include "memcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S new file mode 100644 index 000000000..2d3a6ccb7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-erms.S @@ -0,0 +1,72 @@ +/* memcpy/mempcpy/memmove implement with rep movsb + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + + +#include + +#if defined USE_MULTIARCH && IS_IN (libc) + .text +ENTRY (__mempcpy_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_erms) + +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + mov %RDI_LP, %RAX_LP + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f + add %RDX_LP, %RAX_LP + jmp L(start_movsb) +END (__mempcpy_erms) + +ENTRY (__memmove_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_erms) + +ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz 2f +L(start_movsb): + mov %RDX_LP, %RCX_LP + cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. */ + je 2f + lea (%rsi,%rcx), %RDX_LP + cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +strong_alias (__memmove_erms, __memcpy_erms) +strong_alias (__memmove_chk_erms, __memcpy_chk_erms) +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index af51177d5..04747133b 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -118,7 +118,13 @@ # define LARGE_LOAD_SIZE (VEC_SIZE * 4) #endif -/* Amount to shift rdx by to compare for memcpy_large_4x. */ +/* Amount to shift __x86_shared_non_temporal_threshold by for + bound for memcpy_large_4x. This is essentially use to to + indicate that the copy is far beyond the scope of L3 + (assuming no user config x86_non_temporal_threshold) and to + use a more aggressively unrolled loop. 
NB: before + increasing the value also update initialization of + x86_non_temporal_threshold. */ #ifndef LOG_4X_MEMCPY_THRESH # define LOG_4X_MEMCPY_THRESH 4 #endif @@ -233,56 +239,6 @@ L(start): #endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) -# if VEC_SIZE == 16 -ENTRY (__mempcpy_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (__mempcpy_chk_erms) - -/* Only used to measure performance of REP MOVSB. */ -ENTRY (__mempcpy_erms) - mov %RDI_LP, %RAX_LP - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f - add %RDX_LP, %RAX_LP - jmp L(start_movsb) -END (__mempcpy_erms) - -ENTRY (__memmove_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memmove_chk_erms) - -ENTRY (__memmove_erms) - movq %rdi, %rax - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jz 2f -L(start_movsb): - mov %RDX_LP, %RCX_LP - cmp %RSI_LP, %RDI_LP - jb 1f - /* Source == destination is less common. */ - je 2f - lea (%rsi,%rcx), %RDX_LP - cmp %RDX_LP, %RDI_LP - jb L(movsb_backward) -1: - rep movsb -2: - ret -L(movsb_backward): - leaq -1(%rdi,%rcx), %rdi - leaq -1(%rsi,%rcx), %rsi - std - rep movsb - cld - ret -END (__memmove_erms) -strong_alias (__memmove_erms, __memcpy_erms) -strong_alias (__memmove_chk_erms, __memcpy_chk_erms) -# endif # ifdef SHARED ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) @@ -724,9 +680,14 @@ L(skip_short_movsb_check): .p2align 4,, 10 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_memcpy_2x_check): - cmp __x86_rep_movsb_threshold(%rip), %RDX_LP - jb L(more_8x_vec_check) + /* Entry from L(large_memcpy_2x) has a redundant load of + __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x) + is only use for the non-erms memmove which is generally less + common. 
*/ L(large_memcpy_2x): + mov __x86_shared_non_temporal_threshold(%rip), %R11_LP + cmp %R11_LP, %RDX_LP + jb L(more_8x_vec_check) /* To reach this point it is impossible for dst > src and overlap. Remaining to check is src > dst and overlap. rcx already contains dst - src. Negate rcx to get src - dst. If @@ -774,18 +735,21 @@ L(large_memcpy_2x): /* ecx contains -(dst - src). not ecx will return dst - src - 1 which works for testing aliasing. */ notl %ecx + movq %rdx, %r10 testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx jz L(large_memcpy_4x) - movq %rdx, %r10 - shrq $LOG_4X_MEMCPY_THRESH, %r10 - cmp __x86_shared_non_temporal_threshold(%rip), %r10 + /* r11 has __x86_shared_non_temporal_threshold. Shift it left + by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold. + */ + shlq $LOG_4X_MEMCPY_THRESH, %r11 + cmp %r11, %rdx jae L(large_memcpy_4x) /* edx will store remainder size for copying tail. */ andl $(PAGE_SIZE * 2 - 1), %edx /* r10 stores outer loop counter. */ - shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 + shrq $(LOG_PAGE_SIZE + 1), %r10 /* Copy 4x VEC at a time from 2 pages. */ .p2align 4 L(loop_large_memcpy_2x_outer): @@ -850,7 +814,6 @@ L(large_memcpy_2x_end): .p2align 4 L(large_memcpy_4x): - movq %rdx, %r10 /* edx will store remainder size for copying tail. */ andl $(PAGE_SIZE * 4 - 1), %edx /* r10 stores outer loop counter. 
*/ diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72..5e9beeeef 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb0..f300d7daf 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,318 @@ # include # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif # ifndef SECTION # define SECTION(p) p##.avx # endif -# define VEC_SIZE 32 +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. 
*/ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. 
*/ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). */ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. 
*/ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. 
*/ + lzcntq %rcx, %rcx + subq %rcx, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. */ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. 
*/ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. */ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. 
*/ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). 
If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). */ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6..91329b18d 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). 
*/ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will guarantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. 
*/ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. */ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. 
*/ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. */ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which in turn is necessary for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). 
*/ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. 
*/ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecessary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 where not-equals and 0 where equals in k1 (used to mask later + on). 
*/ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. 
*/ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 1af668af0..c0bf2875d 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -10,15 +10,18 @@ # define VMOVU vmovdqu # define VMOVA vmovdqa -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastb %xmm0, %ymm0 + movq r, %rax; -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - vmovd d, %xmm0; \ - movq r, %rax; \ - vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 + +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 # ifndef SECTION # define SECTION(p) p##.avx @@ -30,5 +33,6 @@ # 
define WMEMSET_SYMBOL(p,s) p##_avx2_##s # endif +# define USE_XMM_LESS_VEC # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index f14d6f849..5241216a7 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -15,13 +15,19 @@ # define VZEROUPPER -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastb d, %VEC0 +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastb d, %VEC0; \ + movq r, %rax -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastd d, %VEC0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() # define SECTION(p) p##.evex512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S new file mode 100644 index 000000000..e83cccc73 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-erms.S @@ -0,0 +1,44 @@ +/* memset implement with rep stosb + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> + +#if defined USE_MULTIARCH && IS_IN (libc) + .text +ENTRY (__memset_chk_erms) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_erms) + +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) + /* Skip zero length. */ + test %RDX_LP, %RDX_LP + jz L(stosb_return_zero) + mov %RDX_LP, %RCX_LP + movzbl %sil, %eax + mov %RDI_LP, %RDX_LP + rep stosb + mov %RDX_LP, %RAX_LP + ret +L(stosb_return_zero): + movq %rdi, %rax + ret +END (__memset_erms) +#endif diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S index 64b09e77c..637002150 100644 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S @@ -15,13 +15,19 @@ # define VZEROUPPER -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastb d, %VEC0 +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastb d, %VEC0; \ + movq r, %rax -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ - movq r, %rax; \ - vpbroadcastd d, %VEC0 +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax + +# define MEMSET_VDUP_TO_VEC0_HIGH() +# define MEMSET_VDUP_TO_VEC0_LOW() + +# define WMEMSET_VDUP_TO_VEC0_HIGH() +# define WMEMSET_VDUP_TO_VEC0_LOW() # define SECTION(p) p##.evex # define MEMSET_SYMBOL(p,s) p##_evex_##s diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S index 8a6f0c561..3d92f6993 100644 --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S @@ -30,9 +30,7 @@ # endif # undef weak_alias -# define weak_alias(original, alias) \ - .weak bzero; bzero = __bzero - +# define weak_alias(original, alias) # undef 
strong_alias # define strong_alias(ignored1, ignored2) #endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 1e0511c79..905d0fa46 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -1,4 +1,4 @@ -/* memset/bzero with unaligned store and rep stosb +/* memset with unaligned store and rep stosb Copyright (C) 2016-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -58,8 +58,10 @@ #ifndef MOVQ # if VEC_SIZE > 16 # define MOVQ vmovq +# define MOVD vmovd # else # define MOVQ movq +# define MOVD movd # endif #endif @@ -72,9 +74,29 @@ #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 # define END_REG rcx # define LOOP_REG rdi +# define LESS_VEC_REG rax #else # define END_REG rdi # define LOOP_REG rdx +# define LESS_VEC_REG rdi +#endif + +#ifdef USE_XMM_LESS_VEC +# define XMM_SMALL 1 +#else +# define XMM_SMALL 0 +#endif + +#ifdef USE_LESS_VEC_MASK_STORE +# define SET_REG64 rcx +# define SET_REG32 ecx +# define SET_REG16 cx +# define SET_REG8 cl +#else +# define SET_REG64 rsi +# define SET_REG32 esi +# define SET_REG16 si +# define SET_REG8 sil #endif #define PAGE_SIZE 4096 @@ -88,18 +110,7 @@ # error SECTION is not defined! #endif - .section SECTION(.text),"ax",@progbits -#if VEC_SIZE == 16 && IS_IN (libc) -ENTRY (__bzero) - mov %RDI_LP, %RAX_LP /* Set return value. */ - mov %RSI_LP, %RDX_LP /* Set n. 
*/ - xorl %esi, %esi - pxor %XMM0, %XMM0 - jmp L(entry_from_bzero) -END (__bzero) -weak_alias (__bzero, bzero) -#endif - + .section SECTION(.text), "ax", @progbits #if IS_IN (libc) # if defined SHARED ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) @@ -110,8 +121,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) shl $2, %RDX_LP - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) - jmp L(entry_from_bzero) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx + jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) END (WMEMSET_SYMBOL (__wmemset, unaligned)) #endif @@ -123,14 +138,15 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) # ifdef __ILP32__ /* Clear the upper 32 bits. */ mov %edx, %edx # endif -L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +L(entry_from_wmemset): cmpq $(VEC_SIZE * 2), %rdx ja L(more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ @@ -140,37 +156,6 @@ L(entry_from_bzero): #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) -# if VEC_SIZE == 16 -ENTRY (__memset_chk_erms) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memset_chk_erms) - -/* Only used to measure performance of REP STOSB. */ -ENTRY (__memset_erms) - /* Skip zero length. */ - test %RDX_LP, %RDX_LP - jnz L(stosb) - movq %rdi, %rax - ret -# else -/* Provide a hidden symbol to debugger. 
*/ - .hidden MEMSET_SYMBOL (__memset, erms) -ENTRY (MEMSET_SYMBOL (__memset, erms)) -# endif -L(stosb): - mov %RDX_LP, %RCX_LP - movzbl %sil, %eax - mov %RDI_LP, %RDX_LP - rep stosb - mov %RDX_LP, %RAX_LP - VZEROUPPER_RETURN -# if VEC_SIZE == 16 -END (__memset_erms) -# else -END (MEMSET_SYMBOL (__memset, erms)) -# endif - # if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) cmp %RDX_LP, %RCX_LP @@ -179,27 +164,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) # ifdef __ILP32__ /* Clear the upper 32 bits. */ mov %edx, %edx # endif cmp $VEC_SIZE, %RDX_LP jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH () cmp $(VEC_SIZE * 2), %RDX_LP ja L(stosb_more_2x_vec) - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. - */ - VMOVU %VEC(0), (%rax) - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) VZEROUPPER_RETURN #endif - .p2align 4,, 10 + .p2align 4,, 4 L(last_2x_vec): #ifdef USE_LESS_VEC_MASK_STORE - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) #else VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) @@ -212,6 +197,7 @@ L(last_2x_vec): #ifdef USE_LESS_VEC_MASK_STORE .p2align 4,, 10 L(less_vec): +L(less_vec_from_wmemset): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 # error Unsupported VEC_SIZE! @@ -262,28 +248,18 @@ L(stosb_more_2x_vec): /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] and (4x, 8x] jump to target. */ L(more_2x_vec): - - /* Two different methods of setting up pointers / compare. 
The - two methods are based on the fact that EVEX/AVX512 mov - instructions take more bytes then AVX2/SSE2 mov instructions. As - well that EVEX/AVX512 machines also have fast LEA_BID. Both - setup and END_REG to avoid complex address mode. For EVEX/AVX512 - this saves code size and keeps a few targets in one fetch block. - For AVX2/SSE2 this helps prevent AGU bottlenecks. */ -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + - LOOP_4X_OFFSET) with LEA_BID. */ - - /* END_REG is rcx for EVEX/AVX512. */ - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG -#endif - - /* Stores to first 2x VEC before cmp as any path forward will - require it. */ - VMOVU %VEC(0), (%rax) - VMOVU %VEC(0), VEC_SIZE(%rax) + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + /* Two different methods of setting up pointers / compare. The two + methods are based on the fact that EVEX/AVX512 mov instructions take + more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512 + machines also have fast LEA_BID. Both setup and END_REG to avoid complex + address mode. For EVEX/AVX512 this saves code size and keeps a few + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU + bottlenecks. */ #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ addq %rdx, %END_REG @@ -292,6 +268,15 @@ L(more_2x_vec): cmpq $(VEC_SIZE * 4), %rdx jbe L(last_2x_vec) + +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with + LEA_BID. */ + + /* END_REG is rcx for EVEX/AVX512. */ + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +#endif + /* Store next 2x vec regardless. */ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) @@ -355,65 +340,93 @@ L(stosb_local): /* Define L(less_vec) only if not otherwise defined. 
*/ .p2align 4 L(less_vec): + /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to + xmm). This only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () +L(less_vec_from_wmemset): #endif L(cross_page): #if VEC_SIZE > 32 cmpl $32, %edx - jae L(between_32_63) + jge L(between_32_63) #endif #if VEC_SIZE > 16 cmpl $16, %edx - jae L(between_16_31) + jge L(between_16_31) +#endif +#ifndef USE_XMM_LESS_VEC + MOVQ %XMM0, %SET_REG64 #endif - MOVQ %XMM0, %rdi cmpl $8, %edx - jae L(between_8_15) + jge L(between_8_15) cmpl $4, %edx - jae L(between_4_7) + jge L(between_4_7) cmpl $1, %edx - ja L(between_2_3) - jb L(return) - movb %sil, (%rax) - VZEROUPPER_RETURN + jg L(between_2_3) + jl L(between_0_0) + movb %SET_REG8, (%LESS_VEC_REG) +L(between_0_0): + ret - /* Align small targets only if not doing so would cross a fetch - line. */ + /* Align small targets only if not doing so would cross a fetch line. + */ #if VEC_SIZE > 32 .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) /* From 32 to 63. No branch when size == 32. */ L(between_32_63): - VMOVU %YMM0, (%rax) - VMOVU %YMM0, -32(%rax, %rdx) + VMOVU %YMM0, (%LESS_VEC_REG) + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) VZEROUPPER_RETURN #endif #if VEC_SIZE >= 32 - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) L(between_16_31): /* From 16 to 31. No branch when size == 16. */ - VMOVU %XMM0, (%rax) - VMOVU %XMM0, -16(%rax, %rdx) - VZEROUPPER_RETURN + VMOVU %XMM0, (%LESS_VEC_REG) + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) + ret #endif - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. + */ + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) L(between_8_15): /* From 8 to 15. No branch when size == 8. 
*/ - movq %rdi, (%rax) - movq %rdi, -8(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) +#else + movq %SET_REG64, (%LESS_VEC_REG) + movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) +#endif + ret - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. + */ + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) L(between_4_7): /* From 4 to 7. No branch when size == 4. */ - movl %edi, (%rax) - movl %edi, -4(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) +#else + movl %SET_REG32, (%LESS_VEC_REG) + movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) +#endif + ret - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + /* 4 * XMM_SMALL for the third mov for AVX2. */ + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) L(between_2_3): /* From 2 to 3. No branch when size == 2. */ - movw %di, (%rax) - movb %dil, -1(%rax, %rdx) - VZEROUPPER_RETURN +#ifdef USE_XMM_LESS_VEC + movb %SET_REG8, (%rdi) + movb %SET_REG8, 1(%rdi) + movb %SET_REG8, -1(%rdi, %rdx) +#else + movw %SET_REG16, (%LESS_VEC_REG) + movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) +#endif + ret END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 000000000..2b77a59d5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S new file mode 100644 index 000000000..09957fc3c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S @@ -0,0 +1,15 @@ +#ifndef STRCMP +# define STRCMP __strcasecmp_l_avx2_rtm +#endif + +#define _GLABEL(x) x ## _rtm +#define GLABEL(x) _GLABEL(x) + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcasecmp_l-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S similarity index 87% rename from sysdeps/x86_64/multiarch/strcasecmp_l-avx.S rename to sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S index 7ec7c21b5..e2762f2a2 100644 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S @@ -1,4 +1,4 @@ -/* strcasecmp_l optimized with AVX. +/* strcasecmp_l optimized with AVX2. Copyright (C) 2017-2022 Free Software Foundation, Inc. 
This file is part of the GNU C Library. @@ -16,7 +16,8 @@ License along with the GNU C Library; if not, see . */ -#define STRCMP_SSE42 __strcasecmp_l_avx -#define USE_AVX 1 +#ifndef STRCMP +# define STRCMP __strcasecmp_l_avx2 +#endif #define USE_AS_STRCASECMP_L -#include "strcmp-sse42.S" +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S similarity index 83% rename from sysdeps/x86_64/multiarch/strncase_l-avx.S rename to sysdeps/x86_64/multiarch/strcasecmp_l-evex.S index b51b86d22..58642db74 100644 --- a/sysdeps/x86_64/multiarch/strncase_l-avx.S +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S @@ -1,4 +1,4 @@ -/* strncasecmp_l optimized with AVX. +/* strcasecmp_l optimized with EVEX. Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,7 +16,8 @@ License along with the GNU C Library; if not, see . */ -#define STRCMP_SSE42 __strncasecmp_l_avx -#define USE_AVX 1 -#define USE_AS_STRNCASECMP_L -#include "strcmp-sse42.S" +#ifndef STRCMP +# define STRCMP __strcasecmp_l_evex +#endif +#define USE_AS_STRCASECMP_L +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 086cabf76..1a916cc95 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -48,13 +48,13 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (STRCHR) +ENTRY_P2ALIGN (STRCHR, 5) /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 movl %edi, %eax andl $(PAGE_SIZE - 1), %eax VPBROADCAST %xmm0, %ymm0 - vpxor %xmm9, %xmm9, %xmm9 + vpxor %xmm1, %xmm1, %xmm1 /* Check if we cross page boundary with one vector load. */ cmpl $(PAGE_SIZE - VEC_SIZE), %eax @@ -62,37 +62,29 @@ ENTRY (STRCHR) /* Check the first VEC_SIZE bytes. Search for both CHAR and the null byte. 
*/ - vmovdqu (%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqu (%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jz L(aligned_more) tzcntl %eax, %eax # ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (%rdi, %rax), %CHAR_REG - jne L(zero) -# endif - addq %rdi, %rax - VZEROUPPER_RETURN - - /* .p2align 5 helps keep performance more consistent if ENTRY() - alignment % 32 was either 16 or 0. As well this makes the - alignment % 32 of the loop_4x_vec fixed which makes tuning it - easier. */ - .p2align 5 -L(first_vec_x4): - tzcntl %eax, %eax - addq $(VEC_SIZE * 3 + 1), %rdi -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ + /* Found CHAR or the null byte. */ cmp (%rdi, %rax), %CHAR_REG + /* NB: Use a branch instead of cmovcc here. The expectation is + that with strchr the user will branch based on input being + null. Since this branch will be 100% predictive of the user + branch a branch miss here should save what otherwise would + be branch miss in the user code. Otherwise using a branch 1) + saves code size and 2) is faster in highly predictable + environments. */ jne L(zero) # endif addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN # ifndef USE_AS_STRCHRNUL L(zero): @@ -103,7 +95,8 @@ L(zero): .p2align 4 L(first_vec_x1): - tzcntl %eax, %eax + /* Use bsf to save code size. */ + bsfl %eax, %eax incq %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ @@ -113,9 +106,10 @@ L(first_vec_x1): addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x2): - tzcntl %eax, %eax + /* Use bsf to save code size. */ + bsfl %eax, %eax addq $(VEC_SIZE + 1), %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. 
*/ @@ -125,9 +119,10 @@ L(first_vec_x2): addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 8 L(first_vec_x3): - tzcntl %eax, %eax + /* Use bsf to save code size. */ + bsfl %eax, %eax addq $(VEC_SIZE * 2 + 1), %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ @@ -137,6 +132,21 @@ L(first_vec_x3): addq %rdi, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(first_vec_x4): + /* Use bsf to save code size. */ + bsfl %eax, %eax + addq $(VEC_SIZE * 3 + 1), %rdi +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + + .p2align 4 L(aligned_more): /* Align data to VEC_SIZE - 1. This is the same number of @@ -146,90 +156,92 @@ L(aligned_more): L(cross_page_continue): /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - vmovdqa 1(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqa 1(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jnz L(first_vec_x1) - vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jnz L(first_vec_x2) - vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jnz L(first_vec_x3) - vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, 
%ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jnz L(first_vec_x4) - /* Align data to VEC_SIZE * 4 - 1. */ - addq $(VEC_SIZE * 4 + 1), %rdi - andq $-(VEC_SIZE * 4), %rdi + /* Align data to VEC_SIZE * 4 - 1. */ + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi .p2align 4 L(loop_4x_vec): /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm5 - vmovdqa (VEC_SIZE)(%rdi), %ymm6 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + vmovdqa 1(%rdi), %ymm6 + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 /* Leaves only CHARS matching esi as 0. */ - vpxor %ymm5, %ymm0, %ymm1 vpxor %ymm6, %ymm0, %ymm2 vpxor %ymm7, %ymm0, %ymm3 - vpxor %ymm8, %ymm0, %ymm4 - VPMINU %ymm1, %ymm5, %ymm1 VPMINU %ymm2, %ymm6, %ymm2 VPMINU %ymm3, %ymm7, %ymm3 - VPMINU %ymm4, %ymm8, %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 + + vpxor %ymm6, %ymm0, %ymm4 + vpxor %ymm7, %ymm0, %ymm5 + + VPMINU %ymm4, %ymm6, %ymm4 + VPMINU %ymm5, %ymm7, %ymm5 - VPMINU %ymm5, %ymm6, %ymm6 + VPMINU %ymm2, %ymm3, %ymm6 + VPMINU %ymm4, %ymm5, %ymm7 - VPCMPEQ %ymm6, %ymm9, %ymm6 - vpmovmskb %ymm6, %ecx + VPMINU %ymm6, %ymm7, %ymm7 + + VPCMPEQ %ymm7, %ymm1, %ymm7 + vpmovmskb %ymm7, %ecx subq $-(VEC_SIZE * 4), %rdi testl %ecx, %ecx jz L(loop_4x_vec) - - VPCMPEQ %ymm1, %ymm9, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax jnz L(last_vec_x0) - VPCMPEQ %ymm5, %ymm9, %ymm2 - vpmovmskb %ymm2, %eax + VPCMPEQ %ymm3, %ymm1, %ymm3 + vpmovmskb %ymm3, %eax testl %eax, %eax jnz L(last_vec_x1) - VPCMPEQ %ymm3, %ymm9, %ymm3 - vpmovmskb %ymm3, %eax + VPCMPEQ %ymm4, %ymm1, %ymm4 + vpmovmskb %ymm4, %eax /* rcx has combined result from all 4 VEC. 
It will only be used if the first 3 other VEC all did not contain a match. */ salq $32, %rcx orq %rcx, %rax tzcntq %rax, %rax - subq $(VEC_SIZE * 2), %rdi + subq $(VEC_SIZE * 2 - 1), %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (%rdi, %rax), %CHAR_REG @@ -239,10 +251,11 @@ L(loop_4x_vec): VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 10 L(last_vec_x0): - tzcntl %eax, %eax - addq $-(VEC_SIZE * 4), %rdi + /* Use bsf to save code size. */ + bsfl %eax, %eax + addq $-(VEC_SIZE * 4 - 1), %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (%rdi, %rax), %CHAR_REG @@ -251,16 +264,11 @@ L(last_vec_x0): addq %rdi, %rax VZEROUPPER_RETURN -# ifndef USE_AS_STRCHRNUL -L(zero_end): - xorl %eax, %eax - VZEROUPPER_RETURN -# endif - .p2align 4 + .p2align 4,, 10 L(last_vec_x1): tzcntl %eax, %eax - subq $(VEC_SIZE * 3), %rdi + subq $(VEC_SIZE * 3 - 1), %rdi # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (%rdi, %rax), %CHAR_REG @@ -269,18 +277,23 @@ L(last_vec_x1): addq %rdi, %rax VZEROUPPER_RETURN +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif /* Cold case for crossing page with first load. */ - .p2align 4 + .p2align 4,, 8 L(cross_page_boundary): movq %rdi, %rdx /* Align rdi to VEC_SIZE - 1. */ orq $(VEC_SIZE - 1), %rdi - vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm3 + VPCMPEQ %ymm2, %ymm1, %ymm2 + vpor %ymm3, %ymm2, %ymm3 + vpmovmskb %ymm3, %eax /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT so no need to manually mod edx. */ sarxl %edx, %eax, %eax @@ -291,13 +304,10 @@ L(cross_page_boundary): xorl %ecx, %ecx /* Found CHAR or the null byte. 
*/ cmp (%rdx, %rax), %CHAR_REG - leaq (%rdx, %rax), %rax - cmovne %rcx, %rax -# else - addq %rdx, %rax + jne L(zero_end) # endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + addq %rdx, %rax + VZEROUPPER_RETURN END (STRCHR) -# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S index f62cd9d14..ec739fb8f 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex.S +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -30,6 +30,7 @@ # ifdef USE_AS_WCSCHR # define VPBROADCAST vpbroadcastd # define VPCMP vpcmpd +# define VPTESTN vptestnmd # define VPMINU vpminud # define CHAR_REG esi # define SHIFT_REG ecx @@ -37,6 +38,7 @@ # else # define VPBROADCAST vpbroadcastb # define VPCMP vpcmpb +# define VPTESTN vptestnmb # define VPMINU vpminub # define CHAR_REG sil # define SHIFT_REG edx @@ -61,13 +63,11 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits -ENTRY (STRCHR) +ENTRY_P2ALIGN (STRCHR, 5) /* Broadcast CHAR to YMM0. */ VPBROADCAST %esi, %YMM0 movl %edi, %eax andl $(PAGE_SIZE - 1), %eax - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Check if we cross page boundary with one vector load. Otherwise it is safe to use an unaligned load. */ cmpl $(PAGE_SIZE - VEC_SIZE), %eax @@ -81,49 +81,35 @@ ENTRY (STRCHR) vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jz L(aligned_more) tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + /* NB: Use a branch instead of cmovcc here. The expectation is + that with strchr the user will branch based on input being + null. Since this branch will be 100% predictive of the user + branch a branch miss here should save what otherwise would + be branch miss in the user code. 
Otherwise using a branch 1) + saves code size and 2) is faster in highly predictable + environments. */ + jne L(zero) +# endif # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ leaq (%rdi, %rax, CHAR_SIZE), %rax # else addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (%rax), %CHAR_REG - jne L(zero) # endif ret - /* .p2align 5 helps keep performance more consistent if ENTRY() - alignment % 32 was either 16 or 0. As well this makes the - alignment % 32 of the loop_4x_vec fixed which makes tuning it - easier. */ - .p2align 5 -L(first_vec_x3): - tzcntl %eax, %eax -# ifndef USE_AS_STRCHRNUL - /* Found CHAR or the null byte. */ - cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG - jne L(zero) -# endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax - ret -# ifndef USE_AS_STRCHRNUL -L(zero): - xorl %eax, %eax - ret -# endif - .p2align 4 + .p2align 4,, 10 L(first_vec_x4): # ifndef USE_AS_STRCHRNUL /* Check to see if first match was CHAR (k0) or null (k1). */ @@ -144,9 +130,18 @@ L(first_vec_x4): leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ret +# ifndef USE_AS_STRCHRNUL +L(zero): + xorl %eax, %eax + ret +# endif + + .p2align 4 L(first_vec_x1): - tzcntl %eax, %eax + /* Use bsf here to save 1-byte keeping keeping the block in 1x + fetch block. eax guranteed non-zero. */ + bsfl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG @@ -158,7 +153,7 @@ L(first_vec_x1): leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4 + .p2align 4,, 10 L(first_vec_x2): # ifndef USE_AS_STRCHRNUL /* Check to see if first match was CHAR (k0) or null (k1). 
*/ @@ -179,6 +174,21 @@ L(first_vec_x2): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret + .p2align 4,, 10 +L(first_vec_x3): + /* Use bsf here to save 1-byte keeping keeping the block in 1x + fetch block. eax guranteed non-zero. */ + bsfl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) +# endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + .p2align 4 L(aligned_more): /* Align data to VEC_SIZE. */ @@ -195,7 +205,7 @@ L(cross_page_continue): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x1) @@ -206,7 +216,7 @@ L(cross_page_continue): /* Each bit in K0 represents a CHAR in YMM1. */ VPCMP $0, %YMM1, %YMM0, %k0 /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMM1, %YMMZERO, %k1 + VPTESTN %YMM1, %YMM1, %k1 kortestd %k0, %k1 jnz L(first_vec_x2) @@ -215,7 +225,7 @@ L(cross_page_continue): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x3) @@ -224,7 +234,7 @@ L(cross_page_continue): /* Each bit in K0 represents a CHAR in YMM1. */ VPCMP $0, %YMM1, %YMM0, %k0 /* Each bit in K1 represents a CHAR in YMM1. 
*/ - VPCMP $0, %YMM1, %YMMZERO, %k1 + VPTESTN %YMM1, %YMM1, %k1 kortestd %k0, %k1 jnz L(first_vec_x4) @@ -265,33 +275,33 @@ L(loop_4x_vec): VPMINU %YMM3, %YMM4, %YMM4 VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} - VPCMP $0, %YMMZERO, %YMM4, %k1 + VPTESTN %YMM4, %YMM4, %k1 kmovd %k1, %ecx subq $-(VEC_SIZE * 4), %rdi testl %ecx, %ecx jz L(loop_4x_vec) - VPCMP $0, %YMMZERO, %YMM1, %k0 + VPTESTN %YMM1, %YMM1, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(last_vec_x1) - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax testl %eax, %eax jnz L(last_vec_x2) - VPCMP $0, %YMMZERO, %YMM3, %k0 + VPTESTN %YMM3, %YMM3, %k0 kmovd %k0, %eax /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ # ifdef USE_AS_WCSCHR sall $8, %ecx orl %ecx, %eax - tzcntl %eax, %eax + bsfl %eax, %eax # else salq $32, %rcx orq %rcx, %rax - tzcntq %rax, %rax + bsfq %rax, %rax # endif # ifndef USE_AS_STRCHRNUL /* Check if match was CHAR or null. */ @@ -303,28 +313,28 @@ L(loop_4x_vec): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret -# ifndef USE_AS_STRCHRNUL -L(zero_end): - xorl %eax, %eax - ret + .p2align 4,, 8 +L(last_vec_x1): + bsfl %eax, %eax +# ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif - .p2align 4 -L(last_vec_x1): - tzcntl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Check if match was null. */ - cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + cmp (%rax), %CHAR_REG jne L(zero_end) # endif - /* NB: Multiply sizeof char type (1 or 4) to get the number of - bytes. */ - leaq (%rdi, %rax, CHAR_SIZE), %rax + ret - .p2align 4 + .p2align 4,, 8 L(last_vec_x2): - tzcntl %eax, %eax + bsfl %eax, %eax # ifndef USE_AS_STRCHRNUL /* Check if match was null. */ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG @@ -336,7 +346,7 @@ L(last_vec_x2): ret /* Cold case for crossing page with first load. */ - .p2align 4 + .p2align 4,, 8 L(cross_page_boundary): movq %rdi, %rdx /* Align rdi. 
*/ @@ -346,9 +356,9 @@ L(cross_page_boundary): vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %eax - /* Remove the leading bits. */ + /* Remove the leading bits. */ # ifdef USE_AS_WCSCHR movl %edx, %SHIFT_REG /* NB: Divide shift count by 4 since each bit in K1 represent 4 @@ -360,20 +370,24 @@ L(cross_page_boundary): /* If eax is zero continue. */ testl %eax, %eax jz L(cross_page_continue) - tzcntl %eax, %eax -# ifndef USE_AS_STRCHRNUL - /* Check to see if match was CHAR or null. */ - cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG - jne L(zero_end) -# endif + bsfl %eax, %eax + # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ leaq (%rdx, %rax, CHAR_SIZE), %rax # else addq %rdx, %rax +# endif +# ifndef USE_AS_STRCHRNUL + /* Check to see if match was CHAR or null. */ + cmp (%rax), %CHAR_REG + je L(cross_page_ret) +L(zero_end): + xorl %eax, %eax +L(cross_page_ret): # endif ret END (STRCHR) -# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index 9c73b5899..8da09bd86 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -20,45 +20,146 @@ # include +# if defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# endif + # ifndef STRCMP # define STRCMP __strcmp_avx2 # endif # define PAGE_SIZE 4096 -/* VEC_SIZE = Number of bytes in a ymm register */ + /* VEC_SIZE = Number of bytes in a ymm register. */ # define VEC_SIZE 32 -/* Shift for dividing by (VEC_SIZE * 4). */ -# define DIVIDE_BY_VEC_4_SHIFT 7 -# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -# endif +# define VMOVU vmovdqu +# define VMOVA vmovdqa # ifdef USE_AS_WCSCMP -/* Compare packed dwords. */ + /* Compare packed dwords. 
*/ # define VPCMPEQ vpcmpeqd -/* Compare packed dwords and store minimum. */ + /* Compare packed dwords and store minimum. */ # define VPMINU vpminud -/* 1 dword char == 4 bytes. */ + /* 1 dword char == 4 bytes. */ # define SIZE_OF_CHAR 4 # else -/* Compare packed bytes. */ + /* Compare packed bytes. */ # define VPCMPEQ vpcmpeqb -/* Compare packed bytes and store minimum. */ + /* Compare packed bytes and store minimum. */ # define VPMINU vpminub -/* 1 byte char == 1 byte. */ + /* 1 byte char == 1 byte. */ # define SIZE_OF_CHAR 1 # endif +# ifdef USE_AS_STRNCMP +# define LOOP_REG r9d +# define LOOP_REG64 r9 + +# define OFFSET_REG8 r9b +# define OFFSET_REG r9d +# define OFFSET_REG64 r9 +# else +# define LOOP_REG edx +# define LOOP_REG64 rdx + +# define OFFSET_REG8 dl +# define OFFSET_REG edx +# define OFFSET_REG64 rdx +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# if defined USE_AS_STRNCMP +# define VEC_OFFSET 0 +# else +# define VEC_OFFSET (-VEC_SIZE) +# endif + +# ifdef USE_AS_STRCASECMP_L +# define BYTE_LOOP_REG OFFSET_REG +# else +# define BYTE_LOOP_REG ecx +# endif + +# ifdef USE_AS_STRCASECMP_L +# ifdef USE_AS_STRNCMP +# define STRCASECMP __strncasecmp_avx2 +# define LOCALE_REG rcx +# define LOCALE_REG_LP RCX_LP +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii +# else +# define STRCASECMP __strcasecmp_avx2 +# define LOCALE_REG rdx +# define LOCALE_REG_LP RDX_LP +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii +# endif +# endif + +# define xmmZERO xmm15 +# define ymmZERO ymm15 + +# define LCASE_MIN_ymm %ymm10 +# define LCASE_MAX_ymm %ymm11 +# define CASE_ADD_ymm %ymm12 + +# define LCASE_MIN_xmm %xmm10 +# define LCASE_MAX_xmm %xmm11 +# define CASE_ADD_xmm %xmm12 + + /* r11 is never use elsewhere so this is safe to maintain. 
*/ +# define TOLOWER_BASE %r11 + # ifndef SECTION # define SECTION(p) p##.avx # endif +# ifdef USE_AS_STRCASECMP_L +# define REG(x, y) x ## y +# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ + vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ + vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ + vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ + vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ + vpaddb REG(%ext, 8), reg1_in, reg1_out; \ + vpaddb REG(%ext, 9), reg2_in, reg2_out + +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst +# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) +# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) + +# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ + TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ + VPCMPEQ scratch_reg, s2_reg, reg_out + +# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ + VMOVU s2_mem, reg_out; \ + CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) + +# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) +# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) + +# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) +# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) + +# else +# define TOLOWER_gpr(...) +# define TOLOWER_ymm(...) +# define TOLOWER_xmm(...) + +# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ + VPCMPEQ s2_reg, s1_reg, reg_out + +# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) + +# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) +# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -79,783 +180,1142 @@ the maximum offset is reached before a difference is found, zero is returned. 
*/ - .section SECTION(.text),"ax",@progbits -ENTRY (STRCMP) + .section SECTION(.text), "ax", @progbits + .align 16 + .type STRCMP, @function + .globl STRCMP + .hidden STRCMP + +# ifndef GLABEL +# define GLABEL(...) __VA_ARGS__ +# endif + +# ifdef USE_AS_STRCASECMP_L +ENTRY (GLABEL(STRCASECMP)) + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax + mov %fs:(%rax), %LOCALE_REG_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END (GLABEL(STRCASECMP)) + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ +# endif + + .p2align 4 +STRCMP: + cfi_startproc + _CET_ENDBR + CALL_MCOUNT + +# if defined USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales with + encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP +# else + mov (%LOCALE_REG), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) + jne STRCASECMP_NONASCII + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE +# endif + # ifdef USE_AS_STRNCMP - /* Check for simple cases (0 or 1) in offset. */ + /* Don't overwrite LOCALE_REG (rcx) until we have pass + L(one_or_less). Otherwise we might use the wrong locale in + the OVERFLOW_STRCMP (strcasecmp_l). */ +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif cmp $1, %RDX_LP - je L(char0) - jb L(zero) + /* Signed comparison intentional. We use this branch to also + test cases where length >= 2^63. These very large sizes can be + handled with strcmp as there is no way for that length to + actually bound the buffer. */ + jle L(one_or_less) # ifdef USE_AS_WCSCMP -# ifndef __ILP32__ movq %rdx, %rcx - /* Check if length could overflow when multiplied by - sizeof(wchar_t). Checking top 8 bits will cover all potential - overflow cases as well as redirect cases where its impossible to - length to bound a valid memory region. 
In these cases just use - 'wcscmp'. */ + + /* Multiplying length by sizeof(wchar_t) can result in overflow. + Check if that is possible. All cases where overflow are possible + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. */ shrq $56, %rcx - jnz __wcscmp_avx2 -# endif - /* Convert units: from wide to byte char. */ - shl $2, %RDX_LP + jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx # endif - /* Register %r11 tracks the maximum offset. */ - mov %RDX_LP, %R11_LP +# endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO +# if defined USE_AS_STRCASECMP_L + .section .rodata.cst32, "aM", @progbits, 32 + .align 32 +L(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +L(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 + .quad 0x9999999999999999 + .quad 0x9999999999999999 +L(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + + vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm + vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm + vmovdqa L(case_add)(%rip), CASE_ADD_ymm # endif movl %edi, %eax - xorl %edx, %edx - /* Make %xmm7 (%ymm7) all zeros in this function. */ - vpxor %xmm7, %xmm7, %xmm7 orl %esi, %eax - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax - jg L(cross_page) - /* Start comparing 4 vectors. */ - vmovdqu (%rdi), %ymm1 - VPCMPEQ (%rsi), %ymm1, %ymm0 - VPMINU %ymm1, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm0, %ymm0 - vpmovmskb %ymm0, %ecx - testl %ecx, %ecx - je L(next_3_vectors) - tzcntl %ecx, %edx + sall $20, %eax + /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ + cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax + ja L(page_cross) + +L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 + /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. + Otherwise converts ymm0 and load from rsi to lower. 
ymm2 is + scratch and ymm1 is the return. */ + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ + vpandn %ymm1, %ymm2, %ymm1 + + /* All 1s -> keep going, any 0s -> return. */ + vpmovmskb %ymm1, %ecx # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx) is after the maximum - offset (%r11). */ - cmpq %r11, %rdx - jae L(zero) + cmpq $VEC_SIZE, %rdx + jbe L(vec_0_test_len) # endif + + /* All 1s represents all equals. incl will overflow to zero in + all equals case. Otherwise 1s will carry until position of first + mismatch. */ + incl %ecx + jz L(more_3x_vec) + + .p2align 4,, 4 +L(return_vec_0): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP + movl (%rdi, %rcx), %edx xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - je L(return) -L(wcscmp_return): + cmpl (%rsi, %rcx), %edx + je L(ret0) setl %al negl %eax orl $1, %eax -L(return): # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif +L(ret0): L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(return_vec_size): - tzcntl %ecx, %edx # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after - the maximum offset (%r11). */ - addq $VEC_SIZE, %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP + .p2align 4,, 8 +L(vec_0_test_len): + notl %ecx + bzhil %edx, %ecx, %eax + jnz L(return_vec_0) + /* Align if will cross fetch block. */ + .p2align 4,, 2 +L(ret_zero): xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + VZEROUPPER_RETURN + + .p2align 4,, 5 +L(one_or_less): +# ifdef USE_AS_STRCASECMP_L + /* Set locale argument for strcasecmp. 
*/ + movq %LOCALE_REG, %rdx # endif -# else + jb L(ret_zero) + /* 'nbe' covers the case where length is negative (large + unsigned). */ + jnbe OVERFLOW_STRCMP # ifdef USE_AS_WCSCMP + movl (%rdi), %edx xorl %eax, %eax - movl VEC_SIZE(%rdi, %rdx), %ecx - cmpl VEC_SIZE(%rsi, %rdx), %ecx - jne L(wcscmp_return) + cmpl (%rsi), %edx + je L(ret1) + setl %al + negl %eax + orl $1, %eax # else - movzbl VEC_SIZE(%rdi, %rdx), %eax - movzbl VEC_SIZE(%rsi, %rdx), %edx - subl %edx, %eax + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif +L(ret1): + ret # endif - VZEROUPPER_RETURN - .p2align 4 -L(return_2_vec_size): - tzcntl %ecx, %edx + .p2align 4,, 10 +L(return_vec_1): + tzcntl %ecx, %ecx # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is - after the maximum offset (%r11). */ - addq $(VEC_SIZE * 2), %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP + /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of + overflow. 
*/ + addq $-VEC_SIZE, %rdx + cmpq %rcx, %rdx + jbe L(ret_zero) +# endif +# ifdef USE_AS_WCSCMP + movl VEC_SIZE(%rdi, %rcx), %edx xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax -# endif + cmpl VEC_SIZE(%rsi, %rcx), %edx + je L(ret2) + setl %al + negl %eax + orl $1, %eax # else -# ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx - cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax - movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx - subl %edx, %eax -# endif + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif +L(ret2): VZEROUPPER_RETURN - .p2align 4 -L(return_3_vec_size): - tzcntl %ecx, %edx + .p2align 4,, 10 # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is - after the maximum offset (%r11). 
*/ - addq $(VEC_SIZE * 3), %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP +L(return_vec_3): + salq $32, %rcx +# endif + +L(return_vec_2): +# ifndef USE_AS_STRNCMP + tzcntl %ecx, %ecx +# else + tzcntq %rcx, %rcx + cmpq %rcx, %rdx + jbe L(ret_zero) +# endif + +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2)(%rdi, %rcx), %edx xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax -# endif + cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx + je L(ret3) + setl %al + negl %eax + orl $1, %eax # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax +# endif +L(ret3): + VZEROUPPER_RETURN + +# ifndef USE_AS_STRNCMP + .p2align 4,, 10 +L(return_vec_3): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 3)(%rdi, %rcx), %edx xorl %eax, %eax - movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx - cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx - jne L(wcscmp_return) + cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx + je L(ret4) + setl %al + negl %eax + orl $1, %eax # else - movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax - movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx - subl %edx, %eax + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif -# endif +L(ret4): VZEROUPPER_RETURN +# endif + + .p2align 4,, 10 +L(more_3x_vec): + /* Safe to compare 4x vectors. 
*/ + VMOVU VEC_SIZE(%rdi), %ymm0 + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_1) - .p2align 4 -L(next_3_vectors): - vmovdqu VEC_SIZE(%rdi), %ymm6 - VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 - VPMINU %ymm6, %ymm3, %ymm3 - VPCMPEQ %ymm7, %ymm3, %ymm3 - vpmovmskb %ymm3, %ecx - testl %ecx, %ecx - jne L(return_vec_size) - vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 - vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 - vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 - VPMINU %ymm5, %ymm2, %ymm2 - VPCMPEQ %ymm4, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm2, %ymm2 - vpmovmskb %ymm2, %ecx - testl %ecx, %ecx - jne L(return_2_vec_size) - VPMINU %ymm4, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm0, %ymm0 - vpmovmskb %ymm0, %ecx - testl %ecx, %ecx - jne L(return_3_vec_size) -L(main_loop_header): - leaq (VEC_SIZE * 4)(%rdi), %rdx - movl $PAGE_SIZE, %ecx - /* Align load via RAX. */ - andq $-(VEC_SIZE * 4), %rdx - subq %rdi, %rdx - leaq (%rdi, %rdx), %rax # ifdef USE_AS_STRNCMP - /* Starting from this point, the maximum offset, or simply the - 'offset', DECREASES by the same amount when base pointers are - moved forward. Return 0 when: - 1) On match: offset <= the matched vector index. - 2) On mistmach, offset is before the mistmatched index. 
+ subq $(VEC_SIZE * 2), %rdx + jbe L(ret_zero) +# endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_3) + +# ifdef USE_AS_STRNCMP + cmpq $(VEC_SIZE * 2), %rdx + jbe L(ret_zero) +# endif + +# ifdef USE_AS_WCSCMP + /* any non-zero positive value that doesn't inference with 0x1. */ - subq %rdx, %r11 - jbe L(zero) -# endif - addq %rsi, %rdx - movq %rdx, %rsi - andl $(PAGE_SIZE - 1), %esi - /* Number of bytes before page crossing. */ - subq %rsi, %rcx - /* Number of VEC_SIZE * 4 blocks before page crossing. */ - shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx - /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ - movl %ecx, %esi - jmp L(loop_start) + movl $2, %r8d + +# else + xorl %r8d, %r8d +# endif + + /* The prepare labels are various entry points from the page + cross logic. */ +L(prepare_loop): + +# ifdef USE_AS_STRNCMP + /* Store N + (VEC_SIZE * 4) and place check at the begining of + the loop. */ + leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx +# endif +L(prepare_loop_no_len): + + /* Align s1 and adjust s2 accordingly. */ + subq %rdi, %rsi + andq $-(VEC_SIZE * 4), %rdi + addq %rdi, %rsi + +# ifdef USE_AS_STRNCMP + subq %rdi, %rdx +# endif +L(prepare_loop_aligned): + /* eax stores distance from rsi to next page cross. These cases + need to be handled specially as the 4x loop could potentially + read memory past the length of s1 or s2 and across a page + boundary. */ + movl $-(VEC_SIZE * 4), %eax + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + + /* Loop 4x comparisons at a time. */ .p2align 4 L(loop): + + /* End condition for strncmp. 
*/ # ifdef USE_AS_STRNCMP - /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease - the maximum offset (%r11) by the same amount. */ - subq $(VEC_SIZE * 4), %r11 - jbe L(zero) -# endif - addq $(VEC_SIZE * 4), %rax - addq $(VEC_SIZE * 4), %rdx -L(loop_start): - testl %esi, %esi - leal -1(%esi), %esi - je L(loop_cross_page) -L(back_to_loop): - /* Main loop, comparing 4 vectors are a time. */ - vmovdqa (%rax), %ymm0 - vmovdqa VEC_SIZE(%rax), %ymm3 - VPCMPEQ (%rdx), %ymm0, %ymm4 - VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 - VPMINU %ymm0, %ymm4, %ymm4 - VPMINU %ymm3, %ymm1, %ymm1 - vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 - VPMINU %ymm1, %ymm4, %ymm0 - vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 - VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 - VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 - VPMINU %ymm2, %ymm5, %ymm5 - VPMINU %ymm3, %ymm6, %ymm6 - VPMINU %ymm5, %ymm0, %ymm0 - VPMINU %ymm6, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm0, %ymm0 - - /* Test each mask (32 bits) individually because for VEC_SIZE - == 32 is not possible to OR the four masks and keep all bits - in a 64-bit integer register, differing from SSE2 strcmp - where ORing is possible. */ - vpmovmskb %ymm0, %ecx + subq $(VEC_SIZE * 4), %rdx + jbe L(ret_zero) +# endif + + subq $-(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rsi + + /* Check if rsi loads will cross a page boundary. */ + addl $-(VEC_SIZE * 4), %eax + jnb L(page_cross_during_loop) + + /* Loop entry after handling page cross during loop. */ +L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 + VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. 
*/ + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) + CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. */ + vpand %ymm0, %ymm1, %ymm1 + + + vpand %ymm2, %ymm3, %ymm3 + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + + VPMINU %ymm1, %ymm3, %ymm3 + VPMINU %ymm5, %ymm7, %ymm7 + + /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ + VPMINU %ymm3, %ymm7, %ymm7 + + /* If any 0 CHAR then done. */ + VPCMPEQ %ymm7, %ymmZERO, %ymm7 + vpmovmskb %ymm7, %LOOP_REG + testl %LOOP_REG, %LOOP_REG + jz L(loop) + + /* Find which VEC has the mismatch of end of string. */ + VPCMPEQ %ymm1, %ymmZERO, %ymm1 + vpmovmskb %ymm1, %ecx testl %ecx, %ecx - je L(loop) - VPCMPEQ %ymm7, %ymm4, %ymm0 - vpmovmskb %ymm0, %edi - testl %edi, %edi - je L(test_vec) - tzcntl %edi, %ecx + jnz L(return_vec_0_end) + + + VPCMPEQ %ymm3, %ymmZERO, %ymm3 + vpmovmskb %ymm3, %ecx + testl %ecx, %ecx + jnz L(return_vec_1_end) + +L(return_vec_2_3_end): # ifdef USE_AS_STRNCMP - cmpq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + subq $(VEC_SIZE * 2), %rdx + jbe L(ret_zero_end) +# endif + + VPCMPEQ %ymm5, %ymmZERO, %ymm5 + vpmovmskb %ymm5, %ecx + testl %ecx, %ecx + jnz L(return_vec_2_end) + + /* LOOP_REG contains matches for null/mismatch from the loop. If + VEC 0,1,and 2 all have no null and no mismatches then mismatch + must entirely be from VEC 3 which is fully represented by + LOOP_REG. 
*/ + tzcntl %LOOP_REG, %LOOP_REG + +# ifdef USE_AS_STRNCMP + subl $-(VEC_SIZE), %LOOP_REG + cmpq %LOOP_REG64, %rdx + jbe L(ret_zero_end) +# endif + +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx + je L(ret5) + setl %al + negl %eax + xorl %r8d, %eax # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret5): VZEROUPPER_RETURN - .p2align 4 -L(test_vec): # ifdef USE_AS_STRNCMP - /* The first vector matched. Return 0 if the maximum offset - (%r11) <= VEC_SIZE. */ - cmpq $VEC_SIZE, %r11 - jbe L(zero) + .p2align 4,, 2 +L(ret_zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN # endif - VPCMPEQ %ymm7, %ymm1, %ymm1 - vpmovmskb %ymm1, %ecx - testl %ecx, %ecx - je L(test_2_vec) - tzcntl %ecx, %edi + + + /* The L(return_vec_N_end) differ from L(return_vec_N) in that + they use the value of `r8` to negate the return value. This is + because the page cross logic can swap `rdi` and `rsi`. 
*/ + .p2align 4,, 10 # ifdef USE_AS_STRNCMP - addq $VEC_SIZE, %rdi - cmpq %rdi, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi +L(return_vec_1_end): + salq $32, %rcx +# endif +L(return_vec_0_end): +# ifndef USE_AS_STRNCMP + tzcntl %ecx, %ecx +# else + tzcntq %rcx, %rcx + cmpq %rcx, %rdx + jbe L(ret_zero_end) +# endif + +# ifdef USE_AS_WCSCMP + movl (%rdi, %rcx), %edx xorl %eax, %eax - movl (%rsi, %rdi), %ecx - cmpl (%rdx, %rdi), %ecx - jne L(wcscmp_return) -# else - movzbl (%rax, %rdi), %eax - movzbl (%rdx, %rdi), %edx - subl %edx, %eax -# endif + cmpl (%rsi, %rcx), %edx + je L(ret6) + setl %al + negl %eax + xorl %r8d, %eax # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +# endif +L(ret6): + VZEROUPPER_RETURN + +# ifndef USE_AS_STRNCMP + .p2align 4,, 10 +L(return_vec_1_end): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP - movq %rax, %rsi + movl VEC_SIZE(%rdi, %rcx), %edx xorl %eax, %eax - movl VEC_SIZE(%rsi, %rdi), %ecx - cmpl VEC_SIZE(%rdx, %rdi), %ecx - jne L(wcscmp_return) + cmpl VEC_SIZE(%rsi, %rcx), %edx + je L(ret7) + setl %al + negl %eax + xorl %r8d, %eax # else - movzbl VEC_SIZE(%rax, %rdi), %eax - movzbl VEC_SIZE(%rdx, %rdi), %edx - subl %edx, %eax + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif -# endif +L(ret7): VZEROUPPER_RETURN +# endif - .p2align 4 -L(test_2_vec): + .p2align 4,, 10 +L(return_vec_2_end): + tzcntl %ecx, %ecx # ifdef USE_AS_STRNCMP - /* The first 2 vectors matched. Return 0 if the maximum offset - (%r11) <= 2 * VEC_SIZE. 
*/ - cmpq $(VEC_SIZE * 2), %r11 - jbe L(zero) + cmpq %rcx, %rdx + jbe L(ret_zero_page_cross) # endif - VPCMPEQ %ymm7, %ymm5, %ymm5 - vpmovmskb %ymm5, %ecx - testl %ecx, %ecx - je L(test_3_vec) - tzcntl %ecx, %edi -# ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 2), %rdi - cmpq %rdi, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2)(%rdi, %rcx), %edx xorl %eax, %eax - movl (%rsi, %rdi), %ecx - cmpl (%rdx, %rdi), %ecx - jne L(wcscmp_return) -# else - movzbl (%rax, %rdi), %eax - movzbl (%rdx, %rdi), %edx - subl %edx, %eax -# endif + cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx + je L(ret11) + setl %al + negl %eax + xorl %r8d, %eax # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx - cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx - jne L(wcscmp_return) -# else - movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax - movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx - subl %edx, %eax -# endif + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret11): VZEROUPPER_RETURN - .p2align 4 -L(test_3_vec): + + /* Page cross in rsi in next 4x VEC. */ + + /* TODO: Improve logic here. */ + .p2align 4,, 10 +L(page_cross_during_loop): + /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + + /* Optimistically rsi and rdi and both aligned inwhich case we + don't need any logic here. */ + cmpl $-(VEC_SIZE * 4), %eax + /* Don't adjust eax before jumping back to loop and we will + never hit page cross case again. */ + je L(loop_skip_page_cross_check) + + /* Check if we can safely load a VEC. 
*/ + cmpl $-(VEC_SIZE * 3), %eax + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_0_end) + + /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ + cmpl $-(VEC_SIZE * 2), %eax + jg L(more_2x_vec_till_page_cross) + + .p2align 4,, 4 +L(less_1x_vec_till_page_cross): + subl $-(VEC_SIZE * 4), %eax + /* Guranteed safe to read from rdi - VEC_SIZE here. The only + concerning case is first iteration if incoming s1 was near start + of a page and s2 near end. If s1 was near the start of the page + we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe + to read back -VEC_SIZE. If rdi is truly at the start of a page + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 + CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + movl $-1, %r10d + shlxl %esi, %r10d, %r10d + notl %ecx + # ifdef USE_AS_STRNCMP - /* The first 3 vectors matched. Return 0 if the maximum offset - (%r11) <= 3 * VEC_SIZE. 
*/ - cmpq $(VEC_SIZE * 3), %r11 - jbe L(zero) -# endif - VPCMPEQ %ymm7, %ymm6, %ymm6 - vpmovmskb %ymm6, %esi - tzcntl %esi, %ecx + cmpq %rax, %rdx + jbe L(return_page_cross_end_check) +# endif + movl %eax, %OFFSET_REG + addl $(PAGE_SIZE - VEC_SIZE * 4), %eax + + andl %r10d, %ecx + jz L(loop_skip_page_cross_check) + + .p2align 4,, 3 +L(return_page_cross_end): + tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 3), %rcx - cmpq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %esi - cmpl (%rdx, %rcx), %esi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx +L(return_page_cross_cmp_mem): # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + addl %OFFSET_REG, %ecx +# endif +# ifdef USE_AS_WCSCMP + movl VEC_OFFSET(%rdi, %rcx), %edx xorl %eax, %eax - movl (VEC_SIZE * 3)(%rsi, %rcx), %esi - cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi - jne L(wcscmp_return) -# else - movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax - movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx - subl %edx, %eax -# endif + cmpl VEC_OFFSET(%rsi, %rcx), %edx + je L(ret8) + setl %al + negl %eax + xorl %r8d, %eax +# else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret8): VZEROUPPER_RETURN - .p2align 4 -L(loop_cross_page): - xorl %r10d, %r10d - movq %rdx, %rcx - /* Align load via RDX. We load the extra ECX bytes which should - be ignored. */ - andl $((VEC_SIZE * 4) - 1), %ecx - /* R10 is -RCX. */ - subq %rcx, %r10 - - /* This works only if VEC_SIZE * 2 == 64. */ -# if (VEC_SIZE * 2) != 64 -# error (VEC_SIZE * 2) != 64 -# endif - - /* Check if the first VEC_SIZE * 2 bytes should be ignored. 
*/ - cmpl $(VEC_SIZE * 2), %ecx - jge L(loop_cross_page_2_vec) - - vmovdqu (%rax, %r10), %ymm2 - vmovdqu VEC_SIZE(%rax, %r10), %ymm3 - VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 - VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 - VPMINU %ymm2, %ymm0, %ymm0 - VPMINU %ymm3, %ymm1, %ymm1 - VPCMPEQ %ymm7, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm1, %ymm1 - - vpmovmskb %ymm0, %edi - vpmovmskb %ymm1, %esi - - salq $32, %rsi - xorq %rsi, %rdi - - /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ - shrq %cl, %rdi - - testq %rdi, %rdi - je L(loop_cross_page_2_vec) - tzcntq %rdi, %rcx # ifdef USE_AS_STRNCMP - cmpq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + .p2align 4,, 10 +L(return_page_cross_end_check): + andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx + ja L(return_page_cross_cmp_mem) xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif -# else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif -# endif VZEROUPPER_RETURN +# endif - .p2align 4 -L(loop_cross_page_2_vec): - /* The first VEC_SIZE * 2 bytes match or are ignored. */ - vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 - vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 - VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 - VPMINU %ymm2, %ymm5, %ymm5 - VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 - VPCMPEQ %ymm7, %ymm5, %ymm5 - VPMINU %ymm3, %ymm6, %ymm6 - VPCMPEQ %ymm7, %ymm6, %ymm6 - - vpmovmskb %ymm5, %edi - vpmovmskb %ymm6, %esi - - salq $32, %rsi - xorq %rsi, %rdi - xorl %r8d, %r8d - /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ - subl $(VEC_SIZE * 2), %ecx - jle 1f - /* Skip ECX bytes. */ - shrq %cl, %rdi - /* R8 has number of bytes skipped. 
*/ - movl %ecx, %r8d -1: - /* Before jumping back to the loop, set ESI to the number of - VEC_SIZE * 4 blocks before page crossing. */ - movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi - - testq %rdi, %rdi + .p2align 4,, 10 +L(more_2x_vec_till_page_cross): + /* If more 2x vec till cross we will complete a full loop + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_1_end) + # ifdef USE_AS_STRNCMP - /* At this point, if %rdi value is 0, it already tested - VEC_SIZE*4+%r10 byte starting from %rax. This label - checks whether strncmp maximum offset reached or not. */ - je L(string_nbyte_offset_check) -# else - je L(back_to_loop) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(ret_zero_in_loop_page_cross) # endif - tzcntq %rdi, %rcx - addq %r10, %rcx - /* Adjust for number of bytes skipped. */ - addq %r8, %rcx + + subl $-(VEC_SIZE * 4), %eax + + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + jnz L(return_vec_page_cross_1) + # ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 2), %rcx - subq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + /* Must check length here as length might proclude reading next + page. */ + cmpq %rax, %rdx + jbe L(ret_zero_in_loop_page_cross) +# endif + + /* Finish the loop. 
*/ + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 + VPCMPEQ %ymm7, %ymmZERO, %ymm7 + vpmovmskb %ymm7, %LOOP_REG + testl %LOOP_REG, %LOOP_REG + jnz L(return_vec_2_3_end) + + /* Best for code size to include ucond-jmp here. Would be faster + if this case is hot to duplicate the L(return_vec_2_3_end) code + as fall-through and have jump back to loop on mismatch + comparison. */ + subq $-(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rsi + addl $(PAGE_SIZE - VEC_SIZE * 8), %eax +# ifdef USE_AS_STRNCMP + subq $(VEC_SIZE * 4), %rdx + ja L(loop_skip_page_cross_check) +L(ret_zero_in_loop_page_cross): xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + VZEROUPPER_RETURN # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rsi, %rcx), %edi - cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax - movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx - subl %edx, %eax -# endif + jmp L(loop_skip_page_cross_check) # endif - VZEROUPPER_RETURN + + .p2align 4,, 10 +L(return_vec_page_cross_0): + addl $-VEC_SIZE, %eax +L(return_vec_page_cross_1): + tzcntl %ecx, %ecx # ifdef USE_AS_STRNCMP -L(string_nbyte_offset_check): - leaq (VEC_SIZE * 4)(%r10), %r10 - cmpq %r10, %r11 - jbe L(zero) - jmp L(back_to_loop) + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpq %rcx, %rdx + jbe L(ret_zero_in_loop_page_cross) +# else + addl %eax, %ecx # endif - .p2align 4 -L(cross_page_loop): - /* Check one byte/dword at a time. 
*/ # ifdef USE_AS_WCSCMP - cmpl %ecx, %eax + movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax + cmpl VEC_OFFSET(%rsi, %rcx), %edx + je L(ret9) + setl %al + negl %eax + xorl %r8d, %eax # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif - jne L(different) - addl $SIZE_OF_CHAR, %edx - cmpl $(VEC_SIZE * 4), %edx - je L(main_loop_header) -# ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) +L(ret9): + VZEROUPPER_RETURN + + + .p2align 4,, 10 +L(page_cross): +# ifndef USE_AS_STRNCMP + /* If both are VEC aligned we don't need any special logic here. + Only valid for strcmp where stop condition is guranteed to be + reachable by just reading memory. */ + testl $((VEC_SIZE - 1) << 20), %eax + jz L(no_page_cross) # endif + + movl %edi, %eax + movl %esi, %ecx + andl $(PAGE_SIZE - 1), %eax + andl $(PAGE_SIZE - 1), %ecx + + xorl %OFFSET_REG, %OFFSET_REG + + /* Check which is closer to page cross, s1 or s2. */ + cmpl %eax, %ecx + jg L(page_cross_s2) + + /* The previous page cross check has false positives. Check for + true positive as page cross logic is very expensive. */ + subl $(PAGE_SIZE - VEC_SIZE * 4), %eax + jbe L(no_page_cross) + + /* Set r8 to not interfere with normal return value (rdi and rsi + did not swap). */ # ifdef USE_AS_WCSCMP - movl (%rdi, %rdx), %eax - movl (%rsi, %rdx), %ecx + /* any non-zero positive value that doesn't inference with 0x1. + */ + movl $2, %r8d # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %ecx + xorl %r8d, %r8d # endif - /* Check null char. */ - testl %eax, %eax - jne L(cross_page_loop) - /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED - comparisons. */ - subl %ecx, %eax -# ifndef USE_AS_WCSCMP -L(different): + + /* Check if less than 1x VEC till page cross. 
*/ + subl $(VEC_SIZE * 3), %eax + jg L(less_1x_vec_till_page) + + /* If more than 1x VEC till page cross, loop throuh safely + loadable memory until within 1x VEC of page cross. */ + + .p2align 4,, 10 +L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + incl %ecx + + jnz L(check_ret_vec_page_cross) + addl $VEC_SIZE, %OFFSET_REG +# ifdef USE_AS_STRNCMP + cmpq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross) # endif - VZEROUPPER_RETURN + addl $VEC_SIZE, %eax + jl L(page_cross_loop) + + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already + loaded at least 1 VEC from rsi it is also guranteed to be + safe. */ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx + +# ifdef USE_AS_STRNCMP + leal VEC_SIZE(%OFFSET_REG64), %eax + cmpq %rax, %rdx + jbe L(check_ret_vec_page_cross2) + addq %rdi, %rdx +# endif + incl %ecx + jz L(prepare_loop_no_len) + .p2align 4,, 4 +L(ret_vec_page_cross): +# ifndef USE_AS_STRNCMP +L(check_ret_vec_page_cross): +# endif + tzcntl %ecx, %ecx + addl %OFFSET_REG, %ecx +L(ret_vec_page_cross_cont): # ifdef USE_AS_WCSCMP - .p2align 4 -L(different): - /* Use movl to avoid modifying EFLAGS. 
*/ - movl $0, %eax + movl (%rdi, %rcx), %edx + xorl %eax, %eax + cmpl (%rsi, %rcx), %edx + je L(ret12) setl %al negl %eax - orl $1, %eax - VZEROUPPER_RETURN + xorl %r8d, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret12): + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP - .p2align 4 -L(zero): + .p2align 4,, 10 +L(check_ret_vec_page_cross2): + incl %ecx +L(check_ret_vec_page_cross): + tzcntl %ecx, %ecx + addl %OFFSET_REG, %ecx + cmpq %rcx, %rdx + ja L(ret_vec_page_cross_cont) + .p2align 4,, 2 +L(ret_zero_page_cross): xorl %eax, %eax VZEROUPPER_RETURN +# endif - .p2align 4 -L(char0): -# ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (%rdi), %ecx - cmpl (%rsi), %ecx - jne L(wcscmp_return) -# else - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - subl %ecx, %eax -# endif - VZEROUPPER_RETURN + .p2align 4,, 4 +L(page_cross_s2): + /* Ensure this is a true page cross. */ + subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx + jbe L(no_page_cross) + + + movl %ecx, %eax + movq %rdi, %rcx + movq %rsi, %rdi + movq %rcx, %rsi + + /* set r8 to negate return value as rdi and rsi swapped. */ +# ifdef USE_AS_WCSCMP + movl $-4, %r8d +# else + movl $-1, %r8d # endif + xorl %OFFSET_REG, %OFFSET_REG - .p2align 4 -L(last_vector): - addq %rdx, %rdi - addq %rdx, %rsi + /* Check if more than 1x VEC till page cross. */ + subl $(VEC_SIZE * 3), %eax + jle L(page_cross_loop) + + .p2align 4,, 6 +L(less_1x_vec_till_page): + /* Find largest load size we can use. 
*/ + cmpl $16, %eax + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 + CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incw %cx + jnz L(check_ret_vec_page_cross) + movl $16, %OFFSET_REG # ifdef USE_AS_STRNCMP - subq %rdx, %r11 + cmpq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subl %eax, %OFFSET_REG +# else + /* Explicit check for 16 byte alignment. */ + subl %eax, %OFFSET_REG + jz L(prepare_loop) # endif - tzcntl %ecx, %edx + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 + CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incw %cx + jnz L(check_ret_vec_page_cross) + # ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) + addl $16, %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subq $-(VEC_SIZE * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi +# else + leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi # endif -# ifdef USE_AS_WCSCMP + jmp L(prepare_loop_aligned) + +# ifdef USE_AS_STRNCMP + .p2align 4,, 2 +L(ret_zero_page_cross_slow_case0): xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + ret # endif - VZEROUPPER_RETURN - /* Comparing on page boundary region requires special treatment: - It must done one vector at the time, starting with the wider - ymm vector if possible, if not, with xmm. If fetching 16 bytes - (xmm) still passes the boundary, byte comparison must be done. - */ - .p2align 4 -L(cross_page): - /* Try one ymm vector at a time. 
*/ - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jg L(cross_page_1_vector) -L(loop_1_vector): - vmovdqu (%rdi, %rdx), %ymm1 - VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 - VPMINU %ymm1, %ymm0, %ymm0 - VPCMPEQ %ymm7, %ymm0, %ymm0 - vpmovmskb %ymm0, %ecx - testl %ecx, %ecx - jne L(last_vector) - addl $VEC_SIZE, %edx + .p2align 4,, 10 +L(less_16_till_page): + /* Find largest load size we can use. */ + cmpl $24, %eax + ja L(less_8_till_page) - addl $VEC_SIZE, %eax -# ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) -# endif - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jle L(loop_1_vector) -L(cross_page_1_vector): - /* Less than 32 bytes to check, try one xmm vector. */ - cmpl $(PAGE_SIZE - 16), %eax - jg L(cross_page_1_xmm) - vmovdqu (%rdi, %rdx), %xmm1 - VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 - VPMINU %xmm1, %xmm0, %xmm0 - VPCMPEQ %xmm7, %xmm0, %xmm0 - vpmovmskb %xmm0, %ecx - testl %ecx, %ecx - jne L(last_vector) + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl + jnz L(check_ret_vec_page_cross) - addl $16, %edx -# ifndef USE_AS_WCSCMP - addl $16, %eax + +# ifdef USE_AS_STRNCMP + cmpq $8, %rdx + jbe L(ret_zero_page_cross_slow_case0) # endif + movl $24, %OFFSET_REG + /* Explicit check for 16 byte alignment. */ + subl %eax, %OFFSET_REG + + + + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl + jnz L(check_ret_vec_page_cross) + # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) -# endif - -L(cross_page_1_xmm): -# ifndef USE_AS_WCSCMP - /* Less than 16 bytes to check, try 8 byte vector. 
NB: No need - for wcscmp nor wcsncmp since wide char is 4 bytes. */ - cmpl $(PAGE_SIZE - 8), %eax - jg L(cross_page_8bytes) - vmovq (%rdi, %rdx), %xmm1 - vmovq (%rsi, %rdx), %xmm0 - VPCMPEQ %xmm0, %xmm1, %xmm0 - VPMINU %xmm1, %xmm0, %xmm0 - VPCMPEQ %xmm7, %xmm0, %xmm0 - vpmovmskb %xmm0, %ecx - /* Only last 8 bits are valid. */ - andl $0xff, %ecx - testl %ecx, %ecx - jne L(last_vector) + addl $8, %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subq $-(VEC_SIZE * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi +# else + leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi +# endif + jmp L(prepare_loop_aligned) + - addl $8, %edx - addl $8, %eax + .p2align 4,, 10 +L(less_8_till_page): +# ifdef USE_AS_WCSCMP + /* If using wchar then this is the only check before we reach + the page boundary. */ + movl (%rdi), %eax + movl (%rsi), %ecx + cmpl %ecx, %eax + jnz L(ret_less_8_wcs) # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) + addq %rdi, %rdx + /* We already checked for len <= 1 so cannot hit that case here. + */ # endif + testl %eax, %eax + jnz L(prepare_loop_no_len) + ret -L(cross_page_8bytes): - /* Less than 8 bytes to check, try 4 byte vector. */ - cmpl $(PAGE_SIZE - 4), %eax - jg L(cross_page_4bytes) - vmovd (%rdi, %rdx), %xmm1 - vmovd (%rsi, %rdx), %xmm0 - VPCMPEQ %xmm0, %xmm1, %xmm0 - VPMINU %xmm1, %xmm0, %xmm0 - VPCMPEQ %xmm7, %xmm0, %xmm0 - vpmovmskb %xmm0, %ecx - /* Only last 4 bits are valid. */ - andl $0xf, %ecx - testl %ecx, %ecx - jne L(last_vector) + .p2align 4,, 8 +L(ret_less_8_wcs): + setl %OFFSET_REG8 + negl %OFFSET_REG + movl %OFFSET_REG, %eax + xorl %r8d, %eax + ret + +# else + + /* Find largest load size we can use. 
*/ + cmpl $28, %eax + ja L(less_4_till_page) + + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) - addl $4, %edx # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) + cmpq $4, %rdx + jbe L(ret_zero_page_cross_slow_case1) # endif + movl $28, %OFFSET_REG + /* Explicit check for 16 byte alignment. */ + subl %eax, %OFFSET_REG -L(cross_page_4bytes): -# endif - /* Less than 4 bytes to check, try one byte/dword at a time. */ -# ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) -# endif -# ifdef USE_AS_WCSCMP - movl (%rdi, %rdx), %eax - movl (%rsi, %rdx), %ecx -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %ecx + + + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) + +# ifdef USE_AS_STRNCMP + addl $4, %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case1) + subq $-(VEC_SIZE * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi +# else + leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi + leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi +# endif + jmp L(prepare_loop_aligned) + +# ifdef USE_AS_STRNCMP + .p2align 4,, 2 +L(ret_zero_page_cross_slow_case1): + xorl %eax, %eax + ret +# endif + + .p2align 4,, 10 +L(less_4_till_page): + subq %rdi, %rsi + /* Extremely slow byte comparison loop. 
*/ +L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) + subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +# ifdef USE_AS_STRNCMP + decq %rdx + jz L(ret_zero_4_loop) +# endif + incq %rdi + /* end condition is reach page boundary (rdi is aligned). */ + testl $31, %edi + jnz L(less_4_loop) + leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi + addq $-(VEC_SIZE * 4), %rdi +# ifdef USE_AS_STRNCMP + subq $-(VEC_SIZE * 4), %rdx +# endif + jmp L(prepare_loop_aligned) + +L(ret_zero_4_loop): + xorl %eax, %eax + ret +L(ret_less_4_loop): + xorl %r8d, %eax + subl %r8d, %eax + ret # endif - testl %eax, %eax - jne L(cross_page_loop) - subl %ecx, %eax - VZEROUPPER_RETURN -END (STRCMP) + cfi_endproc + .size STRCMP, .-STRCMP #endif diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S index 0cd939d5a..2a5b3ce03 100644 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S @@ -19,6 +19,9 @@ #if IS_IN (libc) # include +# if defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# endif # ifndef STRCMP # define STRCMP __strcmp_evex @@ -26,54 +29,165 @@ # define PAGE_SIZE 4096 -/* VEC_SIZE = Number of bytes in a ymm register */ + /* VEC_SIZE = Number of bytes in a ymm register. */ # define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) -/* Shift for dividing by (VEC_SIZE * 4). */ -# define DIVIDE_BY_VEC_4_SHIFT 7 -# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -# endif - -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSCMP -/* Compare packed dwords. */ -# define VPCMP vpcmpd +# ifndef OVERFLOW_STRCMP +# define OVERFLOW_STRCMP __wcscmp_evex +# endif + +# define TESTEQ subl $0xff, + /* Compare packed dwords. 
*/ +# define VPCMP vpcmpd # define VPMINU vpminud # define VPTESTM vptestmd -# define SHIFT_REG32 r8d -# define SHIFT_REG64 r8 -/* 1 dword char == 4 bytes. */ +# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ # define SIZE_OF_CHAR 4 # else -/* Compare packed bytes. */ -# define VPCMP vpcmpb +# ifndef OVERFLOW_STRCMP +# define OVERFLOW_STRCMP __strcmp_evex +# endif + +# define TESTEQ incl + /* Compare packed bytes. */ +# define VPCMP vpcmpb # define VPMINU vpminub # define VPTESTM vptestmb -# define SHIFT_REG32 ecx -# define SHIFT_REG64 rcx -/* 1 byte char == 1 byte. */ +# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ # define SIZE_OF_CHAR 1 # endif -# define XMMZERO xmm16 -# define XMM0 xmm17 -# define XMM1 xmm18 +# ifdef USE_AS_STRNCMP +# define LOOP_REG r9d +# define LOOP_REG64 r9 + +# define OFFSET_REG8 r9b +# define OFFSET_REG r9d +# define OFFSET_REG64 r9 +# else +# define LOOP_REG edx +# define LOOP_REG64 rdx + +# define OFFSET_REG8 dl +# define OFFSET_REG edx +# define OFFSET_REG64 rdx +# endif + +# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP +# define VEC_OFFSET 0 +# else +# define VEC_OFFSET (-VEC_SIZE) +# endif + +# define XMM0 xmm17 +# define XMM1 xmm18 + +# define XMM10 xmm27 +# define XMM11 xmm28 +# define XMM12 xmm29 +# define XMM13 xmm30 +# define XMM14 xmm31 + + +# define YMM0 ymm17 +# define YMM1 ymm18 +# define YMM2 ymm19 +# define YMM3 ymm20 +# define YMM4 ymm21 +# define YMM5 ymm22 +# define YMM6 ymm23 +# define YMM7 ymm24 +# define YMM8 ymm25 +# define YMM9 ymm26 +# define YMM10 ymm27 +# define YMM11 ymm28 +# define YMM12 ymm29 +# define YMM13 ymm30 +# define YMM14 ymm31 + +# ifdef USE_AS_STRCASECMP_L +# define BYTE_LOOP_REG OFFSET_REG +# else +# define BYTE_LOOP_REG ecx +# endif + +# ifdef USE_AS_STRCASECMP_L +# ifdef USE_AS_STRNCMP +# define STRCASECMP __strncasecmp_evex +# define LOCALE_REG rcx +# define LOCALE_REG_LP RCX_LP +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii +# else +# define STRCASECMP 
__strcasecmp_evex +# define LOCALE_REG rdx +# define LOCALE_REG_LP RDX_LP +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii +# endif +# endif + +# define LCASE_MIN_YMM %YMM12 +# define LCASE_MAX_YMM %YMM13 +# define CASE_ADD_YMM %YMM14 + +# define LCASE_MIN_XMM %XMM12 +# define LCASE_MAX_XMM %XMM13 +# define CASE_ADD_XMM %XMM14 + + /* NB: wcsncmp uses r11 but strcasecmp is never used in + conjunction with wcscmp. */ +# define TOLOWER_BASE %r11 + +# ifdef USE_AS_STRCASECMP_L +# define _REG(x, y) x ## y +# define REG(x, y) _REG(x, y) +# define TOLOWER(reg1, reg2, ext) \ + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} + +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) + +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ + TOLOWER (s1_reg, s2_reg, ext); \ + VPCMP $0, s1_reg, s2_reg, reg_out + +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ + VMOVU s2_mem, s2_reg; \ + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) + +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM) +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) + +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) + +# else +# define TOLOWER_gpr(...) +# define TOLOWER_YMM(...) +# define TOLOWER_XMM(...) 
-# define YMMZERO ymm16 -# define YMM0 ymm17 -# define YMM1 ymm18 -# define YMM2 ymm19 -# define YMM3 ymm20 -# define YMM4 ymm21 -# define YMM5 ymm22 -# define YMM6 ymm23 -# define YMM7 ymm24 -# define YMM8 ymm25 -# define YMM9 ymm26 -# define YMM10 ymm27 +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ + VPCMP $0, s2_reg, s1_reg, reg_out + +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) + +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ + VPCMP $0, s2_mem, s1_reg, reg_out + +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) +# endif /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. @@ -96,985 +210,1208 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.evex,"ax",@progbits -ENTRY (STRCMP) -# ifdef USE_AS_STRNCMP - /* Check for simple cases (0 or 1) in offset. */ - cmp $1, %RDX_LP - je L(char0) - jb L(zero) -# ifdef USE_AS_WCSCMP -# ifndef __ILP32__ - movq %rdx, %rcx - /* Check if length could overflow when multiplied by - sizeof(wchar_t). Checking top 8 bits will cover all potential - overflow cases as well as redirect cases where its impossible to - length to bound a valid memory region. In these cases just use - 'wcscmp'. */ - shrq $56, %rcx - jnz __wcscmp_evex + .section .text.evex, "ax", @progbits + .align 16 + .type STRCMP, @function + .globl STRCMP + .hidden STRCMP + +# ifdef USE_AS_STRCASECMP_L +ENTRY (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax + mov %fs:(%rax), %LOCALE_REG_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END (STRCASECMP) + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ +# endif + + .p2align 4 +STRCMP: + cfi_startproc + _CET_ENDBR + CALL_MCOUNT + +# if defined USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales with + encodings not matching ASCII for single bytes. 
*/ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP +# else + mov (%LOCALE_REG), %RAX_LP # endif - /* Convert units: from wide to byte char. */ - shl $2, %RDX_LP + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) + jne STRCASECMP_NONASCII + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE +# endif + +# ifdef USE_AS_STRNCMP + /* Don't overwrite LOCALE_REG (rcx) until we have pass + L(one_or_less). Otherwise we might use the wrong locale in + the OVERFLOW_STRCMP (strcasecmp_l). */ +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx # endif - /* Register %r11 tracks the maximum offset. */ - mov %RDX_LP, %R11_LP + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also + test cases where length >= 2^63. These very large sizes can be + handled with strcmp as there is no way for that length to + actually bound the buffer. */ + jle L(one_or_less) +# endif + +# if defined USE_AS_STRCASECMP_L + .section .rodata.cst32, "aM", @progbits, 32 + .align 32 +L(lcase_min): + .quad 0x4141414141414141 + .quad 0x4141414141414141 + .quad 0x4141414141414141 + .quad 0x4141414141414141 +L(lcase_max): + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a + .quad 0x1a1a1a1a1a1a1a1a +L(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM # endif + movl %edi, %eax - xorl %edx, %edx - /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ - vpxorq %XMMZERO, %XMMZERO, %XMMZERO orl %esi, %eax - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax - jg L(cross_page) - /* Start comparing 4 vectors. */ + /* Shift out the bits irrelivant to page boundary ([63:12]). 
*/ + sall $20, %eax + /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ + cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax + ja L(page_cross) + +L(no_page_cross): + /* Safe to compare 4x vectors. */ VMOVU (%rdi), %YMM0 - - /* Each bit set in K2 represents a non-null CHAR in YMM0. */ VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR in YMM0 and 32 bytes at (%rsi). */ - VPCMP $0, (%rsi), %YMM0, %k1{%k2} - + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} kmovd %k1, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx -# else - incl %ecx -# endif - je L(next_3_vectors) - tzcntl %ecx, %edx -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %edx -# endif # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx) is after the maximum - offset (%r11). */ - cmpq %r11, %rdx - jae L(zero) + cmpq $CHAR_PER_VEC, %rdx + jbe L(vec_0_test_len) # endif + + /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for + wcscmp/wcsncmp. */ + + /* All 1s represents all equals. TESTEQ will overflow to zero in + all equals case. Otherwise 1s will carry until position of first + mismatch. */ + TESTEQ %ecx + jz L(more_3x_vec) + + .p2align 4,, 4 +L(return_vec_0): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP + movl (%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - je L(return) -L(wcscmp_return): + cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret0) setl %al negl %eax orl $1, %eax -L(return): # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif +L(ret0): ret -L(return_vec_size): - tzcntl %ecx, %edx -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %edx -# endif # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after - the maximum offset (%r11). */ - addq $VEC_SIZE, %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP + .p2align 4,, 4 +L(vec_0_test_len): + notl %ecx + bzhil %edx, %ecx, %eax + jnz L(return_vec_0) + /* Align if will cross fetch block. */ + .p2align 4,, 2 +L(ret_zero): xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + ret + + .p2align 4,, 5 +L(one_or_less): +# ifdef USE_AS_STRCASECMP_L + /* Set locale argument for strcasecmp. */ + movq %LOCALE_REG, %rdx # endif -# else + jb L(ret_zero) + /* 'nbe' covers the case where length is negative (large + unsigned). */ + jnbe OVERFLOW_STRCMP # ifdef USE_AS_WCSCMP + movl (%rdi), %edx xorl %eax, %eax - movl VEC_SIZE(%rdi, %rdx), %ecx - cmpl VEC_SIZE(%rsi, %rdx), %ecx - jne L(wcscmp_return) + cmpl (%rsi), %edx + je L(ret1) + setl %al + negl %eax + orl $1, %eax # else - movzbl VEC_SIZE(%rdi, %rdx), %eax - movzbl VEC_SIZE(%rsi, %rdx), %edx - subl %edx, %eax + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif -# endif +L(ret1): ret +# endif -L(return_2_vec_size): - tzcntl %ecx, %edx + .p2align 4,, 10 +L(return_vec_1): + tzcntl %ecx, %ecx +# ifdef USE_AS_STRNCMP + /* rdx must be > CHAR_PER_VEC so its safe to subtract without + worrying about underflow. */ + addq $-CHAR_PER_VEC, %rdx + cmpq %rcx, %rdx + jbe L(ret_zero) +# endif # ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %edx + movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax + cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret2) + setl %al + negl %eax + orl $1, %eax +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif +L(ret2): + ret + + .p2align 4,, 10 # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is - after the maximum offset (%r11). */ - addq $(VEC_SIZE * 2), %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) +L(return_vec_3): +# if CHAR_PER_VEC <= 16 + sall $CHAR_PER_VEC, %ecx # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + salq $CHAR_PER_VEC, %rcx # endif +# endif +L(return_vec_2): +# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) + tzcntl %ecx, %ecx # else -# ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx - cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax - movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx - subl %edx, %eax -# endif + tzcntq %rcx, %rcx # endif - ret -L(return_3_vec_size): - tzcntl %ecx, %edx -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %edx -# endif # ifdef USE_AS_STRNCMP - /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is - after the maximum offset (%r11). 
*/ - addq $(VEC_SIZE * 3), %rdx - cmpq %r11, %rdx - jae L(zero) -# ifdef USE_AS_WCSCMP + cmpq %rcx, %rdx + jbe L(ret_zero) +# endif + +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax -# endif + cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret3) + setl %al + negl %eax + orl $1, %eax # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax +# endif +L(ret3): + ret + +# ifndef USE_AS_STRNCMP + .p2align 4,, 10 +L(return_vec_3): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax - movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx - cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx - jne L(wcscmp_return) + cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret4) + setl %al + negl %eax + orl $1, %eax # else - movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax - movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx - subl %edx, %eax + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax # endif -# endif +L(ret4): ret +# endif - .p2align 4 -L(next_3_vectors): - VMOVU VEC_SIZE(%rdi), %YMM0 - /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + /* 32 byte align here ensures the main loop is ideally aligned + for DSB. */ + .p2align 5 +L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM0 and 32 bytes at VEC_SIZE(%rsi). 
*/ - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} kmovd %k1, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx -# else - incl %ecx + TESTEQ %ecx + jnz L(return_vec_1) + +# ifdef USE_AS_STRNCMP + subq $(CHAR_PER_VEC * 2), %rdx + jbe L(ret_zero) # endif - jne L(return_vec_size) VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 - /* Each bit set in K2 represents a non-null CHAR in YMM0. */ VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} kmovd %k1, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx -# else - incl %ecx -# endif - jne L(return_2_vec_size) + TESTEQ %ecx + jnz L(return_vec_2) VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 - /* Each bit set in K2 represents a non-null CHAR in YMM0. */ VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) + +# ifdef USE_AS_STRNCMP + cmpq $(CHAR_PER_VEC * 2), %rdx + jbe L(ret_zero) +# endif + + # ifdef USE_AS_WCSCMP - subl $0xff, %ecx + /* any non-zero positive value that doesn't inference with 0x1. + */ + movl $2, %r8d + # else - incl %ecx -# endif - jne L(return_3_vec_size) -L(main_loop_header): - leaq (VEC_SIZE * 4)(%rdi), %rdx - movl $PAGE_SIZE, %ecx - /* Align load via RAX. */ - andq $-(VEC_SIZE * 4), %rdx - subq %rdi, %rdx - leaq (%rdi, %rdx), %rax + xorl %r8d, %r8d +# endif + + /* The prepare labels are various entry points from the page + cross logic. */ +L(prepare_loop): + # ifdef USE_AS_STRNCMP - /* Starting from this point, the maximum offset, or simply the - 'offset', DECREASES by the same amount when base pointers are - moved forward. 
Return 0 when: - 1) On match: offset <= the matched vector index. - 2) On mistmach, offset is before the mistmatched index. - */ - subq %rdx, %r11 - jbe L(zero) -# endif - addq %rsi, %rdx - movq %rdx, %rsi - andl $(PAGE_SIZE - 1), %esi - /* Number of bytes before page crossing. */ - subq %rsi, %rcx - /* Number of VEC_SIZE * 4 blocks before page crossing. */ - shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx - /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ - movl %ecx, %esi - jmp L(loop_start) +# ifdef USE_AS_WCSCMP +L(prepare_loop_no_len): + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + shrl $2, %ecx + leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx +# else + /* Store N + (VEC_SIZE * 4) and place check at the begining of + the loop. */ + leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx +L(prepare_loop_no_len): +# endif +# else +L(prepare_loop_no_len): +# endif + + /* Align s1 and adjust s2 accordingly. */ + subq %rdi, %rsi + andq $-(VEC_SIZE * 4), %rdi +L(prepare_loop_readj): + addq %rdi, %rsi +# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) + subq %rdi, %rdx +# endif + +L(prepare_loop_aligned): + /* eax stores distance from rsi to next page cross. These cases + need to be handled specially as the 4x loop could potentially + read memory past the length of s1 or s2 and across a page + boundary. */ + movl $-(VEC_SIZE * 4), %eax + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + + /* Loop 4x comparisons at a time. */ .p2align 4 L(loop): + + /* End condition for strncmp. */ # ifdef USE_AS_STRNCMP - /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease - the maximum offset (%r11) by the same amount. */ - subq $(VEC_SIZE * 4), %r11 - jbe L(zero) -# endif - addq $(VEC_SIZE * 4), %rax - addq $(VEC_SIZE * 4), %rdx -L(loop_start): - testl %esi, %esi - leal -1(%esi), %esi - je L(loop_cross_page) -L(back_to_loop): - /* Main loop, comparing 4 vectors are a time. 
*/ - VMOVA (%rax), %YMM0 - VMOVA VEC_SIZE(%rax), %YMM2 - VMOVA (VEC_SIZE * 2)(%rax), %YMM4 - VMOVA (VEC_SIZE * 3)(%rax), %YMM6 + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(ret_zero) +# endif + + subq $-(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rsi + + /* Check if rsi loads will cross a page boundary. */ + addl $-(VEC_SIZE * 4), %eax + jnb L(page_cross_during_loop) + + /* Loop entry after handling page cross during loop. */ +L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 + VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 VPMINU %YMM0, %YMM2, %YMM8 VPMINU %YMM4, %YMM6, %YMM9 - /* A zero CHAR in YMM8 means that there is a null CHAR. */ - VPMINU %YMM8, %YMM9, %YMM8 + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +# else + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 + TOLOWER_YMM (%YMM0, %YMM1) + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 + TOLOWER_YMM (%YMM2, %YMM3) + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 + TOLOWER_YMM (%YMM4, %YMM5) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 + TOLOWER_YMM (%YMM6, %YMM7) + vpxorq %YMM0, %YMM1, %YMM1 + vpxorq %YMM2, %YMM3, %YMM3 + vpxorq %YMM4, %YMM5, %YMM5 + vpternlogd $0xde, %YMM7, %YMM1, %YMM6 +# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 - /* Each bit set in K1 represents a non-null CHAR in YMM8. */ - VPTESTM %YMM8, %YMM8, %k1 - /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. 
*/ - vpxorq (%rdx), %YMM0, %YMM1 - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 - vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 - vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 + /* A non-zero CHAR in YMM6 represents a mismatch. */ + VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG - vporq %YMM1, %YMM3, %YMM9 - vporq %YMM5, %YMM7, %YMM10 + TESTEQ %LOOP_REG + jz L(loop) - /* A non-zero CHAR in YMM9 represents a mismatch. */ - vporq %YMM9, %YMM10, %YMM9 - /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ - VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} - kmovd %k0, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx -# else - incl %ecx + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 + VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 + VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) + + + /* Handle VEC 2 and 3 without branches. */ +L(return_vec_2_3_end): +# ifdef USE_AS_STRNCMP + subq $(CHAR_PER_VEC * 2), %rdx + jbe L(ret_zero_end) # endif - je L(loop) - /* Each bit set in K1 represents a non-null CHAR in YMM0. */ - VPTESTM %YMM0, %YMM0, %k1 - /* Each bit cleared in K0 represents a mismatch or a null CHAR - in YMM0 and (%rdx). */ - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + VPTESTM %YMM4, %YMM4, %k1 + VPTESTNM %YMM5, %YMM5, %k0{%k1} kmovd %k0, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx + TESTEQ %ecx +# if CHAR_PER_VEC <= 16 + sall $CHAR_PER_VEC, %LOOP_REG + orl %ecx, %LOOP_REG # else - incl %ecx -# endif - je L(test_vec) - tzcntl %ecx, %ecx -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %ecx + salq $CHAR_PER_VEC, %LOOP_REG64 + orq %rcx, %LOOP_REG64 +# endif +L(return_vec_3_end): + /* LOOP_REG contains matches for null/mismatch from the loop. 
If + VEC 0,1,and 2 all have no null and no mismatches then mismatch + must entirely be from VEC 3 which is fully represented by + LOOP_REG. */ +# if CHAR_PER_VEC <= 16 + tzcntl %LOOP_REG, %LOOP_REG +# else + tzcntq %LOOP_REG64, %LOOP_REG64 # endif # ifdef USE_AS_STRNCMP - cmpq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + cmpq %LOOP_REG64, %rdx + jbe L(ret_zero_end) +# endif + +# ifdef USE_AS_WCSCMP + movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx + je L(ret5) + setl %al + negl %eax + xorl %r8d, %eax # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret5): ret - .p2align 4 -L(test_vec): # ifdef USE_AS_STRNCMP - /* The first vector matched. Return 0 if the maximum offset - (%r11) <= VEC_SIZE. */ - cmpq $VEC_SIZE, %r11 - jbe L(zero) -# endif - /* Each bit set in K1 represents a non-null CHAR in YMM2. */ - VPTESTM %YMM2, %YMM2, %k1 - /* Each bit cleared in K0 represents a mismatch or a null CHAR - in YMM2 and VEC_SIZE(%rdx). */ - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} - kmovd %k0, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx -# else - incl %ecx -# endif - je L(test_2_vec) - tzcntl %ecx, %edi -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %edi + .p2align 4,, 2 +L(ret_zero_end): + xorl %eax, %eax + ret # endif + + + /* The L(return_vec_N_end) differ from L(return_vec_N) in that + they use the value of `r8` to negate the return value. This is + because the page cross logic can swap `rdi` and `rsi`. */ + .p2align 4,, 10 # ifdef USE_AS_STRNCMP - addq $VEC_SIZE, %rdi - cmpq %rdi, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rdi), %ecx - cmpl (%rdx, %rdi), %ecx - jne L(wcscmp_return) +L(return_vec_1_end): +# if CHAR_PER_VEC <= 16 + sall $CHAR_PER_VEC, %ecx # else - movzbl (%rax, %rdi), %eax - movzbl (%rdx, %rdi), %edx - subl %edx, %eax + salq $CHAR_PER_VEC, %rcx # endif +# endif +L(return_vec_0_end): +# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) + tzcntl %ecx, %ecx # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl VEC_SIZE(%rsi, %rdi), %ecx - cmpl VEC_SIZE(%rdx, %rdi), %ecx - jne L(wcscmp_return) -# else - movzbl VEC_SIZE(%rax, %rdi), %eax - movzbl VEC_SIZE(%rdx, %rdi), %edx - subl %edx, %eax -# endif + tzcntq %rcx, %rcx # endif - ret - .p2align 4 -L(test_2_vec): # ifdef USE_AS_STRNCMP - /* The first 2 vectors matched. Return 0 if the maximum offset - (%r11) <= 2 * VEC_SIZE. */ - cmpq $(VEC_SIZE * 2), %r11 - jbe L(zero) + cmpq %rcx, %rdx + jbe L(ret_zero_end) # endif - /* Each bit set in K1 represents a non-null CHAR in YMM4. */ - VPTESTM %YMM4, %YMM4, %k1 - /* Each bit cleared in K0 represents a mismatch or a null CHAR - in YMM4 and (VEC_SIZE * 2)(%rdx). */ - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} - kmovd %k0, %ecx + # ifdef USE_AS_WCSCMP - subl $0xff, %ecx + movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax + cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret6) + setl %al + negl %eax + /* This is the non-zero case for `eax` so just xorl with `r8d` + to flip if `rdi` and `rsi` were swapped.
*/ + xorl %r8d, %eax # else - incl %ecx -# endif - je L(test_3_vec) - tzcntl %ecx, %edi -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %edi + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` were swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ + xorl %r8d, %eax + subl %r8d, %eax # endif -# ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 2), %rdi - cmpq %rdi, %r11 - jbe L(zero) +L(ret6): + ret + +# ifndef USE_AS_STRNCMP + .p2align 4,, 10 +L(return_vec_1_end): + tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP - movq %rax, %rsi + movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx xorl %eax, %eax - movl (%rsi, %rdi), %ecx - cmpl (%rdx, %rdi), %ecx - jne L(wcscmp_return) + cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret7) + setl %al + negl %eax + xorl %r8d, %eax # else - movzbl (%rax, %rdi), %eax - movzbl (%rdx, %rdi), %edx - subl %edx, %eax + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret7): + ret +# endif + + + /* Page cross in rsi in next 4x VEC. */ + + /* TODO: Improve logic here. */ + .p2align 4,, 10 +L(page_cross_during_loop): + /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + + /* Optimistically rsi and rdi are both aligned in which case we + don't need any logic here. */ + cmpl $-(VEC_SIZE * 4), %eax + /* Don't adjust eax before jumping back to loop and we will + never hit page cross case again. */ + je L(loop_skip_page_cross_check) + + /* Check if we can safely load a VEC.
*/ + cmpl $-(VEC_SIZE * 3), %eax + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ + cmpl $-(VEC_SIZE * 2), %eax + jg L(more_2x_vec_till_page_cross) + + .p2align 4,, 4 +L(less_1x_vec_till_page_cross): + subl $-(VEC_SIZE * 4), %eax + /* Guaranteed safe to read from rdi - VEC_SIZE here. The only + concerning case is first iteration if incoming s1 was near start + of a page and s2 near end. If s1 was near the start of the page + we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe + to read back -VEC_SIZE. If rdi is truly at the start of a page + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +# ifdef USE_AS_WCSCMP + movl $-1, %r10d + movl %esi, %ecx + andl $(VEC_SIZE - 1), %ecx + shrl $2, %ecx + shlxl %ecx, %r10d, %ecx + movzbl %cl, %r10d # else + movl $-1, %ecx + shlxl %esi, %ecx, %r10d +# endif + + kmovd %k1, %ecx + notl %ecx + + +# ifdef USE_AS_STRNCMP # ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx - cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx - jne L(wcscmp_return) + /* NB: strcasecmp not used with WCSCMP so this access to r11 is + safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx # else - movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax - movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx - subl %edx, %eax + cmpq %rax, %rdx # endif + jbe L(return_page_cross_end_check) # endif - ret + movl %eax, %OFFSET_REG - .p2align 4 -L(test_3_vec): -# ifdef USE_AS_STRNCMP - /* The first 3 vectors matched.
Return 0 if the maximum offset - (%r11) <= 3 * VEC_SIZE. */ - cmpq $(VEC_SIZE * 3), %r11 - jbe L(zero) -# endif - /* Each bit set in K1 represents a non-null CHAR in YMM6. */ - VPTESTM %YMM6, %YMM6, %k1 - /* Each bit cleared in K0 represents a mismatch or a null CHAR - in YMM6 and (VEC_SIZE * 3)(%rdx). */ - VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} - kmovd %k0, %ecx -# ifdef USE_AS_WCSCMP - subl $0xff, %ecx + /* Readjust eax before potentially returning to the loop. */ + addl $(PAGE_SIZE - VEC_SIZE * 4), %eax + + andl %r10d, %ecx + jz L(loop_skip_page_cross_check) + + .p2align 4,, 3 +L(return_page_cross_end): + tzcntl %ecx, %ecx + +# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) + leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx +L(return_page_cross_cmp_mem): # else - incl %ecx + addl %OFFSET_REG, %ecx # endif - tzcntl %ecx, %ecx # ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %ecx + movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax + cmpl VEC_OFFSET(%rsi, %rcx), %edx + je L(ret8) + setl %al + negl %eax + xorl %r8d, %eax +# else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret8): + ret + # ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 3), %rcx - cmpq %rcx, %r11 - jbe L(zero) + .p2align 4,, 10 +L(return_page_cross_end_check): + andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx # ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %esi - cmpl (%rdx, %rcx), %esi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax + sall $2, %edx # endif -# else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + cmpl %ecx, %edx + ja L(return_page_cross_cmp_mem) xorl %eax, %eax - movl (VEC_SIZE * 3)(%rsi, %rcx), %esi - cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi - jne 
L(wcscmp_return) -# else - movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax - movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx - subl %edx, %eax -# endif -# endif ret - - .p2align 4 -L(loop_cross_page): - xorl %r10d, %r10d - movq %rdx, %rcx - /* Align load via RDX. We load the extra ECX bytes which should - be ignored. */ - andl $((VEC_SIZE * 4) - 1), %ecx - /* R10 is -RCX. */ - subq %rcx, %r10 - - /* This works only if VEC_SIZE * 2 == 64. */ -# if (VEC_SIZE * 2) != 64 -# error (VEC_SIZE * 2) != 64 # endif - /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ - cmpl $(VEC_SIZE * 2), %ecx - jge L(loop_cross_page_2_vec) - VMOVU (%rax, %r10), %YMM2 - VMOVU VEC_SIZE(%rax, %r10), %YMM3 + .p2align 4,, 10 +L(more_2x_vec_till_page_cross): + /* If more 2x vec till cross we will complete a full loop + iteration here. */ - /* Each bit set in K2 represents a non-null CHAR in YMM2. */ - VPTESTM %YMM2, %YMM2, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM2 and 32 bytes at (%rdx, %r10). */ - VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} - kmovd %k1, %r9d - /* Don't use subl since it is the lower 16/32 bits of RDI - below. */ - notl %r9d -# ifdef USE_AS_WCSCMP - /* Only last 8 bits are valid. */ - andl $0xff, %r9d -# endif - - /* Each bit set in K4 represents a non-null CHAR in YMM3. */ - VPTESTM %YMM3, %YMM3, %k4 - /* Each bit cleared in K3 represents a mismatch or a null CHAR - in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ - VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} - kmovd %k3, %edi - /* Must use notl %edi here as lower bits are for CHAR - comparisons potentially out of range thus can be 0 without - indicating mismatch. */ - notl %edi -# ifdef USE_AS_WCSCMP - /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ - andl $0xff, %edi + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) + +# ifdef USE_AS_STRNCMP + cmpq $(CHAR_PER_VEC * 2), %rdx + jbe L(ret_zero_in_loop_page_cross) # endif -# ifdef USE_AS_WCSCMP - /* NB: Each bit in EDI/R9D represents 4-byte element. */ - sall $8, %edi - /* NB: Divide shift count by 4 since each bit in K1 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG32 - sarl $2, %SHIFT_REG32 - - /* Each bit in EDI represents a null CHAR or a mismatch. */ - orl %r9d, %edi -# else - salq $32, %rdi + subl $-(VEC_SIZE * 4), %eax - /* Each bit in RDI represents a null CHAR or a mismatch. */ - orq %r9, %rdi -# endif + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) - /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ - shrxq %SHIFT_REG64, %rdi, %rdi - testq %rdi, %rdi - je L(loop_cross_page_2_vec) - tzcntq %rdi, %rcx -# ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %ecx -# endif # ifdef USE_AS_STRNCMP - cmpq %rcx, %r11 - jbe L(zero) + /* Must check length here as length might preclude reading next + page. */ # ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) + /* NB: strcasecmp not used with WCSCMP so this access to r11 is + safe.
*/ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx # else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax + cmpq %rax, %rdx # endif + jbe L(ret_zero_in_loop_page_cross) +# endif + + /* Finish the loop. */ + VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 # else -# ifdef USE_AS_WCSCMP - movq %rax, %rsi + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 + TOLOWER_YMM (%YMM4, %YMM5) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 + TOLOWER_YMM (%YMM6, %YMM7) + vpxorq %YMM4, %YMM5, %YMM5 + vpternlogd $0xde, %YMM7, %YMM5, %YMM6 +# endif + VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) + + /* Best for code size to include ucond-jmp here. Would be faster + if this case is hot to duplicate the L(return_vec_2_3_end) code + as fall-through and have jump back to loop on mismatch + comparison. */ + subq $-(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rsi + addl $(PAGE_SIZE - VEC_SIZE * 8), %eax +# ifdef USE_AS_STRNCMP + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_skip_page_cross_check) +L(ret_zero_in_loop_page_cross): xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif -# endif ret +# else + jmp L(loop_skip_page_cross_check) +# endif - .p2align 4 -L(loop_cross_page_2_vec): - /* The first VEC_SIZE * 2 bytes match or are ignored. */ - VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 - VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 - VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). 
*/ - VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} - kmovd %k1, %r9d - /* Don't use subl since it is the lower 16/32 bits of RDI - below. */ - notl %r9d -# ifdef USE_AS_WCSCMP - /* Only last 8 bits are valid. */ - andl $0xff, %r9d -# endif - - VPTESTM %YMM1, %YMM1, %k4 - /* Each bit cleared in K3 represents a mismatch or a null CHAR - in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ - VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} - kmovd %k3, %edi - /* Must use notl %edi here as lower bits are for CHAR - comparisons potentially out of range thus can be 0 without - indicating mismatch. */ - notl %edi -# ifdef USE_AS_WCSCMP - /* Don't use subl since it is the upper 8 bits of EDI below. */ - andl $0xff, %edi + .p2align 4,, 10 +L(return_vec_page_cross_0): + addl $-VEC_SIZE, %eax +L(return_vec_page_cross_1): + tzcntl %ecx, %ecx +# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx +# ifdef USE_AS_STRNCMP +# ifdef USE_AS_WCSCMP + /* Must divide ecx instead of multiply rdx due to overflow. */ + movl %ecx, %eax + shrl $2, %eax + cmpq %rax, %rdx +# else + cmpq %rcx, %rdx +# endif + jbe L(ret_zero_in_loop_page_cross) +# endif +# else + addl %eax, %ecx # endif # ifdef USE_AS_WCSCMP - /* NB: Each bit in EDI/R9D represents 4-byte element. */ - sall $8, %edi - - /* Each bit in EDI represents a null CHAR or a mismatch. */ - orl %r9d, %edi + movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax + cmpl VEC_OFFSET(%rsi, %rcx), %edx + je L(ret9) + setl %al + negl %eax + xorl %r8d, %eax # else - salq $32, %rdi + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +# endif +L(ret9): + ret - /* Each bit in RDI represents a null CHAR or a mismatch. */ - orq %r9, %rdi + + .p2align 4,, 10 +L(page_cross): +# ifndef USE_AS_STRNCMP + /* If both are VEC aligned we don't need any special logic here. 
+ Only valid for strcmp where stop condition is guranteed to be + reachable by just reading memory. */ + testl $((VEC_SIZE - 1) << 20), %eax + jz L(no_page_cross) # endif - xorl %r8d, %r8d - /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ - subl $(VEC_SIZE * 2), %ecx - jle 1f - /* R8 has number of bytes skipped. */ - movl %ecx, %r8d + movl %edi, %eax + movl %esi, %ecx + andl $(PAGE_SIZE - 1), %eax + andl $(PAGE_SIZE - 1), %ecx + + xorl %OFFSET_REG, %OFFSET_REG + + /* Check which is closer to page cross, s1 or s2. */ + cmpl %eax, %ecx + jg L(page_cross_s2) + + /* The previous page cross check has false positives. Check for + true positive as page cross logic is very expensive. */ + subl $(PAGE_SIZE - VEC_SIZE * 4), %eax + jbe L(no_page_cross) + + + /* Set r8 to not interfere with normal return value (rdi and rsi + did not swap). */ # ifdef USE_AS_WCSCMP - /* NB: Divide shift count by 4 since each bit in RDI represent 4 - bytes. */ - sarl $2, %ecx - /* Skip ECX bytes. */ - shrl %cl, %edi + /* any non-zero positive value that doesn't inference with 0x1. + */ + movl $2, %r8d # else - /* Skip ECX bytes. */ - shrq %cl, %rdi + xorl %r8d, %r8d # endif -1: - /* Before jumping back to the loop, set ESI to the number of - VEC_SIZE * 4 blocks before page crossing. */ - movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi - testq %rdi, %rdi + /* Check if less than 1x VEC till page cross. */ + subl $(VEC_SIZE * 3), %eax + jg L(less_1x_vec_till_page) + + + /* If more than 1x VEC till page cross, loop throuh safely + loadable memory until within 1x VEC of page cross. 
*/ + .p2align 4,, 8 +L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) + addl $CHAR_PER_VEC, %OFFSET_REG # ifdef USE_AS_STRNCMP - /* At this point, if %rdi value is 0, it already tested - VEC_SIZE*4+%r10 byte starting from %rax. This label - checks whether strncmp maximum offset reached or not. */ - je L(string_nbyte_offset_check) -# else - je L(back_to_loop) + cmpq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross) # endif - tzcntq %rdi, %rcx + addl $VEC_SIZE, %eax + jl L(page_cross_loop) + # ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %ecx + shrl $2, %eax # endif - addq %r10, %rcx - /* Adjust for number of bytes skipped. */ - addq %r8, %rcx + + + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already + loaded at least 1 VEC from rsi it is also guranteed to be safe. 
+ */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx # ifdef USE_AS_STRNCMP - addq $(VEC_SIZE * 2), %rcx - subq %rcx, %r11 - jbe L(zero) -# ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (%rsi, %rcx), %edi - cmpl (%rdx, %rcx), %edi - jne L(wcscmp_return) -# else - movzbl (%rax, %rcx), %eax - movzbl (%rdx, %rcx), %edx - subl %edx, %eax -# endif -# else + leal CHAR_PER_VEC(%OFFSET_REG64), %eax + cmpq %rax, %rdx + jbe L(check_ret_vec_page_cross2) # ifdef USE_AS_WCSCMP - movq %rax, %rsi - xorl %eax, %eax - movl (VEC_SIZE * 2)(%rsi, %rcx), %edi - cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi - jne L(wcscmp_return) + addq $-(CHAR_PER_VEC * 2), %rdx # else - movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax - movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx - subl %edx, %eax + addq %rdi, %rdx # endif # endif - ret + TESTEQ %ecx + jz L(prepare_loop_no_len) -# ifdef USE_AS_STRNCMP -L(string_nbyte_offset_check): - leaq (VEC_SIZE * 4)(%r10), %r10 - cmpq %r10, %r11 - jbe L(zero) - jmp L(back_to_loop) -# endif - - .p2align 4 -L(cross_page_loop): - /* Check one byte/dword at a time. */ -# ifdef USE_AS_WCSCMP - cmpl %ecx, %eax -# else - subl %ecx, %eax -# endif - jne L(different) - addl $SIZE_OF_CHAR, %edx - cmpl $(VEC_SIZE * 4), %edx - je L(main_loop_header) -# ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) + .p2align 4,, 4 +L(ret_vec_page_cross): +# ifndef USE_AS_STRNCMP +L(check_ret_vec_page_cross): # endif + tzcntl %ecx, %ecx + addl %OFFSET_REG, %ecx +L(ret_vec_page_cross_cont): # ifdef USE_AS_WCSCMP - movl (%rdi, %rdx), %eax - movl (%rsi, %rdx), %ecx + movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax + cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx + je L(ret12) + setl %al + negl %eax + xorl %r8d, %eax # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %ecx -# endif - /* Check null CHAR. 
*/ - testl %eax, %eax - jne L(cross_page_loop) - /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED - comparisons. */ + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %ecx) subl %ecx, %eax -# ifndef USE_AS_WCSCMP -L(different): + xorl %r8d, %eax + subl %r8d, %eax # endif +L(ret12): ret -# ifdef USE_AS_WCSCMP - .p2align 4 -L(different): - /* Use movl to avoid modifying EFLAGS. */ - movl $0, %eax - setl %al - negl %eax - orl $1, %eax - ret -# endif # ifdef USE_AS_STRNCMP - .p2align 4 -L(zero): + .p2align 4,, 10 +L(check_ret_vec_page_cross2): + TESTEQ %ecx +L(check_ret_vec_page_cross): + tzcntl %ecx, %ecx + addl %OFFSET_REG, %ecx + cmpq %rcx, %rdx + ja L(ret_vec_page_cross_cont) + .p2align 4,, 2 +L(ret_zero_page_cross): xorl %eax, %eax ret +# endif - .p2align 4 -L(char0): -# ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (%rdi), %ecx - cmpl (%rsi), %ecx - jne L(wcscmp_return) -# else - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - subl %ecx, %eax -# endif - ret + .p2align 4,, 4 +L(page_cross_s2): + /* Ensure this is a true page cross. */ + subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx + jbe L(no_page_cross) + + + movl %ecx, %eax + movq %rdi, %rcx + movq %rsi, %rdi + movq %rcx, %rsi + + /* set r8 to negate return value as rdi and rsi swapped. */ +# ifdef USE_AS_WCSCMP + movl $-4, %r8d +# else + movl $-1, %r8d # endif + xorl %OFFSET_REG, %OFFSET_REG - .p2align 4 -L(last_vector): - addq %rdx, %rdi - addq %rdx, %rsi -# ifdef USE_AS_STRNCMP - subq %rdx, %r11 + /* Check if more than 1x VEC till page cross. */ + subl $(VEC_SIZE * 3), %eax + jle L(page_cross_loop) + + .p2align 4,, 6 +L(less_1x_vec_till_page): +# ifdef USE_AS_WCSCMP + shrl $2, %eax # endif - tzcntl %ecx, %edx + /* Find largest load size we can use. */ + cmpl $(16 / SIZE_OF_CHAR), %eax + ja L(less_16_till_page) + + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx # ifdef USE_AS_WCSCMP - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %edx + subl $0xf, %ecx +# else + incw %cx # endif + jnz L(check_ret_vec_page_cross) + movl $(16 / SIZE_OF_CHAR), %OFFSET_REG # ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) + cmpq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subl %eax, %OFFSET_REG +# else + /* Explicit check for 16 byte alignment. */ + subl %eax, %OFFSET_REG + jz L(prepare_loop) # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx # ifdef USE_AS_WCSCMP - xorl %eax, %eax - movl (%rdi, %rdx), %ecx - cmpl (%rsi, %rdx), %ecx - jne L(wcscmp_return) + subl $0xf, %ecx +# else + incw %cx +# endif + jnz L(check_ret_vec_page_cross) +# ifdef USE_AS_STRNCMP + addl $(16 / SIZE_OF_CHAR), %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subq $-(CHAR_PER_VEC * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi # else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %edx - subl %edx, %eax + leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi # endif + jmp L(prepare_loop_aligned) + +# ifdef USE_AS_STRNCMP + .p2align 4,, 2 +L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax ret +# endif - /* Comparing on page boundary region requires special treatment: - It must done one vector at the time, starting with the wider - ymm vector if possible, if not, with xmm. If fetching 16 bytes - (xmm) still passes the boundary, byte comparison must be done. - */ - .p2align 4 -L(cross_page): - /* Try one ymm vector at a time. 
*/ - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jg L(cross_page_1_vector) -L(loop_1_vector): - VMOVU (%rdi, %rdx), %YMM0 - VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in YMM0 and 32 bytes at (%rsi, %rdx). */ - VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + .p2align 4,, 10 +L(less_16_till_page): + cmpl $(24 / SIZE_OF_CHAR), %eax + ja L(less_8_till_page) + + /* Use 8 byte comparison. */ + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} kmovd %k1, %ecx # ifdef USE_AS_WCSCMP - subl $0xff, %ecx + subl $0x3, %ecx # else - incl %ecx + incb %cl # endif - jne L(last_vector) + jnz L(check_ret_vec_page_cross) - addl $VEC_SIZE, %edx - addl $VEC_SIZE, %eax # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) + cmpq $(8 / SIZE_OF_CHAR), %rdx + jbe L(ret_zero_page_cross_slow_case0) # endif - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jle L(loop_1_vector) -L(cross_page_1_vector): - /* Less than 32 bytes to check, try one xmm vector. */ - cmpl $(PAGE_SIZE - 16), %eax - jg L(cross_page_1_xmm) - VMOVU (%rdi, %rdx), %XMM0 + movl $(24 / SIZE_OF_CHAR), %OFFSET_REG + subl %eax, %OFFSET_REG - VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in XMM0 and 16 bytes at (%rsi, %rdx). */ - VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} kmovd %k1, %ecx # ifdef USE_AS_WCSCMP - subl $0xf, %ecx + subl $0x3, %ecx # else - subl $0xffff, %ecx + incb %cl # endif - jne L(last_vector) + jnz L(check_ret_vec_page_cross) + - addl $16, %edx -# ifndef USE_AS_WCSCMP - addl $16, %eax -# endif # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). 
*/ - cmpq %r11, %rdx - jae L(zero) + addl $(8 / SIZE_OF_CHAR), %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case0) + subq $-(CHAR_PER_VEC * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi +# else + leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi # endif + jmp L(prepare_loop_aligned) -L(cross_page_1_xmm): -# ifndef USE_AS_WCSCMP - /* Less than 16 bytes to check, try 8 byte vector. NB: No need - for wcscmp nor wcsncmp since wide char is 4 bytes. */ - cmpl $(PAGE_SIZE - 8), %eax - jg L(cross_page_8bytes) - vmovq (%rdi, %rdx), %XMM0 - vmovq (%rsi, %rdx), %XMM1 - VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in XMM0 and XMM1. */ - VPCMP $0, %XMM1, %XMM0, %k1{%k2} - kmovb %k1, %ecx + + + .p2align 4,, 10 +L(less_8_till_page): # ifdef USE_AS_WCSCMP - subl $0x3, %ecx + /* If using wchar then this is the only check before we reach + the page boundary. */ + movl (%rdi), %eax + movl (%rsi), %ecx + cmpl %ecx, %eax + jnz L(ret_less_8_wcs) +# ifdef USE_AS_STRNCMP + addq $-(CHAR_PER_VEC * 2), %rdx + /* We already checked for len <= 1 so cannot hit that case here. + */ +# endif + testl %eax, %eax + jnz L(prepare_loop) + ret + + .p2align 4,, 8 +L(ret_less_8_wcs): + setl %OFFSET_REG8 + negl %OFFSET_REG + movl %OFFSET_REG, %eax + xorl %r8d, %eax + ret + # else - subl $0xff, %ecx -# endif - jne L(last_vector) + cmpl $28, %eax + ja L(less_4_till_page) + + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) - addl $8, %edx - addl $8, %eax # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). 
*/ - cmpq %r11, %rdx - jae L(zero) + cmpq $4, %rdx + jbe L(ret_zero_page_cross_slow_case1) # endif + movl $(28 / SIZE_OF_CHAR), %OFFSET_REG + subl %eax, %OFFSET_REG -L(cross_page_8bytes): - /* Less than 8 bytes to check, try 4 byte vector. */ - cmpl $(PAGE_SIZE - 4), %eax - jg L(cross_page_4bytes) - vmovd (%rdi, %rdx), %XMM0 - vmovd (%rsi, %rdx), %XMM1 - - VPTESTM %YMM0, %YMM0, %k2 - /* Each bit cleared in K1 represents a mismatch or a null CHAR - in XMM0 and XMM1. */ - VPCMP $0, %XMM1, %XMM0, %k1{%k2} + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} kmovd %k1, %ecx -# ifdef USE_AS_WCSCMP - subl $0x1, %ecx -# else subl $0xf, %ecx -# endif - jne L(last_vector) + jnz L(check_ret_vec_page_cross) +# ifdef USE_AS_STRNCMP + addl $(4 / SIZE_OF_CHAR), %OFFSET_REG + subq %OFFSET_REG64, %rdx + jbe L(ret_zero_page_cross_slow_case1) + subq $-(CHAR_PER_VEC * 4), %rdx + + leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi +# else + leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi + leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi +# endif + jmp L(prepare_loop_aligned) + - addl $4, %edx # ifdef USE_AS_STRNCMP - /* Return 0 if the current offset (%rdx) >= the maximum offset - (%r11). */ - cmpq %r11, %rdx - jae L(zero) + .p2align 4,, 2 +L(ret_zero_page_cross_slow_case1): + xorl %eax, %eax + ret # endif -L(cross_page_4bytes): -# endif - /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ -# ifdef USE_AS_STRNCMP - cmpq %r11, %rdx - jae L(zero) -# endif -# ifdef USE_AS_WCSCMP - movl (%rdi, %rdx), %eax - movl (%rsi, %rdx), %ecx -# else - movzbl (%rdi, %rdx), %eax - movzbl (%rsi, %rdx), %ecx -# endif - testl %eax, %eax - jne L(cross_page_loop) - subl %ecx, %eax + .p2align 4,, 10 +L(less_4_till_page): + subq %rdi, %rsi + /* Extremely slow byte comparison loop. */ +L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx + TOLOWER_gpr (%rax, %eax) + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) + subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +# ifdef USE_AS_STRNCMP + decq %rdx + jz L(ret_zero_4_loop) +# endif + incq %rdi + /* end condition is reach page boundary (rdi is aligned). */ + testl $31, %edi + jnz L(less_4_loop) + leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi + addq $-(VEC_SIZE * 4), %rdi +# ifdef USE_AS_STRNCMP + subq $-(CHAR_PER_VEC * 4), %rdx +# endif + jmp L(prepare_loop_aligned) + +L(ret_zero_4_loop): + xorl %eax, %eax + ret +L(ret_less_4_loop): + xorl %r8d, %eax + subl %r8d, %eax ret -END (STRCMP) +# endif + cfi_endproc + .size STRCMP, .-STRCMP #endif diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 580feb90e..a9178ad25 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -41,13 +41,8 @@ # define UPDATE_STRNCMP_COUNTER #endif -#ifdef USE_AVX -# define SECTION avx -# define GLABEL(l) l##_avx -#else -# define SECTION sse4.2 -# define GLABEL(l) l##_sse42 -#endif +#define SECTION sse4.2 +#define GLABEL(l) l##_sse42 #define LABEL(l) .L##l @@ -88,9 +83,8 @@ ENTRY (GLABEL(__strcasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strcasecmp)) /* FALLTHROUGH to strcasecmp_l. 
*/ #endif @@ -99,29 +93,14 @@ ENTRY (GLABEL(__strncasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strncasecmp)) /* FALLTHROUGH to strncasecmp_l. */ #endif -#ifdef USE_AVX -# define movdqa vmovdqa -# define movdqu vmovdqu -# define pmovmskb vpmovmskb -# define pcmpistri vpcmpistri -# define psubb vpsubb -# define pcmpeqb vpcmpeqb -# define psrldq vpsrldq -# define pslldq vpslldq -# define palignr vpalignr -# define pxor vpxor -# define D(arg) arg, arg -#else -# define D(arg) arg -#endif +#define arg arg STRCMP_SSE42: cfi_startproc @@ -169,27 +148,22 @@ STRCMP_SSE42: #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -LABEL(belowupper): - .quad 0x4040404040404040 - .quad 0x4040404040404040 -LABEL(topupper): -# ifdef USE_AVX - .quad 0x5a5a5a5a5a5a5a5a - .quad 0x5a5a5a5a5a5a5a5a -# else - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -# endif -LABEL(touppermask): +LABEL(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +LABEL(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 +LABEL(case_add): .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa LABEL(belowupper)(%rip), %xmm4 -# define UCLOW_reg %xmm4 - movdqa LABEL(topupper)(%rip), %xmm5 -# define UCHIGH_reg %xmm5 - movdqa LABEL(touppermask)(%rip), %xmm6 -# define LCQWORD_reg %xmm6 + movdqa LABEL(lcase_min)(%rip), %xmm4 +# define LCASE_MIN_reg %xmm4 + movdqa LABEL(lcase_max)(%rip), %xmm5 +# define LCASE_MAX_reg %xmm5 + movdqa LABEL(case_add)(%rip), %xmm6 +# define CASE_ADD_reg %xmm6 #endif cmp $0x30, %ecx ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ @@ -198,43 +172,26 @@ LABEL(touppermask): movdqu (%rdi), %xmm1 movdqu (%rsi), %xmm2 #if defined USE_AS_STRCASECMP_L || 
defined USE_AS_STRNCASECMP_L -# ifdef USE_AVX -# define TOLOWER(reg1, reg2) \ - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ - vpandn %xmm7, %xmm8, %xmm8; \ - vpandn %xmm9, %xmm10, %xmm10; \ - vpand LCQWORD_reg, %xmm8, %xmm8; \ - vpand LCQWORD_reg, %xmm10, %xmm10; \ - vpor reg1, %xmm8, reg1; \ - vpor reg2, %xmm10, reg2 -# else -# define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm7; \ - movdqa UCHIGH_reg, %xmm8; \ - movdqa reg2, %xmm9; \ - movdqa UCHIGH_reg, %xmm10; \ - pcmpgtb UCLOW_reg, %xmm7; \ - pcmpgtb reg1, %xmm8; \ - pcmpgtb UCLOW_reg, %xmm9; \ - pcmpgtb reg2, %xmm10; \ - pand %xmm8, %xmm7; \ - pand %xmm10, %xmm9; \ - pand LCQWORD_reg, %xmm7; \ - pand LCQWORD_reg, %xmm9; \ - por %xmm7, reg1; \ - por %xmm9, reg2 -# endif +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ + paddb reg2, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm7; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm7; \ + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 + TOLOWER (%xmm1, %xmm2) #else # define TOLOWER(reg1, reg2) #endif - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ - pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ jnz LABEL(less16bytes)/* If not, find different value or null char */ @@ -258,7 +215,7 @@ LABEL(crosscache): xor %r8d, %r8d and $0xf, %ecx /* offset of rsi */ and $0xf, %eax /* offset of rdi */ - pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ cmp %eax, %ecx je LABEL(ashr_0) /* rsi and rdi relative offset same */ ja LABEL(bigger) @@ -272,7 +229,7 @@ LABEL(bigger): sub %rcx, %r9 lea LABEL(unaligned_table)(%rip), %r10 movslq (%r10, %r9,4), %r9 - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ lea (%r10, %r9), %r10 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ @@ -285,15 +242,15 @@ LABEL(bigger): LABEL(ashr_0): movdqa (%rsi), %xmm1 - pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L - pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ #else movdqa (%rdi), %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ #endif - psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %r9d shr %cl, %edx /* adjust 0xffff for offset */ shr %cl, %r9d /* adjust for 16-byte offset */ @@ -373,10 +330,10 @@ LABEL(ashr_0_exit_use): */ .p2align 4 LABEL(ashr_1): - pslldq $15, D(%xmm2) /* shift first string to align with second */ + pslldq $15, %xmm2 /* shift first string to align with second */ TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ - psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ pmovmskb %xmm2, %r9d shr %cl, %edx /* adjust 0xffff for offset */ shr %cl, %r9d /* adjust for 16-byte offset */ @@ -404,7 +361,7 @@ LABEL(loop_ashr_1_use): LABEL(nibble_ashr_1_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), D(%xmm0) + palignr $1, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -423,7 +380,7 @@ LABEL(nibble_ashr_1_restart_use): jg LABEL(nibble_ashr_1_use) movdqa (%rdi, %rdx), %xmm0 - palignr $1, -16(%rdi, %rdx), D(%xmm0) + palignr $1, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -443,7 +400,7 @@ LABEL(nibble_ashr_1_restart_use): LABEL(nibble_ashr_1_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $1, D(%xmm0) + psrldq $1, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || 
defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -461,10 +418,10 @@ LABEL(nibble_ashr_1_use): */ .p2align 4 LABEL(ashr_2): - pslldq $14, D(%xmm2) + pslldq $14, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -492,7 +449,7 @@ LABEL(loop_ashr_2_use): LABEL(nibble_ashr_2_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), D(%xmm0) + palignr $2, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -511,7 +468,7 @@ LABEL(nibble_ashr_2_restart_use): jg LABEL(nibble_ashr_2_use) movdqa (%rdi, %rdx), %xmm0 - palignr $2, -16(%rdi, %rdx), D(%xmm0) + palignr $2, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -531,7 +488,7 @@ LABEL(nibble_ashr_2_restart_use): LABEL(nibble_ashr_2_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $2, D(%xmm0) + psrldq $2, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -549,10 +506,10 @@ LABEL(nibble_ashr_2_use): */ .p2align 4 LABEL(ashr_3): - pslldq $13, D(%xmm2) + pslldq $13, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -580,7 +537,7 @@ LABEL(loop_ashr_3_use): LABEL(nibble_ashr_3_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), D(%xmm0) + palignr $3, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -599,7 +556,7 @@ LABEL(nibble_ashr_3_restart_use): jg LABEL(nibble_ashr_3_use) movdqa (%rdi, %rdx), %xmm0 - palignr $3, -16(%rdi, %rdx), D(%xmm0) + palignr $3, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined 
USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -619,7 +576,7 @@ LABEL(nibble_ashr_3_restart_use): LABEL(nibble_ashr_3_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $3, D(%xmm0) + psrldq $3, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -637,10 +594,10 @@ LABEL(nibble_ashr_3_use): */ .p2align 4 LABEL(ashr_4): - pslldq $12, D(%xmm2) + pslldq $12, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -669,7 +626,7 @@ LABEL(loop_ashr_4_use): LABEL(nibble_ashr_4_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), D(%xmm0) + palignr $4, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -688,7 +645,7 @@ LABEL(nibble_ashr_4_restart_use): jg LABEL(nibble_ashr_4_use) movdqa (%rdi, %rdx), %xmm0 - palignr $4, -16(%rdi, %rdx), D(%xmm0) + palignr $4, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -708,7 +665,7 @@ LABEL(nibble_ashr_4_restart_use): LABEL(nibble_ashr_4_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $4, D(%xmm0) + psrldq $4, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -726,10 +683,10 @@ LABEL(nibble_ashr_4_use): */ .p2align 4 LABEL(ashr_5): - pslldq $11, D(%xmm2) + pslldq $11, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -758,7 +715,7 @@ LABEL(loop_ashr_5_use): LABEL(nibble_ashr_5_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $5, -16(%rdi, %rdx), D(%xmm0) + palignr $5, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined 
USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -778,7 +735,7 @@ LABEL(nibble_ashr_5_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $5, -16(%rdi, %rdx), D(%xmm0) + palignr $5, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -798,7 +755,7 @@ LABEL(nibble_ashr_5_restart_use): LABEL(nibble_ashr_5_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $5, D(%xmm0) + psrldq $5, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -816,10 +773,10 @@ LABEL(nibble_ashr_5_use): */ .p2align 4 LABEL(ashr_6): - pslldq $10, D(%xmm2) + pslldq $10, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -848,7 +805,7 @@ LABEL(loop_ashr_6_use): LABEL(nibble_ashr_6_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), D(%xmm0) + palignr $6, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -867,7 +824,7 @@ LABEL(nibble_ashr_6_restart_use): jg LABEL(nibble_ashr_6_use) movdqa (%rdi, %rdx), %xmm0 - palignr $6, -16(%rdi, %rdx), D(%xmm0) + palignr $6, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -887,7 +844,7 @@ LABEL(nibble_ashr_6_restart_use): LABEL(nibble_ashr_6_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $6, D(%xmm0) + psrldq $6, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -905,10 +862,10 @@ LABEL(nibble_ashr_6_use): */ .p2align 4 LABEL(ashr_7): - pslldq $9, D(%xmm2) + pslldq $9, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr 
%cl, %edx shr %cl, %r9d @@ -937,7 +894,7 @@ LABEL(loop_ashr_7_use): LABEL(nibble_ashr_7_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), D(%xmm0) + palignr $7, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -956,7 +913,7 @@ LABEL(nibble_ashr_7_restart_use): jg LABEL(nibble_ashr_7_use) movdqa (%rdi, %rdx), %xmm0 - palignr $7, -16(%rdi, %rdx), D(%xmm0) + palignr $7, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a,(%rsi,%rdx), %xmm0 #else @@ -976,7 +933,7 @@ LABEL(nibble_ashr_7_restart_use): LABEL(nibble_ashr_7_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $7, D(%xmm0) + psrldq $7, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -994,10 +951,10 @@ LABEL(nibble_ashr_7_use): */ .p2align 4 LABEL(ashr_8): - pslldq $8, D(%xmm2) + pslldq $8, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1026,7 +983,7 @@ LABEL(loop_ashr_8_use): LABEL(nibble_ashr_8_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), D(%xmm0) + palignr $8, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1045,7 +1002,7 @@ LABEL(nibble_ashr_8_restart_use): jg LABEL(nibble_ashr_8_use) movdqa (%rdi, %rdx), %xmm0 - palignr $8, -16(%rdi, %rdx), D(%xmm0) + palignr $8, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1065,7 +1022,7 @@ LABEL(nibble_ashr_8_restart_use): LABEL(nibble_ashr_8_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $8, D(%xmm0) + psrldq $8, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined 
USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1083,10 +1040,10 @@ LABEL(nibble_ashr_8_use): */ .p2align 4 LABEL(ashr_9): - pslldq $7, D(%xmm2) + pslldq $7, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1116,7 +1073,7 @@ LABEL(loop_ashr_9_use): LABEL(nibble_ashr_9_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $9, -16(%rdi, %rdx), D(%xmm0) + palignr $9, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1135,7 +1092,7 @@ LABEL(nibble_ashr_9_restart_use): jg LABEL(nibble_ashr_9_use) movdqa (%rdi, %rdx), %xmm0 - palignr $9, -16(%rdi, %rdx), D(%xmm0) + palignr $9, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1155,7 +1112,7 @@ LABEL(nibble_ashr_9_restart_use): LABEL(nibble_ashr_9_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $9, D(%xmm0) + psrldq $9, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1173,10 +1130,10 @@ LABEL(nibble_ashr_9_use): */ .p2align 4 LABEL(ashr_10): - pslldq $6, D(%xmm2) + pslldq $6, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1205,7 +1162,7 @@ LABEL(loop_ashr_10_use): LABEL(nibble_ashr_10_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), D(%xmm0) + palignr $10, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1224,7 +1181,7 @@ LABEL(nibble_ashr_10_restart_use): jg LABEL(nibble_ashr_10_use) movdqa (%rdi, %rdx), %xmm0 - palignr $10, -16(%rdi, %rdx), D(%xmm0) + palignr $10, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && 
!defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1244,7 +1201,7 @@ LABEL(nibble_ashr_10_restart_use): LABEL(nibble_ashr_10_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $10, D(%xmm0) + psrldq $10, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1262,10 +1219,10 @@ LABEL(nibble_ashr_10_use): */ .p2align 4 LABEL(ashr_11): - pslldq $5, D(%xmm2) + pslldq $5, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1294,7 +1251,7 @@ LABEL(loop_ashr_11_use): LABEL(nibble_ashr_11_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), D(%xmm0) + palignr $11, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1313,7 +1270,7 @@ LABEL(nibble_ashr_11_restart_use): jg LABEL(nibble_ashr_11_use) movdqa (%rdi, %rdx), %xmm0 - palignr $11, -16(%rdi, %rdx), D(%xmm0) + palignr $11, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1333,7 +1290,7 @@ LABEL(nibble_ashr_11_restart_use): LABEL(nibble_ashr_11_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $11, D(%xmm0) + psrldq $11, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1351,10 +1308,10 @@ LABEL(nibble_ashr_11_use): */ .p2align 4 LABEL(ashr_12): - pslldq $4, D(%xmm2) + pslldq $4, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1383,7 +1340,7 @@ LABEL(loop_ashr_12_use): LABEL(nibble_ashr_12_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), D(%xmm0) + palignr $12, -16(%rdi, %rdx), %xmm0 #if 
!defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1402,7 +1359,7 @@ LABEL(nibble_ashr_12_restart_use): jg LABEL(nibble_ashr_12_use) movdqa (%rdi, %rdx), %xmm0 - palignr $12, -16(%rdi, %rdx), D(%xmm0) + palignr $12, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1422,7 +1379,7 @@ LABEL(nibble_ashr_12_restart_use): LABEL(nibble_ashr_12_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $12, D(%xmm0) + psrldq $12, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1440,10 +1397,10 @@ LABEL(nibble_ashr_12_use): */ .p2align 4 LABEL(ashr_13): - pslldq $3, D(%xmm2) + pslldq $3, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1473,7 +1430,7 @@ LABEL(loop_ashr_13_use): LABEL(nibble_ashr_13_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), D(%xmm0) + palignr $13, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1492,7 +1449,7 @@ LABEL(nibble_ashr_13_restart_use): jg LABEL(nibble_ashr_13_use) movdqa (%rdi, %rdx), %xmm0 - palignr $13, -16(%rdi, %rdx), D(%xmm0) + palignr $13, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1512,7 +1469,7 @@ LABEL(nibble_ashr_13_restart_use): LABEL(nibble_ashr_13_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $13, D(%xmm0) + psrldq $13, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1530,10 +1487,10 @@ LABEL(nibble_ashr_13_use): */ .p2align 4 LABEL(ashr_14): - pslldq $2, D(%xmm2) + pslldq $2, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb 
%xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1563,7 +1520,7 @@ LABEL(loop_ashr_14_use): LABEL(nibble_ashr_14_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), D(%xmm0) + palignr $14, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1582,7 +1539,7 @@ LABEL(nibble_ashr_14_restart_use): jg LABEL(nibble_ashr_14_use) movdqa (%rdi, %rdx), %xmm0 - palignr $14, -16(%rdi, %rdx), D(%xmm0) + palignr $14, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1602,7 +1559,7 @@ LABEL(nibble_ashr_14_restart_use): LABEL(nibble_ashr_14_use): sub $0x1000, %r10 movdqa -16(%rdi, %rdx), %xmm0 - psrldq $14, D(%xmm0) + psrldq $14, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx @@ -1620,10 +1577,10 @@ LABEL(nibble_ashr_14_use): */ .p2align 4 LABEL(ashr_15): - pslldq $1, D(%xmm2) + pslldq $1, %xmm2 TOLOWER (%xmm1, %xmm2) - pcmpeqb %xmm1, D(%xmm2) - psubb %xmm0, D(%xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 pmovmskb %xmm2, %r9d shr %cl, %edx shr %cl, %r9d @@ -1655,7 +1612,7 @@ LABEL(loop_ashr_15_use): LABEL(nibble_ashr_15_restart_use): movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), D(%xmm0) + palignr $15, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1674,7 +1631,7 @@ LABEL(nibble_ashr_15_restart_use): jg LABEL(nibble_ashr_15_use) movdqa (%rdi, %rdx), %xmm0 - palignr $15, -16(%rdi, %rdx), D(%xmm0) + palignr $15, -16(%rdi, %rdx), %xmm0 #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L pcmpistri $0x1a, (%rsi,%rdx), %xmm0 #else @@ -1694,7 +1651,7 @@ LABEL(nibble_ashr_15_restart_use): LABEL(nibble_ashr_15_use): sub $0x1000, %r10 movdqa 
-16(%rdi, %rdx), %xmm0 - psrldq $15, D(%xmm0) + psrldq $15, %xmm0 pcmpistri $0x3a,%xmm0, %xmm0 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L cmp %r11, %rcx diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 68cb73baa..08638e263 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -53,6 +54,10 @@ IFUNC_SELECTOR (void) return OPTIMIZE (avx2); } + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 013aebf79..c312fab8b 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a) RETURN (NULL, strlen (s)); const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (unsigned int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. 
*/ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); } - offset = (int) ((size_t) s & 15); + aligned = s; + offset = (unsigned int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); value = __m128i_shift_right (value, offset); - int length = _mm_cmpistri (mask, value, 0x2); + unsigned int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. 
*/ - int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); if (cflag) RETURN ((char *) (s + length), length); /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); + unsigned int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) RETURN (NULL, index); aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); + unsigned int index = _mm_cmpistri (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); if (cflag) RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); if (zflag) diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c similarity index 86% rename from sysdeps/x86_64/multiarch/strspn-sse2.S rename to sysdeps/x86_64/multiarch/strcspn-sse2.c index e0a095f25..3a04bb39f 100644 --- a/sysdeps/x86_64/multiarch/strspn-sse2.S +++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c @@ -1,4 +1,4 @@ -/* strspn optimized with SSE2. +/* strcspn. Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -19,10 +19,10 @@ #if IS_IN (libc) # include -# define strspn __strspn_sse2 +# define STRCSPN __strcspn_sse2 # undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strspn) +# define libc_hidden_builtin_def(STRCSPN) #endif -#include +#include diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S new file mode 100644 index 000000000..278c89969 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S @@ -0,0 +1,302 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# ifdef USE_AS_WCSLEN +# define VPCMP vpcmpd +# define VPTESTN vptestnmd +# define VPMINU vpminud +# define CHAR_SIZE 4 +# else +# define VPCMP vpcmpb +# define VPTESTN vptestnmb +# define VPMINU vpminub +# define CHAR_SIZE 1 +# endif + +# define XMM0 xmm16 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# if VEC_SIZE == 64 +# define KMOV kmovq +# define KORTEST kortestq +# define RAX rax +# define RCX rcx +# define RDX rdx +# define SHR shrq +# define TEXTSUFFIX evex512 +# define VMM0 zmm16 +# define VMM1 zmm17 +# define VMM2 zmm18 +# define VMM3 zmm19 +# define VMM4 zmm20 +# define VMOVA vmovdqa64 +# elif VEC_SIZE == 32 +/* Currently Unused. */ +# define KMOV kmovd +# define KORTEST kortestd +# define RAX eax +# define RCX ecx +# define RDX edx +# define SHR shrl +# define TEXTSUFFIX evex256 +# define VMM0 ymm16 +# define VMM1 ymm17 +# define VMM2 ymm18 +# define VMM3 ymm19 +# define VMM4 ymm20 +# define VMOVA vmovdqa32 +# endif + + .section .text.TEXTSUFFIX, "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (STRLEN, 6) +# ifdef USE_AS_STRNLEN + /* Check zero length. 
*/ + test %RSI_LP, %RSI_LP + jz L(ret_max) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif +# endif + + movl %edi, %eax + vpxorq %XMM0, %XMM0, %XMM0 + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. */ + VPCMP $0, (%rdi), %VMM0, %k0 + KMOV %k0, %RAX + test %RAX, %RAX + jz L(align_more) + + bsf %RAX, %RAX +# ifdef USE_AS_STRNLEN + cmpq %rsi, %rax + cmovnb %rsi, %rax +# endif + ret + + /* At this point vector max length reached. */ +# ifdef USE_AS_STRNLEN + .p2align 4,,3 +L(ret_max): + movq %rsi, %rax + ret +# endif + +L(align_more): + leaq VEC_SIZE(%rdi), %rax + /* Align rax to VEC_SIZE. */ + andq $-VEC_SIZE, %rax +# ifdef USE_AS_STRNLEN + movq %rax, %rdx + subq %rdi, %rdx +# ifdef USE_AS_WCSLEN + SHR $2, %RDX +# endif + /* At this point rdx contains [w]chars already compared. */ + subq %rsi, %rdx + jae L(ret_max) + negq %rdx + /* At this point rdx contains number of w[char] needs to go. + Now onwards rdx will keep decrementing with each compare. */ +# endif + + /* Loop unroll 4 times for 4 vector loop. */ + VPCMP $0, (%rax), %VMM0, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(ret_vec_x1) + +# ifdef USE_AS_STRNLEN + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) +# endif + + VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(ret_vec_x2) + +# ifdef USE_AS_STRNLEN + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) +# endif + + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(ret_vec_x3) + +# ifdef USE_AS_STRNLEN + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) +# endif + + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(ret_vec_x4) + +# ifdef USE_AS_STRNLEN + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) + /* Save pointer before 4 x VEC_SIZE alignment. */ + movq %rax, %rcx +# endif + + /* Align address to VEC_SIZE * 4 for loop. 
*/ + andq $-(VEC_SIZE * 4), %rax + +# ifdef USE_AS_STRNLEN + subq %rax, %rcx +# ifdef USE_AS_WCSLEN + SHR $2, %RCX +# endif + /* rcx contains number of [w]char will be recompared due to + alignment fixes. rdx must be incremented by rcx to offset + alignment adjustment. */ + addq %rcx, %rdx + /* Need jump as we don't want to add/subtract rdx for first + iteration of 4 x VEC_SIZE aligned loop. */ + jmp L(loop_entry) +# endif + + .p2align 4,,11 +L(loop): +# ifdef USE_AS_STRNLEN + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(ret_max) +L(loop_entry): +# endif + /* VPMINU and VPCMP combination provide better performance as + compared to alternative combinations. */ + VMOVA (VEC_SIZE * 4)(%rax), %VMM1 + VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2 + VMOVA (VEC_SIZE * 6)(%rax), %VMM3 + VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4 + + VPTESTN %VMM2, %VMM2, %k0 + VPTESTN %VMM4, %VMM4, %k1 + + subq $-(VEC_SIZE * 4), %rax + KORTEST %k0, %k1 + jz L(loop) + + VPTESTN %VMM1, %VMM1, %k2 + KMOV %k2, %RCX + test %RCX, %RCX + jnz L(ret_vec_x1) + + KMOV %k0, %RCX + /* At this point, if k0 is non zero, null char must be in the + second vector. */ + test %RCX, %RCX + jnz L(ret_vec_x2) + + VPTESTN %VMM3, %VMM3, %k3 + KMOV %k3, %RCX + test %RCX, %RCX + jnz L(ret_vec_x3) + /* At this point null [w]char must be in the fourth vector so no + need to check. */ + KMOV %k1, %RCX + + /* Fourth, third, second vector terminating are pretty much + same, implemented this way to avoid branching and reuse code + from pre loop exit condition. 
*/ +L(ret_vec_x4): + bsf %RCX, %RCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + subq $-(VEC_SIZE * 3), %rax + shrq $2, %rax + addq %rcx, %rax +# else + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax +# endif +# ifdef USE_AS_STRNLEN + cmpq %rsi, %rax + cmovnb %rsi, %rax +# endif + ret + +L(ret_vec_x3): + bsf %RCX, %RCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + subq $-(VEC_SIZE * 2), %rax + shrq $2, %rax + addq %rcx, %rax +# else + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax +# endif +# ifdef USE_AS_STRNLEN + cmpq %rsi, %rax + cmovnb %rsi, %rax +# endif + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rax +L(ret_vec_x1): + bsf %RCX, %RCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + addq %rcx, %rax +# ifdef USE_AS_STRNLEN + cmpq %rsi, %rax + cmovnb %rsi, %rax +# endif + ret + +L(page_cross): + movl %eax, %ecx +# ifdef USE_AS_WCSLEN + andl $(VEC_SIZE - 1), %ecx + sarl $2, %ecx +# endif + /* ecx contains number of w[char] to be skipped as a result + of address alignment. */ + xorq %rdi, %rax + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0 + KMOV %k0, %RAX + /* Ignore number of character for alignment adjustment. 
*/ + SHR %cl, %RAX + jz L(align_more) + + bsf %RAX, %RAX +# ifdef USE_AS_STRNLEN + cmpq %rsi, %rax + cmovnb %rsi, %rax +# endif + ret + +END (STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S new file mode 100644 index 000000000..116f8981c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S @@ -0,0 +1,7 @@ +#ifndef STRLEN +# define STRLEN __strlen_evex512 +#endif + +#define VEC_SIZE 64 + +#include "strlen-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S index 42b6124df..874123d60 100644 --- a/sysdeps/x86_64/multiarch/strlen-vec.S +++ b/sysdeps/x86_64/multiarch/strlen-vec.S @@ -28,6 +28,10 @@ # define SHIFT_RETURN #endif +#ifndef SECTION +# define SECTION(p) p +#endif + /* Long lived register in strlen(s), strnlen(s, n) are: %xmm3 - zero @@ -37,7 +41,7 @@ */ -.text + .section SECTION(.text),"ax",@progbits ENTRY(strlen) /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S new file mode 100644 index 000000000..58c05dcfb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S @@ -0,0 +1,16 @@ +#ifndef STRCMP +# define STRCMP __strncasecmp_l_avx2_rtm +#endif + +#define _GLABEL(x) x ## _rtm +#define GLABEL(x) _GLABEL(x) + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm +#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm + +#include "strncase_l-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S new file mode 100644 index 000000000..48c0aa21f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S @@ -0,0 +1,27 @@ +/* strncasecmp_l optimized with AVX2. + Copyright (C) 2017-2022 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef STRCMP +# define STRCMP __strncasecmp_l_avx2 +#endif +#define USE_AS_STRCASECMP_L +#define USE_AS_STRNCMP +#ifndef OVERFLOW_STRCMP +# define OVERFLOW_STRCMP __strcasecmp_l_avx2 +#endif +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S new file mode 100644 index 000000000..8a5af3695 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S @@ -0,0 +1,25 @@ +/* strncasecmp_l optimized with EVEX. + Copyright (C) 2017-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#ifndef STRCMP +# define STRCMP __strncasecmp_l_evex +#endif +#define OVERFLOW_STRCMP __strcasecmp_l_evex +#define USE_AS_STRCASECMP_L +#define USE_AS_STRNCMP +#include "strcmp-evex.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S index 37d1224bb..68bad365b 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2_rtm #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S index 1678bcc23..f138e9f1f 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2 #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2 #include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S index 9773e5fc0..310a6dbe7 100644 --- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S +++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S @@ -16,6 +16,8 @@ License along with the GNU C Library; if not, see . 
*/ -#define STRCMP_SSE42 __strncmp_sse42 -#define USE_AS_STRNCMP -#include "strcmp-sse42.S" +#if IS_IN (libc) +# define STRCMP_SSE42 __strncmp_sse42 +# define USE_AS_STRNCMP +# include "strcmp-sse42.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S new file mode 100644 index 000000000..0b7f22021 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_evex512 +#define USE_AS_STRNLEN 1 + +#include "strlen-evex512.S" diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c similarity index 85% rename from sysdeps/x86_64/multiarch/strcspn-sse2.S rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c index f97e856e1..d03214c4f 100644 --- a/sysdeps/x86_64/multiarch/strcspn-sse2.S +++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c @@ -1,4 +1,4 @@ -/* strcspn optimized with SSE2. +/* strpbrk. Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -19,10 +19,10 @@ #if IS_IN (libc) # include -# define strcspn __strcspn_sse2 +# define STRPBRK __strpbrk_sse2 # undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcspn) +# define libc_hidden_builtin_def(STRPBRK) #endif -#include +#include diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 1df2adfad..bd26ba80d 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -27,9 +27,13 @@ # ifdef USE_AS_WCSRCHR # define VPBROADCAST vpbroadcastd # define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,196 +45,304 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 - .section SECTION(.text),"ax",@progbits -ENTRY (STRRCHR) - movd %esi, %xmm4 - movl %edi, %ecx + .section SECTION(.text), "ax", @progbits +ENTRY(STRRCHR) + movd %esi, %xmm7 + movl %edi, %eax /* Broadcast CHAR to YMM4. */ - VPBROADCAST %xmm4, %ymm4 + VPBROADCAST %xmm7, %ymm7 vpxor %xmm0, %xmm0, %xmm0 - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + /* Shift here instead of `andl` to save code size (saves a fetch + block). */ + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(cross_page) +L(page_cross_continue): vmovdqu (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - addq $VEC_SIZE, %rdi + /* Check end of string match. */ + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + /* Only check match with search CHAR if needed. */ + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + /* Check if match before first zero. 
*/ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax + addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret0): +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ + .p2align 4,, 10 +L(first_vec_x1): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + andl %ecx, %eax + jnz L(first_vec_x1_return) + + .p2align 4,, 4 +L(first_vec_x0_test): + VPCMPEQ %ymm1, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + testl %eax, %eax + jz L(ret1) + bsrl %eax, %eax + addq %r8, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret1): + VZEROUPPER_RETURN + .p2align 4,, 10 +L(first_vec_x0_x1_test): + VPCMPEQ %ymm2, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + /* Check ymm2 for search CHAR match. If no match then check ymm1 + before returning. */ testl %eax, %eax - jnz L(first_vec) + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq 1(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN - testl %ecx, %ecx - jnz L(return_null) - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) + .p2align 4,, 10 +L(first_vec_x2): + VPCMPEQ %ymm3, %ymm7, %ymm6 + vpmovmskb %ymm6, %eax + blsmskl %ecx, %ecx + /* If no in-range search CHAR match in ymm3 then need to check + ymm1/ymm2 for an earlier match (we delay checking search + CHAR matches until needed). */ + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE + 1)(%rdi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4 -L(first_vec): - /* Check if there is a nul CHAR. */ +L(aligned_more): + /* Save original pointer if match was in VEC 0. 
*/ + movq %rdi, %r8 + + /* Align src. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqu 1(%rdi), %ymm2 + VPCMPEQ %ymm2, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) + jnz L(first_vec_x1) - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) + vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 + VPCMPEQ %ymm3, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + /* Save pointer again before realigning. */ + movq %rdi, %rsi + addq $(VEC_SIZE + 1), %rdi + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %edx - vpmovmskb %ymm3, %eax - shrl %cl, %edx - shrl %cl, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ +L(first_aligned_loop): + /* Do 2x VEC at a time. Any more and the cost of finding the + match outweighs loop benefit. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm8 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm8, %ymm0, %ymm8 + vpor %ymm5, %ymm8, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi + /* No zero or search CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) + jz L(first_aligned_loop) - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) + /* If no zero CHAR then go to second loop (this allows us to + throw away all prior work). */ + vpmovmskb %ymm8, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_prep) - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + /* Search char could be zero so we need to get the true match.
+ */ + vpmovmskb %ymm5, %eax + testl %eax, %eax + jnz L(first_aligned_loop_return) - .p2align 4 -L(aligned_loop): - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - add $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx + .p2align 4,, 4 +L(first_vec_x1_or_x2): + VPCMPEQ %ymm3, %ymm7, %ymm3 + VPCMPEQ %ymm2, %ymm7, %ymm2 vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jnz L(char_nor_null) - - vmovdqa (%rdi), %ymm1 - VPCMPEQ %ymm1, %ymm0, %ymm2 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm1, %ymm4, %ymm3 - vpmovmskb %ymm2, %ecx - vpmovmskb %ymm3, %eax - orl %eax, %ecx - jz L(aligned_loop) - - .p2align 4 -L(char_nor_null): - /* Find a CHAR or a nul CHAR in a loop. */ - testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + vpmovmskb %ymm2, %edx + /* Use add for macro-fusion. */ + addq %rax, %rdx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through. The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. 
*/ + salq $32, %rax + addq %rdx, %rax + bsrq %rax, %rax + leaq 1(%rsi, %rax), %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif + VZEROUPPER_RETURN + .p2align 4,, 8 +L(first_aligned_loop_return): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(first_vec_x1_or_x2) + + bsrq %rax, %rax + leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax # ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %eax + andq $-CHAR_SIZE, %rax # endif - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN + /* Search char cannot be zero. */ .p2align 4 -L(match): - /* Find a CHAR. Check if there is a nul CHAR. */ - vpmovmskb %ymm2, %ecx - testl %ecx, %ecx - jnz L(find_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx +L(second_aligned_loop_set_furthest_match): + /* Save VEC and pointer from most recent match. */ +L(second_aligned_loop_prep): movq %rdi, %rsi - jmp L(aligned_loop) + vmovdqu %ymm6, %ymm2 + vmovdqu %ymm10, %ymm3 .p2align 4 -L(find_nul): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax +L(second_aligned_loop): + /* Search 2x at a time. */ + vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 + vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 + + VPCMPEQ %ymm4, %ymm7, %ymm6 + VPMIN %ymm4, %ymm5, %ymm1 + VPCMPEQ %ymm5, %ymm7, %ymm10 + vpor %ymm6, %ymm10, %ymm5 + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpor %ymm5, %ymm1, %ymm9 + + vpmovmskb %ymm9, %eax + addq $(VEC_SIZE * 2), %rdi testl %eax, %eax - /* If there is no CHAR here, return the remembered one.
*/ - jz L(return_value) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER_RETURN - - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a nul CHAR. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): -# ifdef USE_AS_WCSRCHR - /* Keep the first bit for each matching CHAR for bsr. */ - andl $0x11111111, %ecx - andl $0x11111111, %eax -# endif - /* Mask out any matching bits after the nul CHAR. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax + jz L(second_aligned_loop) + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jz L(second_aligned_loop_set_furthest_match) + vpmovmskb %ymm5, %eax testl %eax, %eax - /* Return null pointer if the nul CHAR comes first. */ - jz L(return_null) - bsrl %eax, %eax - leaq -VEC_SIZE(%rdi, %rax), %rax + jnz L(return_new_match) + + /* This is the hot patch. We know CHAR is inbounds and that + ymm3/ymm2 have latest match. */ + .p2align 4,, 4 +L(return_old_match): + vpmovmskb %ymm3, %eax + vpmovmskb %ymm2, %edx + salq $32, %rax + orq %rdx, %rax + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax VZEROUPPER_RETURN - .p2align 4 -L(return_null): - xorl %eax, %eax + /* Last iteration also potentially has a match. */ + .p2align 4,, 8 +L(return_new_match): + VPCMPEQ %ymm4, %ymm0, %ymm4 + vpmovmskb %ymm4, %edx + salq $32, %rcx + orq %rdx, %rcx + + vpmovmskb %ymm10, %eax + vpmovmskb %ymm6, %edx + salq $32, %rax + orq %rdx, %rax + blsmskq %rcx, %rcx + andq %rcx, %rax + jz L(return_old_match) + bsrq %rax, %rax + /* Search char cannot be zero so safe to just use lea for + wcsrchr. */ + leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax VZEROUPPER_RETURN -END (STRRCHR) + .p2align 4,, 4 +L(cross_page): + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ %ymm1, %ymm0, %ymm6 + vpmovmskb %ymm6, %ecx + /* Shift out zero CHAR matches that are before the begining of + src (rdi). 
*/ + shrxl %edi, %ecx, %ecx + testl %ecx, %ecx + jz L(page_cross_continue) + VPCMPEQ %ymm1, %ymm7, %ymm1 + vpmovmskb %ymm1, %eax + + /* Shift out search CHAR matches that are before the begining of + src (rdi). */ + shrxl %edi, %eax, %eax + blsmskl %ecx, %ecx + /* Check if any search CHAR match in range. */ + andl %ecx, %eax + jz L(ret2) + bsrl %eax, %eax + addq %rdi, %rax +# ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +# endif +L(ret2): + VZEROUPPER_RETURN +END(STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S index adeddaed3..8014c285b 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -24,242 +24,351 @@ # define STRRCHR __strrchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSRCHR +# define SHIFT_REG esi + +# define kunpck kunpckbw +# define kmov_2x kmovd +# define maskz_2x ecx +# define maskm_2x eax +# define CHAR_SIZE 4 +# define VPMIN vpminud +# define VPTESTN vptestnmd # define VPBROADCAST vpbroadcastd -# define VPCMP vpcmpd -# define SHIFT_REG r8d +# define VPCMP vpcmpd # else +# define SHIFT_REG edi + +# define kunpck kunpckdq +# define kmov_2x kmovq +# define maskz_2x rcx +# define maskm_2x rax + +# define CHAR_SIZE 1 +# define VPMIN vpminub +# define VPTESTN vptestnmb # define VPBROADCAST vpbroadcastb -# define VPCMP vpcmpb -# define SHIFT_REG ecx +# define VPCMP vpcmpb # endif # define XMMZERO xmm16 # define YMMZERO ymm16 # define YMMMATCH ymm17 -# define YMM1 ymm18 +# define YMMSAVE ymm18 + +# define YMM1 ymm19 +# define YMM2 ymm20 +# define YMM3 ymm21 +# define YMM4 ymm22 +# define YMM5 ymm23 +# define YMM6 ymm24 +# define YMM7 ymm25 +# define YMM8 ymm26 -# define VEC_SIZE 32 - .section .text.evex,"ax",@progbits -ENTRY (STRRCHR) - movl %edi, %ecx +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section .text.evex, "ax", @progbits +ENTRY(STRRCHR) + movl 
%edi, %eax /* Broadcast CHAR to YMMMATCH. */ VPBROADCAST %esi, %YMMMATCH - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - - /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_boundary) +L(page_cross_continue): VMOVU (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + /* k0 has a 1 for each zero CHAR in YMM1. */ + VPTESTN %YMM1, %YMM1, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - - addq $VEC_SIZE, %rdi - - testl %eax, %eax - jnz L(first_vec) - testl %ecx, %ecx - jnz L(return_null) - - andq $-VEC_SIZE, %rdi - xorl %edx, %edx - jmp L(aligned_loop) - - .p2align 4 -L(first_vec): - /* Check if there is a null byte. */ - testl %ecx, %ecx - jnz L(char_and_nul_in_first_vec) - - /* Remember the match and keep searching. */ - movl %eax, %edx - movq %rdi, %rsi - andq $-VEC_SIZE, %rdi - jmp L(aligned_loop) - - .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + jz L(aligned_more) + /* fallthrough: zero CHAR in first VEC. */ + /* K1 has a 1 for each search CHAR match in YMM1. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + /* Build mask up until first zero CHAR (used to mask of + potential search CHAR matches past the end of the string). + */ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret0) + /* Get last match (the `andl` removed any out of bounds + matches). */ + bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Divide shift count by 4 since each bit in K1 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif +L(ret0): + ret - VMOVA (%rdi), %YMM1 - - /* Each bit in K0 represents a null byte in YMM1. 
*/ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ + /* Returns for first vec x1/x2/x3 have hard coded backward + search path for earlier matches. */ + .p2align 4,, 6 +L(first_vec_x1): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + blsmskl %ecx, %ecx + /* eax non-zero if search CHAR in range. */ + andl %ecx, %eax + jnz L(first_vec_x1_return) + + /* fallthrough: no match in YMM2 then need to check for earlier + matches (in YMM1). */ + .p2align 4,, 4 +L(first_vec_x0_test): VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %edx kmovd %k1, %eax - - shrxl %SHIFT_REG, %edx, %edx - shrxl %SHIFT_REG, %eax, %eax - addq $VEC_SIZE, %rdi - - /* Check if there is a CHAR. */ testl %eax, %eax - jnz L(found_char) - - testl %edx, %edx - jnz L(return_null) - - jmp L(aligned_loop) - - .p2align 4 -L(found_char): - testl %edx, %edx - jnz L(char_and_nul) - - /* Remember the match and keep searching. */ - movl %eax, %edx - leaq (%rdi, %rcx), %rsi + jz L(ret1) + bsrl %eax, %eax +# ifdef USE_AS_WCSRCHR + leaq (%rsi, %rax, CHAR_SIZE), %rax +# else + addq %rsi, %rax +# endif +L(ret1): + ret - .p2align 4 -L(aligned_loop): - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + .p2align 4,, 10 +L(first_vec_x1_or_x2): + VPCMP $0, %YMM3, %YMMMATCH, %k3 + VPCMP $0, %YMM2, %YMMMATCH, %k2 + /* K2 and K3 have 1 for any search CHAR match. Test if any + matches between either of them. Otherwise check YMM1. */ + kortestd %k2, %k3 + jz L(first_vec_x0_test) + + /* Guaranteed that YMM2 and YMM3 are within range so merge the + two bitmasks then get last result. */ + kunpck %k2, %k3, %k3 + kmovq %k3, %rax + bsrq %rax, %rax + leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. 
*/ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 6 +L(first_vec_x3): + VPCMP $0, %YMMMATCH, %YMM4, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ + andl %ecx, %eax + jz L(first_vec_x1_or_x2) + bsrl %eax, %eax + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - add $VEC_SIZE, %rdi + .p2align 4,, 6 +L(first_vec_x0_x1_test): + VPCMP $0, %YMMMATCH, %YMM2, %k1 + kmovd %k1, %eax + /* Check YMM2 for last match first. If no match try YMM1. */ + testl %eax, %eax + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 - kmovd %k0, %ecx + .p2align 4,, 10 +L(first_vec_x2): + VPCMP $0, %YMMMATCH, %YMM3, %k1 kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + blsmskl %ecx, %ecx + /* Check YMM3 for last match first. If no match try YMM2/YMM1. + */ + andl %ecx, %eax + jz L(first_vec_x0_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + .p2align 4 +L(aligned_more): + /* Need to keep original pointer incase YMM1 has last match. 
*/ + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + VMOVU VEC_SIZE(%rdi), %YMM2 + VPTESTN %YMM2, %YMM2, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jnz L(char_nor_null) + testl %ecx, %ecx + jnz L(first_vec_x1) - VMOVA (%rdi), %YMM1 - addq $VEC_SIZE, %rdi + VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 + VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) - /* Each bit in K0 represents a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - /* Each bit in K1 represents a CHAR in YMM1. */ - VPCMP $0, %YMMMATCH, %YMM1, %k1 + VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 + VPTESTN %YMM4, %YMM4, %k0 kmovd %k0, %ecx - kmovd %k1, %eax - orl %eax, %ecx - jz L(aligned_loop) + movq %rdi, %r8 + testl %ecx, %ecx + jnz L(first_vec_x3) + andq $-(VEC_SIZE * 2), %rdi .p2align 4 -L(char_nor_null): - /* Find a CHAR or a null byte in a loop. */ +L(first_aligned_loop): + /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee + they don't store a match. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 + + VPCMP $0, %YMM5, %YMMMATCH, %k2 + vpxord %YMM6, %YMMMATCH, %YMM7 + + VPMIN %YMM5, %YMM6, %YMM8 + VPMIN %YMM8, %YMM7, %YMM7 + + VPTESTN %YMM7, %YMM7, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(first_aligned_loop) + + VPCMP $0, %YMM6, %YMMMATCH, %k3 + VPTESTN %YMM8, %YMM8, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_prep) + + kortestd %k2, %k3 + jnz L(return_first_aligned_loop) + + .p2align 4,, 6 +L(first_vec_x1_or_x2_or_x3): + VPCMP $0, %YMM4, %YMMMATCH, %k4 + kmovd %k4, %eax testl %eax, %eax - jnz L(match) -L(return_value): - testl %edx, %edx - jz L(return_null) - movl %edx, %eax - movq %rsi, %rdi + jz L(first_vec_x1_or_x2) bsrl %eax, %eax -# ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq -VEC_SIZE(%rdi, %rax), %rax -# endif + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax ret - .p2align 4 -L(match): - /* Find a CHAR. Check if there is a null byte. */ - kmovd %k0, %ecx - testl %ecx, %ecx - jnz L(find_nul) + .p2align 4,, 8 +L(return_first_aligned_loop): + VPTESTN %YMM5, %YMM5, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(first_vec_x1_or_x2_or_x3) - /* Remember the match and keep searching. */ - movl %eax, %edx + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + /* We can throw away the work done for the first 4x checks here + as we have a later match. This is the 'fast' path persay. + */ +L(second_aligned_loop_prep): +L(second_aligned_loop_set_furthest_match): movq %rdi, %rsi - jmp L(aligned_loop) + kunpck %k2, %k3, %k4 .p2align 4 -L(find_nul): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* If there is no CHAR here, return the remembered one. */ - jz L(return_value) - bsrl %eax, %eax +L(second_aligned_loop): + VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 + + VPCMP $0, %YMM1, %YMMMATCH, %k2 + vpxord %YMM2, %YMMMATCH, %YMM3 + + VPMIN %YMM1, %YMM2, %YMM4 + VPMIN %YMM3, %YMM4, %YMM3 + + VPTESTN %YMM3, %YMM3, %k1 + subq $(VEC_SIZE * -2), %rdi + kortestd %k1, %k2 + jz L(second_aligned_loop) + + VPCMP $0, %YMM2, %YMMMATCH, %k3 + VPTESTN %YMM4, %YMM4, %k1 + ktestd %k1, %k1 + jz L(second_aligned_loop_set_furthest_match) + + kortestd %k2, %k3 + /* branch here because there is a significant advantage interms + of output dependency chance in using edx. 
*/ + jnz L(return_new_match) +L(return_old_match): + kmovq %k4, %rax + bsrq %rax, %rax + leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax + ret + +L(return_new_match): + VPTESTN %YMM1, %YMM1, %k0 + kunpck %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(return_old_match) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +L(cross_page_boundary): + /* eax contains all the page offset bits of src (rdi). `xor rdi, + rax` sets pointer with all page offset bits cleared so + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC + before page cross (guaranteed to be safe to read). Doing this + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves + a bit of code size. */ + xorq %rdi, %rax + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 + VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx + + /* Shift out zero CHAR matches that are before the beginning of + src (rdi). */ # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq -VEC_SIZE(%rdi, %rax), %rax + movl %edi, %esi + andl $(VEC_SIZE - 1), %esi + shrl $2, %esi # endif - ret + shrxl %SHIFT_REG, %ecx, %ecx - .p2align 4 -L(char_and_nul): - /* Find both a CHAR and a null byte. */ - addq %rcx, %rdi - movl %edx, %ecx -L(char_and_nul_in_first_vec): - /* Mask out any matching bits after the null byte. */ - movl %ecx, %r8d - subl $1, %r8d - xorl %ecx, %r8d - andl %r8d, %eax - testl %eax, %eax - /* Return null pointer if the null byte comes first. */ - jz L(return_null) + testl %ecx, %ecx + jz L(page_cross_continue) + + /* Found zero CHAR so need to test for search CHAR. */ + VPCMP $0, %YMMMATCH, %YMM1, %k1 + kmovd %k1, %eax + /* Shift out search CHAR matches that are before the beginning of + src (rdi). */ + shrxl %SHIFT_REG, %eax, %eax + + /* Check if any search CHAR match in range. 
*/ + blsmskl %ecx, %ecx + andl %ecx, %eax + jz L(ret3) bsrl %eax, %eax # ifdef USE_AS_WCSRCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq -VEC_SIZE(%rdi, %rax, 4), %rax + leaq (%rdi, %rax, CHAR_SIZE), %rax # else - leaq -VEC_SIZE(%rdi, %rax), %rax + addq %rdi, %rax # endif +L(ret3): ret - .p2align 4 -L(return_null): - xorl %eax, %eax - ret - -END (STRRCHR) +END(STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S index db1b44c23..866396e94 100644 --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S @@ -17,7 +17,7 @@ . */ #if IS_IN (libc) -# define strrchr __strrchr_sse2 +# define STRRCHR __strrchr_sse2 # undef weak_alias # define weak_alias(strrchr, rindex) diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c index 8fb3aba64..6124033ce 100644 --- a/sysdeps/x86_64/multiarch/strspn-c.c +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a) return 0; const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return __strspn_sse2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. 
We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return __strspn_sse2 (s, a); - } + /* A is aligned. */ + mask = _mm_loadu_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); } + aligned = s; + offset = (unsigned int) ((size_t) s & 15); - offset = (int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); + __m128i adj_value = __m128i_shift_right (value, offset); - value = __m128i_shift_right (value, offset); - - int length = _mm_cmpistri (mask, value, 0x12); + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); /* No need to check CFlag since it is always 1. */ if (length < 16 - offset) return length; /* Find where the NULL terminator is. 
*/ - int index = _mm_cmpistri (value, value, 0x3a); - if (index < 16 - offset) + maskz = _mm_cmpeq_epi8 (value, zero); + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) return length; aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x12); - int cflag = _mm_cmpistrc (mask, value, 0x12); + unsigned int index = _mm_cmpistri (mask, value, 0x12); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); if (cflag) return (size_t) (aligned + index - s); aligned += 16; diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c similarity index 84% rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S rename to sysdeps/x86_64/multiarch/strspn-sse2.c index d537b6c27..61cc6cb0a 100644 --- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S +++ b/sysdeps/x86_64/multiarch/strspn-sse2.c @@ -1,4 +1,4 @@ -/* strpbrk optimized with SSE2. +/* strspn. Copyright (C) 2017-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -19,11 +19,10 @@ #if IS_IN (libc) # include -# define strcspn __strpbrk_sse2 +# define STRSPN __strspn_sse2 # undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strpbrk) +# define libc_hidden_builtin_def(STRSPN) #endif -#define USE_AS_STRPBRK -#include +#include diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c new file mode 100644 index 000000000..e44c1a05d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c @@ -0,0 +1,218 @@ +/* strstr optimized with 512-bit AVX-512 instructions + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include + +#define FULL_MMASK64 0xffffffffffffffff +#define ONE_64BIT 0x1ull +#define ZMM_SIZE_IN_BYTES 64 +#define PAGESIZE 4096 + +#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__) +#define kshiftri_mask64(x, y) ((x) >> (y)) +#define kand_mask64(x, y) ((x) & (y)) + +/* + Returns the index of the first edge within the needle, returns 0 if no edge + is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' + */ +static inline size_t +find_edge_in_needle (const char *ned) +{ + size_t ind = 0; + while (ned[ind + 1] != '\0') + { + if (ned[ind] != ned[ind + 1]) + return ind; + else + ind = ind + 1; + } + return 0; +} + +/* + Compare needle with haystack byte by byte at specified location + */ +static inline bool +verify_string_match (const char *hay, const size_t hay_index, const char *ned, + size_t ind) +{ + while (ned[ind] != '\0') + { + if (ned[ind] != hay[hay_index + ind]) + return false; + ind = ind + 1; + } + return true; +} + +/* + Compare needle with haystack at specified location. The first 64 bytes are + compared using a ZMM register. 
+ */ +static inline bool +verify_string_match_avx512 (const char *hay, const size_t hay_index, + const char *ned, const __mmask64 ned_mask, + const __m512i ned_zmm) +{ + /* check first 64 bytes using zmm and then scalar */ + __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so + __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); + if (match != 0x0) // failed the first few chars + return false; + else if (ned_mask == FULL_MMASK64) + return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); + return true; +} + +char * +__strstr_avx512 (const char *haystack, const char *ned) +{ + char first = ned[0]; + if (first == '\0') + return (char *)haystack; + if (ned[1] == '\0') + return (char *)strchr (haystack, ned[0]); + + size_t edge = find_edge_in_needle (ned); + + /* ensure haystack is as long as the pos of edge in needle */ + for (int ii = 0; ii < edge; ++ii) + { + if (haystack[ii] == '\0') + return NULL; + } + + /* + Load 64 bytes of the needle and save it to a zmm register + Read one cache line at a time to avoid loading across a page boundary + */ + __mmask64 ned_load_mask = _bzhi_u64 ( + FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); + __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); + __mmask64 ned_nullmask + = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); + + if (__glibc_unlikely (ned_nullmask == 0x0)) + { + ned_zmm = _mm512_loadu_si512 (ned); + ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); + if (ned_nullmask != 0x0) + ned_load_mask = ned_load_mask >> 1; + } + else + { + ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); + ned_load_mask = ned_load_mask >> 1; + } + const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); + const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); + + /* + Read the bytes of haystack in the current cache line + */ + size_t hay_index = edge; + __mmask64 loadmask = _bzhi_u64 ( + 
FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); + /* First load is a partial cache line */ + __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); + /* Search for NULL and compare only till null char */ + uint64_t nullmask + = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); + uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); + cmpmask = cmpmask & cvtmask64_u64 (loadmask); + /* Search for the 2 characters of needle */ + __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); + __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); + k1 = kshiftri_mask64 (k1, 1); + /* k2 masks tell us if both chars from needle match */ + uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; + /* For every match, search for the entire needle for a full match */ + while (k2) + { + uint64_t bitcount = _tzcnt_u64 (k2); + k2 = _blsr_u64 (k2); + size_t match_pos = hay_index + bitcount - edge; + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) + { + /* + * Use vector compare as long as you are not crossing a page + */ + if (verify_string_match_avx512 (haystack, match_pos, ned, + ned_load_mask, ned_zmm)) + return (char *)haystack + match_pos; + } + else + { + if (verify_string_match (haystack, match_pos, ned, 0)) + return (char *)haystack + match_pos; + } + } + /* We haven't checked for potential match at the last char yet */ + haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63)); + hay_index = 0; + + /* + Loop over one cache line at a time to prevent reading over page + boundary + */ + __m512i hay1; + while (nullmask == 0) + { + hay0 = _mm512_loadu_si512 (haystack + hay_index); + hay1 = _mm512_load_si512 (haystack + hay_index + + 1); // Always 64 byte aligned + nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); + /* Compare only till null char */ + cmpmask = nullmask ^ (nullmask - ONE_64BIT); + k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); + k1 = 
_mm512_cmpeq_epi8_mask (hay1, ned1); + /* k2 masks tell us if both chars from needle match */ + k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; + /* For every match, compare full strings for potential match */ + while (k2) + { + uint64_t bitcount = _tzcnt_u64 (k2); + k2 = _blsr_u64 (k2); + size_t match_pos = hay_index + bitcount - edge; + if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) + < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) + { + /* + * Use vector compare as long as you are not crossing a page + */ + if (verify_string_match_avx512 (haystack, match_pos, ned, + ned_load_mask, ned_zmm)) + return (char *)haystack + match_pos; + } + else + { + /* Compare byte by byte */ + if (verify_string_match (haystack, match_pos, ned, 0)) + return (char *)haystack + match_pos; + } + } + hay_index += ZMM_SIZE_IN_BYTES; + } + return NULL; +} diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index 95600a9de..2fb8b169b 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -35,16 +35,32 @@ extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; #include "init-arch.h" /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - ? 
__strstr_sse2_unaligned - : __strstr_sse2) +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return __strstr_avx512; + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return __strstr_sse2_unaligned; + + return __strstr_sse2; +} + +libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); #undef strstr strong_alias (__libc_strstr, strstr) diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c index c8210f054..d27767520 100644 --- a/sysdeps/x86_64/multiarch/varshift.c +++ b/sysdeps/x86_64/multiarch/varshift.c @@ -16,9 +16,10 @@ License along with the GNU C Library; if not, see . */ -#include "varshift.h" +#include -const int8_t ___m128i_shift_right[31] attribute_hidden = +const int8_t ___m128i_shift_right[31] attribute_hidden + __attribute__((aligned(32))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h index af3069448..ffd12d79e 100644 --- a/sysdeps/x86_64/multiarch/varshift.h +++ b/sysdeps/x86_64/multiarch/varshift.h @@ -19,7 +19,8 @@ #include #include -extern const int8_t ___m128i_shift_right[31] attribute_hidden; +extern const int8_t ___m128i_shift_right[31] attribute_hidden + __attribute__ ((aligned (32))); static __inline__ __m128i __m128i_shift_right (__m128i value, unsigned long int offset) diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 000000000..9f3ffeced --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro 
helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. 
*/ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S new file mode 100644 index 000000000..f59c372b7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_evex512 +#define USE_AS_WCSLEN 1 + +#include "strlen-evex512.S" diff --git 
a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S index 7e62621af..e306a77f5 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -1,4 +1,5 @@ #define AS_WCSLEN #define strlen __wcslen_sse4_1 +#define SECTION(p) p##.sse4.1 #include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S index 4e88c70cc..f467582cb 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2_rtm #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S index 4fa1de4d3..e9ede522b 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2 #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2 #include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S new file mode 100644 index 000000000..73dcf2f21 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_evex512 +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index 5fa51fe07..d2f7dd6e2 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -1,5 +1,6 @@ #define AS_WCSLEN #define AS_STRNLEN #define strlen __wcsnlen_sse4_1 +#define SECTION(p) p##.sse4.1 #include "strlen-vec.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S index 78d1ca655..69d2f3cdb 100644 --- 
a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S @@ -17,7 +17,6 @@ . */ #if IS_IN (libc) -# define wcsrchr __wcsrchr_sse2 +# define STRRCHR __wcsrchr_sse2 #endif - #include "../wcsrchr.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c deleted file mode 100644 index 46b6715e1..000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-c.c +++ /dev/null @@ -1,9 +0,0 @@ -#if IS_IN (libc) -# include - -# define WMEMCMP __wmemcmp_sse2 - -extern __typeof (wmemcmp) __wmemcmp_sse2; -#endif - -#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S new file mode 100644 index 000000000..f09192ed7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S @@ -0,0 +1,21 @@ +/* wmemcmp optimized with SSE2. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse2 +#include "../memcmp.S" diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index e2ab59c55..99d8b36f1 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. 
*/ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 END2 (__strcasecmp) # ifndef NO_NOLOCALE_ALIAS weak_alias (__strcasecmp, strcasecmp) @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 END2 (__strncasecmp) # ifndef NO_NOLOCALE_ALIAS weak_alias (__strncasecmp, strncasecmp) @@ -146,22 +144,22 @@ ENTRY (STRCMP) #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -.Lbelowupper: - .quad 0x4040404040404040 - .quad 0x4040404040404040 -.Ltopupper: - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -.Ltouppermask: +.Llcase_min: + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +.Llcase_max: + .quad 0x9999999999999999 + .quad 0x9999999999999999 +.Lcase_add: .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa .Lbelowupper(%rip), %xmm5 -# define UCLOW_reg %xmm5 - movdqa .Ltopupper(%rip), %xmm6 -# define UCHIGH_reg %xmm6 - movdqa .Ltouppermask(%rip), %xmm7 -# define LCQWORD_reg %xmm7 + movdqa .Llcase_min(%rip), %xmm5 +# define LCASE_MIN_reg %xmm5 + movdqa .Llcase_max(%rip), %xmm6 +# define LCASE_MAX_reg %xmm6 + movdqa .Lcase_add(%rip), %xmm7 +# define CASE_ADD_reg %xmm7 #endif cmp $0x30, %ecx ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ @@ -172,22 +170,18 @@ ENTRY (STRCMP) movhpd 8(%rdi), %xmm1 movhpd 8(%rsi), %xmm2 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm8; \ - movdqa UCHIGH_reg, %xmm9; \ - movdqa reg2, %xmm10; \ - movdqa UCHIGH_reg, %xmm11; \ - pcmpgtb UCLOW_reg, %xmm8; \ - pcmpgtb reg1, %xmm9; \ - pcmpgtb UCLOW_reg, %xmm10; \ - pcmpgtb reg2, %xmm11; \ - pand %xmm9, %xmm8; \ - pand %xmm11, %xmm10; \ - pand
LCQWORD_reg, %xmm8; \ - pand LCQWORD_reg, %xmm10; \ - por %xmm8, reg1; \ - por %xmm10, reg2 - TOLOWER (%xmm1, %xmm2) +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm8; \ + movdqa LCASE_MIN_reg, %xmm9; \ + paddb reg1, %xmm8; \ + paddb reg2, %xmm9; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm9; \ + pandn CASE_ADD_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm9; \ + paddb %xmm8, reg1; \ + paddb %xmm9, reg2 + TOLOWER (%xmm1, %xmm2) #else # define TOLOWER(reg1, reg2) #endif diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S deleted file mode 100644 index f3cd86c60..000000000 --- a/sysdeps/x86_64/strcspn.S +++ /dev/null @@ -1,119 +0,0 @@ -/* strcspn (str, ss) -- Return the length of the initial segment of STR - which contains no characters from SS. - For AMD x86-64. - Copyright (C) 1994-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include "asm-syntax.h" - - .text -ENTRY (strcspn) - - movq %rdi, %rdx /* Save SRC. */ - - /* First we create a table with flags for all possible characters. - For the ASCII (7bit/8bit) or ISO-8859-X character sets which are - supported by the C string functions we have 256 characters. - Before inserting marks for the stop characters we clear the whole - table. */ - movq %rdi, %r8 /* Save value. 
*/ - subq $256, %rsp /* Make space for 256 bytes. */ - cfi_adjust_cfa_offset(256) - movl $32, %ecx /* 32*8 bytes = 256 bytes. */ - movq %rsp, %rdi - xorl %eax, %eax /* We store 0s. */ - cld - rep - stosq - - movq %rsi, %rax /* Setup skipset. */ - -/* For understanding the following code remember that %rcx == 0 now. - Although all the following instruction only modify %cl we always - have a correct zero-extended 64-bit value in %rcx. */ - - .p2align 4 -L(2): movb (%rax), %cl /* get byte from skipset */ - testb %cl, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - - movb 1(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - - movb 2(%rax), %cl /* get byte from skipset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - - movb 3(%rax), %cl /* get byte from skipset */ - addq $4, %rax /* increment skipset pointer */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ - testb $0xff, %cl /* is NUL char? */ - jnz L(2) /* no => process next dword from skipset */ - -L(1): leaq -4(%rdx), %rax /* prepare loop */ - - /* We use a neat trick for the following loop. Normally we would - have to test for two termination conditions - 1. a character in the skipset was found - and - 2. the end of the string was found - But as a sign that the character is in the skipset we store its - value in the table. But the value of NUL is NUL so the loop - terminates for NUL in every case. */ - - .p2align 4 -L(3): addq $4, %rax /* adjust pointer for full loop round */ - - movb (%rax), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? 
*/ - je L(4) /* yes => return */ - - movb 1(%rax), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - je L(5) /* yes => return */ - - movb 2(%rax), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jz L(6) /* yes => return */ - - movb 3(%rax), %cl /* get byte from string */ - cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jne L(3) /* no => start loop again */ - - incq %rax /* adjust pointer */ -L(6): incq %rax -L(5): incq %rax - -L(4): addq $256, %rsp /* remove skipset */ - cfi_adjust_cfa_offset(-256) -#ifdef USE_AS_STRPBRK - xorl %edx,%edx - orb %cl, %cl /* was last character NUL? */ - cmovzq %rdx, %rax /* Yes: return NULL */ -#else - subq %rdx, %rax /* we have to return the number of valid - characters, so compute distance to first - non-valid character */ -#endif - ret -END (strcspn) -libc_hidden_builtin_def (strcspn) diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S deleted file mode 100644 index 21888a5b9..000000000 --- a/sysdeps/x86_64/strpbrk.S +++ /dev/null @@ -1,3 +0,0 @@ -#define strcspn strpbrk -#define USE_AS_STRPBRK -#include diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index 50d886713..4d7ba4ceb 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -19,210 +19,360 @@ #include +#ifndef STRRCHR +# define STRRCHR strrchr +#endif + +#ifdef USE_AS_WCSRCHR +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define PMINU pminud +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 +# define PMINU pminub +#endif + +#define PAGE_SIZE 4096 +#define VEC_SIZE 16 + .text -ENTRY (strrchr) - movd %esi, %xmm1 +ENTRY(STRRCHR) + movd %esi, %xmm0 movq %rdi, %rax - andl $4095, %eax - punpcklbw %xmm1, %xmm1 - cmpq $4032, %rax - punpcklwd %xmm1, %xmm1 - pshufd $0, %xmm1, %xmm1 + andl $(PAGE_SIZE - 1), %eax +#ifndef USE_AS_WCSRCHR + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 +#endif + pshufd $0, %xmm0, %xmm0 + cmpl $(PAGE_SIZE - 
VEC_SIZE), %eax ja L(cross_page) - movdqu (%rdi), %xmm0 + +L(cross_page_continue): + movups (%rdi), %xmm1 pxor %xmm2, %xmm2 - movdqa %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %ecx - pmovmskb %xmm3, %edx - testq %rdx, %rdx - je L(next_48_bytes) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rcx, %rax - je L(exit) - bsrq %rax, %rax + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %ecx + testl %ecx, %ecx + jz L(aligned_more) + + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret0) + bsrl %eax, %eax addq %rdi, %rax + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If + search CHAR is zero we are correct. Either way `andq + -CHAR_SIZE, %rax` gets the correct result. */ +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret0): ret + /* Returns for first vec x1/x2 have hard coded backward search + path for earlier matches. */ .p2align 4 -L(next_48_bytes): - movdqu 16(%rdi), %xmm4 - movdqa %xmm4, %xmm5 - movdqu 32(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm2, %xmm5 - movdqu 48(%rdi), %xmm0 - pmovmskb %xmm5, %edx - movdqa %xmm3, %xmm5 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm2, %xmm5 - pcmpeqb %xmm0, %xmm2 - salq $16, %rdx - pmovmskb %xmm3, %r8d - pmovmskb %xmm5, %eax - pmovmskb %xmm2, %esi - salq $32, %r8 - salq $32, %rax - pcmpeqb %xmm1, %xmm0 - orq %rdx, %rax - movq %rsi, %rdx - pmovmskb %xmm4, %esi - salq $48, %rdx - salq $16, %rsi - orq %r8, %rsi - orq %rcx, %rsi - pmovmskb %xmm0, %ecx - salq $48, %rcx - orq %rcx, %rsi - orq %rdx, %rax - je L(loop_header2) - leaq -1(%rax), %rcx - xorq %rax, %rcx - andq %rcx, %rsi - je L(exit) - bsrq %rsi, %rsi - leaq (%rdi,%rsi), %rax +L(first_vec_x0_test): + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jz L(ret0) + bsrl %eax, %eax + addq %r8, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 -L(loop_header2): - testq %rsi, %rsi - movq %rdi, %rcx - je L(no_c_found) -L(loop_header): - 
addq $64, %rdi - pxor %xmm7, %xmm7 - andq $-64, %rdi - jmp L(loop_entry) +L(first_vec_x1): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret .p2align 4 -L(loop64): - testq %rdx, %rdx - cmovne %rdx, %rsi - cmovne %rdi, %rcx - addq $64, %rdi -L(loop_entry): - movdqa 32(%rdi), %xmm3 - pxor %xmm6, %xmm6 - movdqa 48(%rdi), %xmm2 - movdqa %xmm3, %xmm0 - movdqa 16(%rdi), %xmm4 - pminub %xmm2, %xmm0 - movdqa (%rdi), %xmm5 - pminub %xmm4, %xmm0 - pminub %xmm5, %xmm0 - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %eax - movdqa %xmm5, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %r9d - movdqa %xmm4, %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - movdqa %xmm3, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $16, %rdx - pmovmskb %xmm0, %r10d - movdqa %xmm2, %xmm0 - pcmpeqb %xmm1, %xmm0 - salq $32, %r10 - orq %r10, %rdx - pmovmskb %xmm0, %r8d - orq %r9, %rdx - salq $48, %r8 - orq %r8, %rdx +L(first_vec_x1_test): + PCMPEQ %xmm0, %xmm2 + pmovmskb %xmm2, %eax testl %eax, %eax - je L(loop64) - pcmpeqb %xmm6, %xmm4 - pcmpeqb %xmm6, %xmm3 - pcmpeqb %xmm6, %xmm5 - pmovmskb %xmm4, %eax - pmovmskb %xmm3, %r10d - pcmpeqb %xmm6, %xmm2 - pmovmskb %xmm5, %r9d - salq $32, %r10 - salq $16, %rax - pmovmskb %xmm2, %r8d - orq %r10, %rax - orq %r9, %rax - salq $48, %r8 - orq %r8, %rax - leaq -1(%rax), %r8 - xorq %rax, %r8 - andq %r8, %rdx - cmovne %rdi, %rcx - cmovne %rdx, %rsi - bsrq %rsi, %rsi - leaq (%rcx,%rsi), %rax + jz L(first_vec_x0_test) + bsrl %eax, %eax + leaq (VEC_SIZE)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(first_vec_x2): + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm3, %eax + leal -1(%rcx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_vec_x1_test) + bsrl %eax, %eax + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +#ifdef USE_AS_WCSRCHR + andq 
$-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(aligned_more): + /* Save original pointer if match was in VEC 0. */ + movq %rdi, %r8 + andq $-VEC_SIZE, %rdi + + movaps VEC_SIZE(%rdi), %xmm2 + pxor %xmm3, %xmm3 + PCMPEQ %xmm2, %xmm3 + pmovmskb %xmm3, %ecx + testl %ecx, %ecx + jnz L(first_vec_x1) + + movaps (VEC_SIZE * 2)(%rdi), %xmm3 + pxor %xmm4, %xmm4 + PCMPEQ %xmm3, %xmm4 + pmovmskb %xmm4, %ecx + testl %ecx, %ecx + jnz L(first_vec_x2) + + addq $VEC_SIZE, %rdi + /* Save pointer again before realigning. */ + movq %rdi, %rsi + andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +L(first_loop): + /* Do 2x VEC at a time. */ + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* Since SSE2 has no pminud, wcsrchr needs separate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. */ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Use `addl` 1) so we can undo it with `subl` and 2) it can + macro-fuse with `jz`. */ + addl %ecx, %eax + jz L(first_loop) + + /* Check if there is zero match. */ + testl %ecx, %ecx + jz L(second_loop_match) + + /* Check if there was a match in last iteration. */ + subl %ecx, %eax + jnz L(new_match) + +L(first_loop_old_match): + PCMPEQ %xmm0, %xmm2 + PCMPEQ %xmm0, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + addl %eax, %ecx + jz L(first_vec_x0_test) + /* NB: We could move this shift to before the branch and save a + bit of code size / performance on the fall through.
The + branch leads to the null case which generally seems hotter + than char in first 3x VEC. */ + sall $16, %eax + orl %ecx, %eax + + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4 +L(new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons: since we mask + off zeros after the first zero (instead of using the full + comparison) we can't guarantee no interference between a match + after end of string and a valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(first_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret + /* Save minimum state for getting most recent match. We can + throw out all previous work. */ .p2align 4 -L(no_c_found): - movl $1, %esi - xorl %ecx, %ecx - jmp L(loop_header) +L(second_loop_match): + movq %rdi, %rsi + movaps %xmm4, %xmm2 + movaps %xmm7, %xmm3 .p2align 4 -L(exit): - xorl %eax, %eax +L(second_loop): + movaps (VEC_SIZE * 2)(%rdi), %xmm4 + movaps (VEC_SIZE * 3)(%rdi), %xmm5 + /* Since SSE2 has no pminud, wcsrchr needs separate logic for + detecting zero. Note if this is found to be a bottleneck it + may be worth adding an SSE4.1 wcsrchr implementation. */ +#ifdef USE_AS_WCSRCHR + movaps %xmm5, %xmm6 + pxor %xmm8, %xmm8 + + PCMPEQ %xmm8, %xmm5 + PCMPEQ %xmm4, %xmm8 + por %xmm5, %xmm8 +#else + movaps %xmm5, %xmm6 + PMINU %xmm4, %xmm5 +#endif + + movaps %xmm4, %xmm9 + PCMPEQ %xmm0, %xmm4 + PCMPEQ %xmm0, %xmm6 + movaps %xmm6, %xmm7 + por %xmm4, %xmm6 +#ifndef USE_AS_WCSRCHR + pxor %xmm8, %xmm8 + PCMPEQ %xmm5, %xmm8 +#endif + + pmovmskb %xmm8, %ecx + pmovmskb %xmm6, %eax + + addq $(VEC_SIZE * 2), %rdi + /* Either null term or new occurrence of CHAR.
*/ + addl %ecx, %eax + jz L(second_loop) + + /* No null term so must be new occurrence of CHAR. */ + testl %ecx, %ecx + jz L(second_loop_match) + + + subl %ecx, %eax + jnz L(second_loop_new_match) + +L(second_loop_old_match): + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + sall $16, %eax + orl %ecx, %eax + bsrl %eax, %eax + addq %rsi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif ret .p2align 4 +L(second_loop_new_match): + pxor %xmm6, %xmm6 + PCMPEQ %xmm9, %xmm6 + pmovmskb %xmm6, %eax + sall $16, %ecx + orl %eax, %ecx + + /* We can't reuse either of the old comparisons: since we mask + off zeros after the first zero (instead of using the full + comparison) we can't guarantee no interference between a match + after end of string and a valid match. */ + pmovmskb %xmm4, %eax + pmovmskb %xmm7, %edx + sall $16, %edx + orl %edx, %eax + + leal -1(%ecx), %edx + xorl %edx, %ecx + andl %ecx, %eax + jz L(second_loop_old_match) + bsrl %eax, %eax + addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif + ret + + .p2align 4,, 4 L(cross_page): - movq %rdi, %rax - pxor %xmm0, %xmm0 - andq $-64, %rax - movdqu (%rax), %xmm5 - movdqa %xmm5, %xmm6 - movdqu 16(%rax), %xmm4 - pcmpeqb %xmm1, %xmm5 - pcmpeqb %xmm0, %xmm6 - movdqu 32(%rax), %xmm3 - pmovmskb %xmm6, %esi - movdqa %xmm4, %xmm6 - movdqu 48(%rax), %xmm2 - pcmpeqb %xmm1, %xmm4 - pcmpeqb %xmm0, %xmm6 - pmovmskb %xmm6, %edx - movdqa %xmm3, %xmm6 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm0, %xmm6 - pcmpeqb %xmm2, %xmm0 - salq $16, %rdx - pmovmskb %xmm3, %r9d - pmovmskb %xmm6, %r8d - pmovmskb %xmm0, %ecx - salq $32, %r9 - salq $32, %r8 - pcmpeqb %xmm1, %xmm2 - orq %r8, %rdx - salq $48, %rcx - pmovmskb %xmm5, %r8d - orq %rsi, %rdx - pmovmskb %xmm4, %esi - orq %rcx, %rdx - pmovmskb %xmm2, %ecx - salq $16, %rsi - salq $48, %rcx - orq %r9, %rsi - orq %r8, %rsi - orq %rcx, %rsi + movq %rdi, %rsi + andq $-VEC_SIZE, %rsi + movaps (%rsi), %xmm1 + pxor %xmm2, %xmm2 + PCMPEQ %xmm1, %xmm2 + pmovmskb %xmm2, %edx movl %edi,
%ecx - subl %eax, %ecx - shrq %cl, %rdx - shrq %cl, %rsi - testq %rdx, %rdx - je L(loop_header2) - leaq -1(%rdx), %rax - xorq %rdx, %rax - andq %rax, %rsi - je L(exit) - bsrq %rsi, %rax + andl $(VEC_SIZE - 1), %ecx + sarl %cl, %edx + jz L(cross_page_continue) + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + sarl %cl, %eax + leal -1(%rdx), %ecx + xorl %edx, %ecx + andl %ecx, %eax + jz L(ret1) + bsrl %eax, %eax addq %rdi, %rax +#ifdef USE_AS_WCSRCHR + andq $-CHAR_SIZE, %rax +#endif +L(ret1): ret -END (strrchr) +END(STRRCHR) -weak_alias (strrchr, rindex) -libc_hidden_builtin_def (strrchr) +#ifndef USE_AS_WCSRCHR + weak_alias (STRRCHR, rindex) + libc_hidden_builtin_def (STRRCHR) +#endif diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S deleted file mode 100644 index 61b76ee0a..000000000 --- a/sysdeps/x86_64/strspn.S +++ /dev/null @@ -1,112 +0,0 @@ -/* strspn (str, ss) -- Return the length of the initial segment of STR - which contains only characters from SS. - For AMD x86-64. - Copyright (C) 1994-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - - .text -ENTRY (strspn) - - movq %rdi, %rdx /* Save SRC. */ - - /* First we create a table with flags for all possible characters. 
- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are - supported by the C string functions we have 256 characters. - Before inserting marks for the stop characters we clear the whole - table. */ - movq %rdi, %r8 /* Save value. */ - subq $256, %rsp /* Make space for 256 bytes. */ - cfi_adjust_cfa_offset(256) - movl $32, %ecx /* 32*8 bytes = 256 bytes. */ - movq %rsp, %rdi - xorl %eax, %eax /* We store 0s. */ - cld - rep - stosq - - movq %rsi, %rax /* Setup stopset. */ - -/* For understanding the following code remember that %rcx == 0 now. - Although all the following instruction only modify %cl we always - have a correct zero-extended 64-bit value in %rcx. */ - - .p2align 4 -L(2): movb (%rax), %cl /* get byte from stopset */ - testb %cl, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 1(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 2(%rax), %cl /* get byte from stopset */ - testb $0xff, %cl /* is NUL char? */ - jz L(1) /* yes => start compare loop */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - - movb 3(%rax), %cl /* get byte from stopset */ - addq $4, %rax /* increment stopset pointer */ - movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ - testb $0xff, %cl /* is NUL char? */ - jnz L(2) /* no => process next dword from stopset */ - -L(1): leaq -4(%rdx), %rax /* prepare loop */ - - /* We use a neat trick for the following loop. Normally we would - have to test for two termination conditions - 1. a character in the stopset was found - and - 2. the end of the string was found - But as a sign that the character is in the stopset we store its - value in the table. But the value of NUL is NUL so the loop - terminates for NUL in every case. 
*/ - - .p2align 4 -L(3): addq $4, %rax /* adjust pointer for full loop round */ - - movb (%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jz L(4) /* no => return */ - - movb 1(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jz L(5) /* no => return */ - - movb 2(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jz L(6) /* no => return */ - - movb 3(%rax), %cl /* get byte from string */ - testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ - jnz L(3) /* yes => start loop again */ - - incq %rax /* adjust pointer */ -L(6): incq %rax -L(5): incq %rax - -L(4): addq $256, %rsp /* remove stopset */ - cfi_adjust_cfa_offset(-256) - subq %rdx, %rax /* we have to return the number of valid - characters, so compute distance to first - non-valid character */ - ret -END (strspn) -libc_hidden_builtin_def (strspn) diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786..7f5defa4e 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -99,13 +99,31 @@ lose: \ to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ #define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ xtest; \ - jz 1f; \ - vzeroall; \ + jnz 1f; \ + vzeroupper; \ ret; \ 1: \ - vzeroupper; \ + vzeroall; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. This is useful when hoisting a vzeroupper from multiple + return paths to decrease the total number of vzerouppers and code + size. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. 
*/ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index c9165dbf0..d641141d7 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -40,82 +40,82 @@ ENTRY (__wcslen) pxor %xmm0, %xmm0 lea 32(%rdi), %rax - lea 16(%rdi), %rcx + addq $16, %rdi and $-16, %rax pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx pxor %xmm1, %xmm1 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx pxor %xmm2, %xmm2 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx pxor %xmm3, %xmm3 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) and $-0x40, %rax @@ -132,104 +132,100 @@ L(aligned_64_loop): pminub %xmm0, %xmm2 pcmpeqd %xmm3, %xmm2 pmovmskb %xmm2, %edx + addq $64, %rax test %edx, %edx - lea 64(%rax), %rax jz L(aligned_64_loop) pcmpeqd -64(%rax), %xmm3 pmovmskb %xmm3, %edx + addq $48, %rdi test 
%edx, %edx - lea 48(%rcx), %rcx jnz L(exit) pcmpeqd %xmm1, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd -32(%rax), %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd %xmm6, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - jmp L(aligned_64_loop) + jz L(aligned_64_loop) .p2align 4 L(exit): - sub %rcx, %rax + sub %rdi, %rax shr $2, %rax test %dl, %dl jz L(exit_high) - mov %dl, %cl - and $15, %cl + andl $15, %edx jz L(exit_1) ret - .p2align 4 + /* No align here. Naturally aligned % 16 == 1. */ L(exit_high): - mov %dh, %ch - and $15, %ch + andl $(15 << 8), %edx jz L(exit_3) add $2, %rax ret - .p2align 4 + .p2align 3 L(exit_1): add $1, %rax ret - .p2align 4 + .p2align 3 L(exit_3): add $3, %rax ret - .p2align 4 + .p2align 3 L(exit_tail0): - xor %rax, %rax + xorl %eax, %eax ret - .p2align 4 + .p2align 3 L(exit_tail1): - mov $1, %rax + movl $1, %eax ret - .p2align 4 + .p2align 3 L(exit_tail2): - mov $2, %rax + movl $2, %eax ret - .p2align 4 + .p2align 3 L(exit_tail3): - mov $3, %rax + movl $3, %eax ret - .p2align 4 + .p2align 3 L(exit_tail4): - mov $4, %rax + movl $4, %eax ret - .p2align 4 + .p2align 3 L(exit_tail5): - mov $5, %rax + movl $5, %eax ret - .p2align 4 + .p2align 3 L(exit_tail6): - mov $6, %rax + movl $6, %eax ret - .p2align 4 + .p2align 3 L(exit_tail7): - mov $7, %rax + movl $7, %eax ret END (__wcslen) diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S index 61552954d..2b80efc5e 100644 --- a/sysdeps/x86_64/wcsrchr.S +++ b/sysdeps/x86_64/wcsrchr.S @@ -1,4 +1,4 @@ -/* wcsrchr with SSSE3 +/* wcsrchr optimized with SSE2. Copyright (C) 2011-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,266 +16,12 @@ License along with the GNU C Library; if not, see . 
*/ -#include - .text -ENTRY (wcsrchr) +#define USE_AS_WCSRCHR 1 +#define NO_PMINU 1 - movd %rsi, %xmm1 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - pxor %xmm2, %xmm2 - punpckldq %xmm1, %xmm1 - and $63, %rcx - cmp $48, %rcx - ja L(crosscache) +#ifndef STRRCHR +# define STRRCHR wcsrchr +#endif - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match1) - - test %rcx, %rcx - jnz L(return_null) - - and $-16, %rdi - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match1): - test %rcx, %rcx - jnz L(prolog_find_zero_1) - - mov %rax, %r8 - mov %rdi, %rsi - and $-16, %rdi - jmp L(loop) - - .p2align 4 -L(crosscache): - and $15, %rcx - and $-16, %rdi - pxor %xmm3, %xmm3 - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm3 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm3, %rdx - pmovmskb %xmm0, %rax - shr %cl, %rdx - shr %cl, %rax - add $16, %rdi - - test %rax, %rax - jnz L(unaligned_match) - - test %rdx, %rdx - jnz L(return_null) - - xor %r8, %r8 - jmp L(loop) - - .p2align 4 -L(unaligned_match): - test %rdx, %rdx - jnz L(prolog_find_zero) - - mov %rax, %r8 - lea (%rdi, %rcx), %rsi - -/* Loop start on aligned string. 
*/ - .p2align 4 -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rcx - pmovmskb %xmm0, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm3 - pcmpeqd %xmm3, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm2, %rcx - pmovmskb %xmm3, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm4 - pcmpeqd %xmm4, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm4 - pmovmskb %xmm2, %rcx - pmovmskb %xmm4, %rax - or %rax, %rcx - jnz L(matches) - - movdqa (%rdi), %xmm5 - pcmpeqd %xmm5, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm5 - pmovmskb %xmm2, %rcx - pmovmskb %xmm5, %rax - or %rax, %rcx - jz L(loop) - - .p2align 4 -L(matches): - test %rax, %rax - jnz L(match) -L(return_value): - test %r8, %r8 - jz L(return_null) - mov %r8, %rax - mov %rsi, %rdi - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match): - pmovmskb %xmm2, %rcx - test %rcx, %rcx - jnz L(find_zero) - mov %rax, %r8 - mov %rdi, %rsi - jmp L(loop) - - .p2align 4 -L(find_zero): - test $15, %cl - jnz L(find_zero_in_first_wchar) - test %cl, %cl - jnz L(find_zero_in_second_wchar) - test $15, %ch - jnz L(find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - jz L(return_value) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_first_wchar): - test $1, %rax - jz L(return_value) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_value) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_value) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) 
- lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero): - add %rcx, %rdi - mov %rdx, %rcx -L(prolog_find_zero_1): - test $15, %cl - jnz L(prolog_find_zero_in_first_wchar) - test %cl, %cl - jnz L(prolog_find_zero_in_second_wchar) - test $15, %ch - jnz L(prolog_find_zero_in_third_wchar) - - and $1 << 13 - 1, %rax - jz L(return_null) - - test $15 << 4, %ah - jnz L(match_fourth_wchar) - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_first_wchar): - test $1, %rax - jz L(return_null) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_second_wchar): - and $1 << 5 - 1, %rax - jz L(return_null) - - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(prolog_find_zero_in_third_wchar): - and $1 << 9 - 1, %rax - jz L(return_null) - - test %ah, %ah - jnz L(match_third_wchar) - test $15 << 4, %al - jnz L(match_second_wchar) - lea -16(%rdi), %rax - ret - - .p2align 4 -L(match_second_wchar): - lea -12(%rdi), %rax - ret - - .p2align 4 -L(match_third_wchar): - lea -8(%rdi), %rax - ret - - .p2align 4 -L(match_fourth_wchar): - lea -4(%rdi), %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (wcsrchr) +#include "../strrchr.S" diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S new file mode 100644 index 000000000..815b999e4 --- /dev/null +++ b/sysdeps/x86_64/wmemcmp.S @@ -0,0 +1,23 @@ +/* wmemcmp optimized with SSE2. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#define MEMCMP __wmemcmp +#define USE_AS_WMEMCMP 1 +#include "memcmp.S" + +weak_alias (__wmemcmp, wmemcmp) diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile index df9a85f4a..9ec758233 100644 --- a/wcsmbs/Makefile +++ b/wcsmbs/Makefile @@ -22,8 +22,9 @@ subdir := wcsmbs include ../Makeconfig -headers := wchar.h bits/wchar.h bits/wchar2.h bits/wchar-ldbl.h uchar.h \ - bits/types/__mbstate_t.h bits/types/mbstate_t.h bits/types/wint_t.h +headers := wchar.h bits/wchar.h bits/wchar2.h bits/wchar2-decl.h \ + bits/wchar-ldbl.h uchar.h bits/types/__mbstate_t.h \ + bits/types/mbstate_t.h bits/types/wint_t.h routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \ wcsncmp wcsncpy wcspbrk wcsrchr wcsspn wcstok wcsstr wmemchr \ diff --git a/wcsmbs/bits/wchar2-decl.h b/wcsmbs/bits/wchar2-decl.h new file mode 100644 index 000000000..8e1735c33 --- /dev/null +++ b/wcsmbs/bits/wchar2-decl.h @@ -0,0 +1,124 @@ +/* Checking macros for wchar functions. Declarations only. + Copyright (C) 2004-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _BITS_WCHAR2_DECL_H +#define _BITS_WCHAR2_DECL_H 1 + +#ifndef _WCHAR_H +# error "Never include directly; use instead." +#endif + + +extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1, + const wchar_t *__restrict __s2, size_t __n, + size_t __ns1) __THROW; +extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2, + size_t __n, size_t __ns1) __THROW; + + +#ifdef __USE_GNU + +extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1, + const wchar_t *__restrict __s2, size_t __n, + size_t __ns1) __THROW; + +#endif + + +extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n, + size_t __ns) __THROW; +extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, + size_t __n) __THROW; +extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, + size_t __destlen) __THROW; +extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, size_t __n, + size_t __destlen) __THROW; +extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, size_t __n, + size_t __destlen) __THROW; +extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, + size_t __destlen) __THROW; +extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest, + const wchar_t *__restrict __src, + size_t __n, size_t __destlen) __THROW; +extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n, + int __flag, size_t __s_len, + const wchar_t *__restrict __format, ...) 
+ __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */; +extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n, + int __flag, size_t __s_len, + const wchar_t *__restrict __format, + __gnuc_va_list __arg) + __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */; + +#if __USE_FORTIFY_LEVEL > 1 + +extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag, + const wchar_t *__restrict __format, ...); +extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format, + ...); +extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag, + const wchar_t *__restrict __format, + __gnuc_va_list __ap); +extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format, + __gnuc_va_list __ap); + +#endif + +extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n, + __FILE *__restrict __stream) __wur; + +#ifdef __USE_GNU + +extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size, + int __n, __FILE *__restrict __stream) + __wur; + +#endif + +extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar, + mbstate_t *__restrict __p, + size_t __buflen) __THROW __wur; +extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst, + const char **__restrict __src, + size_t __len, mbstate_t *__restrict __ps, + size_t __dstlen) __THROW; +extern size_t __wcsrtombs_chk (char *__restrict __dst, + const wchar_t **__restrict __src, + size_t __len, mbstate_t *__restrict __ps, + size_t __dstlen) __THROW; + +#ifdef __USE_XOPEN2K8 + +extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst, + const char **__restrict __src, size_t __nmc, + size_t __len, mbstate_t *__restrict __ps, + size_t __dstlen) __THROW; +extern size_t __wcsnrtombs_chk (char *__restrict __dst, + const wchar_t **__restrict __src, + size_t __nwc, size_t __len, + mbstate_t *__restrict __ps, size_t __dstlen) + __THROW; + +#endif + +#endif /* bits/wchar2-decl.h. 
*/ diff --git a/wcsmbs/bits/wchar2.h b/wcsmbs/bits/wchar2.h index 0e017f458..3f110efe5 100644 --- a/wcsmbs/bits/wchar2.h +++ b/wcsmbs/bits/wchar2.h @@ -21,9 +21,6 @@ #endif -extern wchar_t *__wmemcpy_chk (wchar_t *__restrict __s1, - const wchar_t *__restrict __s2, size_t __n, - size_t __ns1) __THROW; extern wchar_t *__REDIRECT_NTH (__wmemcpy_alias, (wchar_t *__restrict __s1, const wchar_t *__restrict __s2, size_t __n), @@ -45,8 +42,6 @@ __NTH (wmemcpy (wchar_t *__restrict __s1, const wchar_t *__restrict __s2, } -extern wchar_t *__wmemmove_chk (wchar_t *__s1, const wchar_t *__s2, - size_t __n, size_t __ns1) __THROW; extern wchar_t *__REDIRECT_NTH (__wmemmove_alias, (wchar_t *__s1, const wchar_t *__s2, size_t __n), wmemmove); @@ -66,9 +61,6 @@ __NTH (wmemmove (wchar_t *__s1, const wchar_t *__s2, size_t __n)) #ifdef __USE_GNU -extern wchar_t *__wmempcpy_chk (wchar_t *__restrict __s1, - const wchar_t *__restrict __s2, size_t __n, - size_t __ns1) __THROW; extern wchar_t *__REDIRECT_NTH (__wmempcpy_alias, (wchar_t *__restrict __s1, const wchar_t *__restrict __s2, @@ -91,8 +83,6 @@ __NTH (wmempcpy (wchar_t *__restrict __s1, const wchar_t *__restrict __s2, #endif -extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n, - size_t __ns) __THROW; extern wchar_t *__REDIRECT_NTH (__wmemset_alias, (wchar_t *__s, wchar_t __c, size_t __n), wmemset); extern wchar_t *__REDIRECT_NTH (__wmemset_chk_warn, @@ -110,9 +100,6 @@ __NTH (wmemset (wchar_t *__s, wchar_t __c, size_t __n)) } -extern wchar_t *__wcscpy_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, - size_t __n) __THROW; extern wchar_t *__REDIRECT_NTH (__wcscpy_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src), wcscpy); @@ -127,9 +114,6 @@ __NTH (wcscpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src)) } -extern wchar_t *__wcpcpy_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, - size_t __destlen) __THROW; extern wchar_t *__REDIRECT_NTH 
(__wcpcpy_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src), wcpcpy); @@ -144,9 +128,6 @@ __NTH (wcpcpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src)) } -extern wchar_t *__wcsncpy_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, size_t __n, - size_t __destlen) __THROW; extern wchar_t *__REDIRECT_NTH (__wcsncpy_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src, @@ -168,9 +149,6 @@ __NTH (wcsncpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src, } -extern wchar_t *__wcpncpy_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, size_t __n, - size_t __destlen) __THROW; extern wchar_t *__REDIRECT_NTH (__wcpncpy_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src, @@ -192,9 +170,6 @@ __NTH (wcpncpy (wchar_t *__restrict __dest, const wchar_t *__restrict __src, } -extern wchar_t *__wcscat_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, - size_t __destlen) __THROW; extern wchar_t *__REDIRECT_NTH (__wcscat_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src), wcscat); @@ -209,9 +184,6 @@ __NTH (wcscat (wchar_t *__restrict __dest, const wchar_t *__restrict __src)) } -extern wchar_t *__wcsncat_chk (wchar_t *__restrict __dest, - const wchar_t *__restrict __src, - size_t __n, size_t __destlen) __THROW; extern wchar_t *__REDIRECT_NTH (__wcsncat_alias, (wchar_t *__restrict __dest, const wchar_t *__restrict __src, @@ -228,10 +200,6 @@ __NTH (wcsncat (wchar_t *__restrict __dest, const wchar_t *__restrict __src, } -extern int __swprintf_chk (wchar_t *__restrict __s, size_t __n, - int __flag, size_t __s_len, - const wchar_t *__restrict __format, ...) 
- __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 6))) */; extern int __REDIRECT_NTH_LDBL (__swprintf_alias, (wchar_t *__restrict __s, size_t __n, @@ -258,11 +226,6 @@ __NTH (swprintf (wchar_t *__restrict __s, size_t __n, : swprintf (s, n, __VA_ARGS__)) #endif -extern int __vswprintf_chk (wchar_t *__restrict __s, size_t __n, - int __flag, size_t __s_len, - const wchar_t *__restrict __format, - __gnuc_va_list __arg) - __THROW /* __attribute__ ((__format__ (__wprintf__, 5, 0))) */; extern int __REDIRECT_NTH_LDBL (__vswprintf_alias, (wchar_t *__restrict __s, size_t __n, @@ -283,16 +246,6 @@ __NTH (vswprintf (wchar_t *__restrict __s, size_t __n, #if __USE_FORTIFY_LEVEL > 1 -extern int __fwprintf_chk (__FILE *__restrict __stream, int __flag, - const wchar_t *__restrict __format, ...); -extern int __wprintf_chk (int __flag, const wchar_t *__restrict __format, - ...); -extern int __vfwprintf_chk (__FILE *__restrict __stream, int __flag, - const wchar_t *__restrict __format, - __gnuc_va_list __ap); -extern int __vwprintf_chk (int __flag, const wchar_t *__restrict __format, - __gnuc_va_list __ap); - # ifdef __va_arg_pack __fortify_function int wprintf (const wchar_t *__restrict __fmt, ...) 
@@ -328,8 +281,6 @@ vfwprintf (__FILE *__restrict __stream, #endif -extern wchar_t *__fgetws_chk (wchar_t *__restrict __s, size_t __size, int __n, - __FILE *__restrict __stream) __wur; extern wchar_t *__REDIRECT (__fgetws_alias, (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream), fgetws) __wur; @@ -351,9 +302,6 @@ fgetws (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream) } #ifdef __USE_GNU -extern wchar_t *__fgetws_unlocked_chk (wchar_t *__restrict __s, size_t __size, - int __n, __FILE *__restrict __stream) - __wur; extern wchar_t *__REDIRECT (__fgetws_unlocked_alias, (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream), fgetws_unlocked) @@ -379,9 +327,6 @@ fgetws_unlocked (wchar_t *__restrict __s, int __n, __FILE *__restrict __stream) #endif -extern size_t __wcrtomb_chk (char *__restrict __s, wchar_t __wchar, - mbstate_t *__restrict __p, - size_t __buflen) __THROW __wur; extern size_t __REDIRECT_NTH (__wcrtomb_alias, (char *__restrict __s, wchar_t __wchar, mbstate_t *__restrict __ps), wcrtomb) __wur; @@ -404,10 +349,6 @@ __NTH (wcrtomb (char *__restrict __s, wchar_t __wchar, } -extern size_t __mbsrtowcs_chk (wchar_t *__restrict __dst, - const char **__restrict __src, - size_t __len, mbstate_t *__restrict __ps, - size_t __dstlen) __THROW; extern size_t __REDIRECT_NTH (__mbsrtowcs_alias, (wchar_t *__restrict __dst, const char **__restrict __src, @@ -431,10 +372,6 @@ __NTH (mbsrtowcs (wchar_t *__restrict __dst, const char **__restrict __src, } -extern size_t __wcsrtombs_chk (char *__restrict __dst, - const wchar_t **__restrict __src, - size_t __len, mbstate_t *__restrict __ps, - size_t __dstlen) __THROW; extern size_t __REDIRECT_NTH (__wcsrtombs_alias, (char *__restrict __dst, const wchar_t **__restrict __src, @@ -458,10 +395,6 @@ __NTH (wcsrtombs (char *__restrict __dst, const wchar_t **__restrict __src, #ifdef __USE_XOPEN2K8 -extern size_t __mbsnrtowcs_chk (wchar_t *__restrict __dst, - const char **__restrict __src, size_t 
__nmc, - size_t __len, mbstate_t *__restrict __ps, - size_t __dstlen) __THROW; extern size_t __REDIRECT_NTH (__mbsnrtowcs_alias, (wchar_t *__restrict __dst, const char **__restrict __src, size_t __nmc, @@ -485,11 +418,6 @@ __NTH (mbsnrtowcs (wchar_t *__restrict __dst, const char **__restrict __src, } -extern size_t __wcsnrtombs_chk (char *__restrict __dst, - const wchar_t **__restrict __src, - size_t __nwc, size_t __len, - mbstate_t *__restrict __ps, size_t __dstlen) - __THROW; extern size_t __REDIRECT_NTH (__wcsnrtombs_alias, (char *__restrict __dst, const wchar_t **__restrict __src, diff --git a/wcsmbs/wchar.h b/wcsmbs/wchar.h index 5d6a40853..c1321c751 100644 --- a/wcsmbs/wchar.h +++ b/wcsmbs/wchar.h @@ -864,14 +864,21 @@ extern size_t wcsftime_l (wchar_t *__restrict __s, size_t __maxsize, /* Define some macros helping to catch buffer overflows. */ #if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function -# include +/* Declare all functions from bits/wchar2-decl.h first. */ +# include #endif -#include +/* The following headers provide asm redirections. These redirections must + appear before the first usage of these functions, e.g. in bits/wchar.h. */ #if defined __LDBL_COMPAT || __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI == 1 # include #endif +#if __USE_FORTIFY_LEVEL > 0 && defined __fortify_function +/* Now include the function definitions and redirects too. */ +# include +#endif + __END_DECLS #endif /* wchar.h */ -- 2.30.2