From 4d6da0374a6f9e36702cb9fbc7418d144cd62410 Mon Sep 17 00:00:00 2001
From: joseph
Date: Wed, 29 Jul 2009 15:58:14 +0000
Subject: Merge changes between r8623 and r8721 from /fsf/trunk.

git-svn-id: svn://svn.eglibc.org/trunk@8722 7b3dc134-2b1b-0410-93df-9e9f96275f8d
---
 libc/sysdeps/generic/ldsodefs.h                    |   19 +
 libc/sysdeps/generic/sysdep.h                      |   43 +-
 libc/sysdeps/i386/configure                        |   25 +
 libc/sysdeps/i386/configure.in                     |   11 +
 libc/sysdeps/posix/getaddrinfo.c                   |    2 +
 libc/sysdeps/powerpc/sysdep.h                      |    4 +-
 libc/sysdeps/s390/dl-procinfo.c                    |   10 +-
 libc/sysdeps/s390/dl-procinfo.h                    |    9 +-
 libc/sysdeps/s390/s390-64/Makefile                 |   67 +
 libc/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c  |  238 +++
 libc/sysdeps/s390/s390-64/utf16-utf32-z9.c         |  325 ++++
 libc/sysdeps/s390/s390-64/utf8-utf16-z9.c          |  463 +++++
 libc/sysdeps/s390/s390-64/utf8-utf32-z9.c          |  508 ++++++
 libc/sysdeps/unix/sysv/linux/configure             |   11 -
 libc/sysdeps/unix/sysv/linux/configure.in          |   13 -
 libc/sysdeps/unix/sysv/linux/eventfd.c             |   15 +-
 libc/sysdeps/unix/sysv/linux/i386/makecontext.S    |   10 +-
 libc/sysdeps/unix/sysv/linux/i386/sysconf.c        |    3 +
 libc/sysdeps/unix/sysv/linux/kernel-features.h     |    2 +
 libc/sysdeps/unix/sysv/linux/signalfd.c            |   15 +-
 libc/sysdeps/unix/sysv/linux/sys/epoll.h           |    4 +-
 libc/sysdeps/x86_64/Makefile                       |    6 +
 libc/sysdeps/x86_64/bits/link.h                    |   12 +
 libc/sysdeps/x86_64/cacheinfo.c                    |   53 +-
 libc/sysdeps/x86_64/dl-trampoline.S                |  267 ++-
 libc/sysdeps/x86_64/elf/configure                  |   25 +
 libc/sysdeps/x86_64/elf/configure.in               |   11 +
 libc/sysdeps/x86_64/link-defines.sym               |   28 +
 libc/sysdeps/x86_64/memcmp.S                       |  359 ++++
 libc/sysdeps/x86_64/multiarch/Makefile             |   10 +-
 libc/sysdeps/x86_64/multiarch/init-arch.c          |    8 +-
 libc/sysdeps/x86_64/multiarch/rawmemchr.S          |    1 +
 libc/sysdeps/x86_64/multiarch/rtld-strlen.S        |    1 +
 libc/sysdeps/x86_64/multiarch/stpcpy.S             |    7 +
 libc/sysdeps/x86_64/multiarch/stpncpy-c.c          |    8 +
 libc/sysdeps/x86_64/multiarch/stpncpy.S            |    6 +
 libc/sysdeps/x86_64/multiarch/strcasestr-c.c       |   18 +
 libc/sysdeps/x86_64/multiarch/strcasestr.c         |    3 +
 libc/sysdeps/x86_64/multiarch/strcmp.S             |  370 ++--
 libc/sysdeps/x86_64/multiarch/strcpy.S             | 1911 ++++++++++++++++++++
 libc/sysdeps/x86_64/multiarch/strcspn-c.c          |  312 ++++
 libc/sysdeps/x86_64/multiarch/strcspn.S            |   82 +
 libc/sysdeps/x86_64/multiarch/strlen.S             |    1 +
 libc/sysdeps/x86_64/multiarch/strncmp-c.c          |    8 -
 libc/sysdeps/x86_64/multiarch/strncpy-c.c          |    8 +
 libc/sysdeps/x86_64/multiarch/strncpy.S            |    3 +
 libc/sysdeps/x86_64/multiarch/strpbrk-c.c          |    4 +
 libc/sysdeps/x86_64/multiarch/strpbrk.S            |    3 +
 libc/sysdeps/x86_64/multiarch/strspn-c.c           |  284 +++
 libc/sysdeps/x86_64/multiarch/strspn.S             |   63 +
 libc/sysdeps/x86_64/multiarch/strstr-c.c           |   12 +
 libc/sysdeps/x86_64/multiarch/strstr.c             |  464 +++++
 libc/sysdeps/x86_64/rtld-memcmp.c                  |    1 +
 libc/sysdeps/x86_64/rtld-strchr.S                  |  291 +++
 libc/sysdeps/x86_64/rtld-strlen.S                  |  139 ++
 libc/sysdeps/x86_64/strcmp.S                       | 1948 ++++++++++++++++++++-
 libc/sysdeps/x86_64/strncmp.S                      |    3 +
 libc/sysdeps/x86_64/tst-xmmymm.sh                  |   79 +
 58 files changed, 8212 insertions(+), 394 deletions(-)
 create mode 100644 libc/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
 create mode 100644 libc/sysdeps/s390/s390-64/utf16-utf32-z9.c
 create mode 100644 libc/sysdeps/s390/s390-64/utf8-utf16-z9.c
 create mode 100644 libc/sysdeps/s390/s390-64/utf8-utf32-z9.c
 create mode 100644 libc/sysdeps/x86_64/link-defines.sym
 create mode 100644 libc/sysdeps/x86_64/memcmp.S
 create mode 100644 libc/sysdeps/x86_64/multiarch/rtld-strlen.S
 create mode 100644 libc/sysdeps/x86_64/multiarch/stpcpy.S
 create mode 100644 libc/sysdeps/x86_64/multiarch/stpncpy-c.c
create mode 100644 libc/sysdeps/x86_64/multiarch/stpncpy.S create mode 100644 libc/sysdeps/x86_64/multiarch/strcasestr-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strcasestr.c create mode 100644 libc/sysdeps/x86_64/multiarch/strcpy.S create mode 100644 libc/sysdeps/x86_64/multiarch/strcspn-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strcspn.S delete mode 100644 libc/sysdeps/x86_64/multiarch/strncmp-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strncpy-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strncpy.S create mode 100644 libc/sysdeps/x86_64/multiarch/strpbrk-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strpbrk.S create mode 100644 libc/sysdeps/x86_64/multiarch/strspn-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strspn.S create mode 100644 libc/sysdeps/x86_64/multiarch/strstr-c.c create mode 100644 libc/sysdeps/x86_64/multiarch/strstr.c create mode 100644 libc/sysdeps/x86_64/rtld-memcmp.c create mode 100644 libc/sysdeps/x86_64/rtld-strchr.S create mode 100644 libc/sysdeps/x86_64/rtld-strlen.S create mode 100644 libc/sysdeps/x86_64/strncmp.S create mode 100755 libc/sysdeps/x86_64/tst-xmmymm.sh (limited to 'libc/sysdeps') diff --git a/libc/sysdeps/generic/ldsodefs.h b/libc/sysdeps/generic/ldsodefs.h index b1af7fde0..e18e60f73 100644 --- a/libc/sysdeps/generic/ldsodefs.h +++ b/libc/sysdeps/generic/ldsodefs.h @@ -335,6 +335,10 @@ struct audit_ifaces extern int _dl_name_match_p (const char *__name, const struct link_map *__map) internal_function; +/* Compute next higher prime number. */ +extern unsigned long int _dl_higher_prime_number (unsigned long int n) + internal_function; + /* Function used as argument for `_dl_receive_error' function. The arguments are the error code, error string, and the objname the error occurred in. */ @@ -383,6 +387,21 @@ struct rtld_global allocated by rtld. Later it keeps the size of the map. It might be reset if in _dl_close if the last global object is removed. */ size_t _ns_global_scope_alloc; + /* Search table for unique objects. */ + struct unique_sym_table + { + __rtld_lock_recursive_t lock; + struct unique_sym + { + uint32_t hashval; + const char *name; + const ElfW(Sym) *sym; + struct link_map *map; + } *entries; + size_t size; + size_t n_elements; + void (*free) (void *); + } _ns_unique_sym_table; /* Keep track of changes to each namespace' list. */ struct r_debug _ns_debug; } _dl_ns[DL_NNS]; diff --git a/libc/sysdeps/generic/sysdep.h b/libc/sysdeps/generic/sysdep.h index 15d951c77..54884d9af 100644 --- a/libc/sysdeps/generic/sysdep.h +++ b/libc/sysdeps/generic/sysdep.h @@ -1,5 +1,5 @@ /* Generic asm macros used on many machines. - Copyright (C) 1991,92,93,96,98,2002,2003 Free Software Foundation, Inc. + Copyright (C) 1991,92,93,96,98,2002,2003,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -39,13 +39,13 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms to generate correct debugging information. */ -#ifndef END -#define END(sym) -#endif +# ifndef END +# define END(sym) +# endif -#ifndef JUMPTARGET -#define JUMPTARGET(sym) sym -#endif +# ifndef JUMPTARGET +# define JUMPTARGET(sym) sym +# endif /* Makros to generate eh_frame unwind information. 
*/ # ifdef HAVE_ASM_CFI_DIRECTIVES @@ -65,6 +65,8 @@ # define cfi_remember_state .cfi_remember_state # define cfi_restore_state .cfi_restore_state # define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp # else # define cfi_startproc # define cfi_endproc @@ -82,6 +84,8 @@ # define cfi_remember_state # define cfi_restore_state # define cfi_window_save +# define cfi_personality(enc, exp) +# define cfi_lsda(enc, exp) # endif #else /* ! ASSEMBLER */ @@ -116,6 +120,10 @@ ".cfi_restore_state" # define CFI_WINDOW_SAVE \ ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # else # define CFI_STARTPROC # define CFI_ENDPROC @@ -132,6 +140,27 @@ # define CFI_REMEMBER_STATE # define CFI_RESTORE_STATE # define CFI_WINDOW_SAVE +# define CFI_PERSONALITY(enc, exp) +# define CFI_LSDA(enc, exp) # endif #endif /* __ASSEMBLER__ */ + +/* Values used for encoding parameter of cfi_personality and cfi_lsda. */ +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0a +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 diff --git a/libc/sysdeps/i386/configure b/libc/sysdeps/i386/configure index d1d4dc15a..7ceabbf55 100755 --- a/libc/sysdeps/i386/configure +++ b/libc/sysdeps/i386/configure @@ -52,3 +52,28 @@ if test $libc_cv_cpp_asm_debuginfo = yes; then _ACEOF fi + +echo "$as_me:$LINENO: checking for SSE4 support" >&5 +echo $ECHO_N "checking for SSE4 support... $ECHO_C" >&6 +if test "${libc_cv_cc_sse4+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi +fi +echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5 +echo "${ECHO_T}$libc_cv_cc_sse4" >&6 +if test $libc_cv_cc_sse4 = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_SSE4_SUPPORT 1 +_ACEOF + +fi diff --git a/libc/sysdeps/i386/configure.in b/libc/sysdeps/i386/configure.in index 028e1ae8e..44f53a57a 100644 --- a/libc/sysdeps/i386/configure.in +++ b/libc/sysdeps/i386/configure.in @@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo) if test $libc_cv_cpp_asm_debuginfo = yes; then AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO) fi + +dnl Check if -msse4 works. 
+AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl +if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi]) +if test $libc_cv_cc_sse4 = yes; then + AC_DEFINE(HAVE_SSE4_SUPPORT) +fi diff --git a/libc/sysdeps/posix/getaddrinfo.c b/libc/sysdeps/posix/getaddrinfo.c index d346c621f..a788d18fe 100644 --- a/libc/sysdeps/posix/getaddrinfo.c +++ b/libc/sysdeps/posix/getaddrinfo.c @@ -833,6 +833,8 @@ gaih_inet (const char *name, const struct gaih_service *service, && inet6_status != NSS_STATUS_UNAVAIL) status = inet6_status; } + else + status = NSS_STATUS_UNAVAIL; } if (nss_next_action (nip, status) == NSS_ACTION_RETURN) diff --git a/libc/sysdeps/powerpc/sysdep.h b/libc/sysdeps/powerpc/sysdep.h index 43edeb71e..f5c79c54e 100644 --- a/libc/sysdeps/powerpc/sysdep.h +++ b/libc/sysdeps/powerpc/sysdep.h @@ -44,8 +44,8 @@ #define PPC_FEATURE_PA6T 0x00000800 /* PA Semi 6T Core */ #define PPC_FEATURE_HAS_DFP 0x00000400 /* Decimal FP Unit */ #define PPC_FEATURE_POWER6_EXT 0x00000200 /* P6 + mffgpr/mftgpr */ -#define PPC_FEATURE_HAS_VSX 0x00000100 /* P7 Vector Extension. */ -#define PPC_FEATURE_ARCH_2_06 0x00000080 /* ISA 2.06 */ +#define PPC_FEATURE_ARCH_2_06 0x00000100 /* ISA 2.06 */ +#define PPC_FEATURE_HAS_VSX 0x00000080 /* P7 Vector Extension. */ #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC) #ifdef __ASSEMBLER__ diff --git a/libc/sysdeps/s390/dl-procinfo.c b/libc/sysdeps/s390/dl-procinfo.c index 32c6aef95..d51d7b237 100644 --- a/libc/sysdeps/s390/dl-procinfo.c +++ b/libc/sysdeps/s390/dl-procinfo.c @@ -1,5 +1,5 @@ /* Data for s390 version of processor capability information. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky , 2006. @@ -47,11 +47,11 @@ #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_cap_flags #else -PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] +PROCINFO_CLASS const char _dl_s390_cap_flags[10][8] #endif #ifndef PROCINFO_DECL = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp" + "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", "hpage", "etf3enh", "highgprs" } #endif #if !defined SHARED || defined PROCINFO_DECL @@ -63,11 +63,11 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_platforms #else -PROCINFO_CLASS const char _dl_s390_platforms[4][7] +PROCINFO_CLASS const char _dl_s390_platforms[5][7] #endif #ifndef PROCINFO_DECL = { - "g5", "z900", "z990", "z9-109" + "g5", "z900", "z990", "z9-109", "z10" } #endif #if !defined SHARED || defined PROCINFO_DECL diff --git a/libc/sysdeps/s390/dl-procinfo.h b/libc/sysdeps/s390/dl-procinfo.h index 178d7cc01..0a7ebd3be 100644 --- a/libc/sysdeps/s390/dl-procinfo.h +++ b/libc/sysdeps/s390/dl-procinfo.h @@ -1,5 +1,5 @@ /* s390 version of processor capability information handling macros. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky , 2006. @@ -22,9 +22,9 @@ #define _DL_PROCINFO_H 1 #include -#define _DL_HWCAP_COUNT 7 +#define _DL_HWCAP_COUNT 10 -#define _DL_PLATFORMS_COUNT 4 +#define _DL_PLATFORMS_COUNT 5 /* The kernel provides up to 32 capability bits with elf_hwcap. 
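(Aside, not part of the patch: the three new capability strings added above, "hpage", "etf3enh" and "highgprs", correspond to the HWCAP_S390_* bits defined just below, and user code can read the same bits from the auxiliary vector. A minimal sketch, assuming the getauxval interface, which postdates this 2009 change; the bit values are copied from the enum in dl-procinfo.h:)

  #include <stdio.h>
  #include <sys/auxv.h>          /* getauxval, AT_HWCAP */

  #define HWCAP_S390_ETF3EH     (1 << 8)   /* "etf3enh" */
  #define HWCAP_S390_HIGH_GPRS  (1 << 9)   /* "highgprs" */

  int
  main (void)
  {
    unsigned long int hwcap = getauxval (AT_HWCAP);
    printf ("etf3enh:  %s\n", (hwcap & HWCAP_S390_ETF3EH) ? "yes" : "no");
    printf ("highgprs: %s\n", (hwcap & HWCAP_S390_HIGH_GPRS) ? "yes" : "no");
    return 0;
  }
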
*/ #define _DL_FIRST_PLATFORM 32 @@ -45,6 +45,9 @@ enum HWCAP_S390_LDISP = 1 << 4, HWCAP_S390_EIMM = 1 << 5, HWCAP_S390_DFP = 1 << 6, + HWCAP_S390_HPAGE = 1 << 7, + HWCAP_S390_ETF3EH = 1 << 8, + HWCAP_S390_HIGH_GPRS = 1 << 9, }; #define HWCAP_IMPORTANT (HWCAP_S390_ZARCH | HWCAP_S390_LDISP \ diff --git a/libc/sysdeps/s390/s390-64/Makefile b/libc/sysdeps/s390/s390-64/Makefile index 0a5051449..1814f37ab 100644 --- a/libc/sysdeps/s390/s390-64/Makefile +++ b/libc/sysdeps/s390/s390-64/Makefile @@ -9,3 +9,70 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused CFLAGS-dl-load.c += -Wno-unused CFLAGS-dl-reloc.c += -Wno-unused endif + +ifeq ($(subdir),iconvdata) +ISO-8859-1_CP037_Z900-routines := iso-8859-1_cp037_z900 +ISO-8859-1_CP037_Z900-map := gconv.map + +UTF8_UTF32_Z9-routines := utf8-utf32-z9 +UTF8_UTF32_Z9-map := gconv.map + +UTF16_UTF32_Z9-routines := utf16-utf32-z9 +UTF16_UTF32_Z9-map := gconv.map + +UTF8_UTF16_Z9-routines := utf8-utf16-z9 +UTF8_UTF16_Z9-map := gconv.map + +s390x-iconv-modules = ISO-8859-1_CP037_Z900 UTF8_UTF16_Z9 UTF16_UTF32_Z9 UTF8_UTF32_Z9 + +extra-modules-left += $(s390x-iconv-modules) +include extra-module.mk + +extra-objs += $(addsuffix .so, $(s390x-iconv-modules)) +install-others += $(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) + +distribute += iso-8859-1_cp037_z900.c utf8-utf32-z9.c utf16-utf32-z9.c utf8-utf16-z9.c + +$(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) : \ +$(inst_gconvdir)/%.so: $(objpfx)%.so $(+force) + $(do-install-program) + +$(objpfx)gconv-modules-s390: gconv-modules $(+force) + cp $< $@ + echo >> $@ + echo "# S/390 hardware accelerated modules" >> $@ + echo -n "module ISO-8859-1// IBM037// " >> $@ + echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module IBM037// ISO-8859-1// " >> $@ + echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-32// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-32BE// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// INTERNAL " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16BE// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + +$(inst_gconvdir)/gconv-modules: $(objpfx)gconv-modules-s390 $(+force) + $(do-install) + +endif diff --git a/libc/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/libc/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c new file mode 100644 index 000000000..d4c4931f2 --- /dev/null +++ b/libc/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c @@ -0,0 +1,238 @@ +/* Conversion between ISO 8859-1 and IBM037. + + This module uses the Z900 variant of the Translate One To One + instruction. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. 
+ + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +// conversion table from ISO-8859-1 to IBM037 +static const unsigned char table_iso8859_1_to_cp037[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x37, [0x05] = 0x2D, [0x06] = 0x2E, [0x07] = 0x2F, + [0x08] = 0x16, [0x09] = 0x05, [0x0A] = 0x25, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x3C, [0x15] = 0x3D, [0x16] = 0x32, [0x17] = 0x26, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x3F, [0x1B] = 0x27, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x40, [0x21] = 0x5A, [0x22] = 0x7F, [0x23] = 0x7B, + [0x24] = 0x5B, [0x25] = 0x6C, [0x26] = 0x50, [0x27] = 0x7D, + [0x28] = 0x4D, [0x29] = 0x5D, [0x2A] = 0x5C, [0x2B] = 0x4E, + [0x2C] = 0x6B, [0x2D] = 0x60, [0x2E] = 0x4B, [0x2F] = 0x61, + [0x30] = 0xF0, [0x31] = 0xF1, [0x32] = 0xF2, [0x33] = 0xF3, + [0x34] = 0xF4, [0x35] = 0xF5, [0x36] = 0xF6, [0x37] = 0xF7, + [0x38] = 0xF8, [0x39] = 0xF9, [0x3A] = 0x7A, [0x3B] = 0x5E, + [0x3C] = 0x4C, [0x3D] = 0x7E, [0x3E] = 0x6E, [0x3F] = 0x6F, + [0x40] = 0x7C, [0x41] = 0xC1, [0x42] = 0xC2, [0x43] = 0xC3, + [0x44] = 0xC4, [0x45] = 0xC5, [0x46] = 0xC6, [0x47] = 0xC7, + [0x48] = 0xC8, [0x49] = 0xC9, [0x4A] = 0xD1, [0x4B] = 0xD2, + [0x4C] = 0xD3, [0x4D] = 0xD4, [0x4E] = 0xD5, [0x4F] = 0xD6, + [0x50] = 0xD7, [0x51] = 0xD8, [0x52] = 0xD9, [0x53] = 0xE2, + [0x54] = 0xE3, [0x55] = 0xE4, [0x56] = 0xE5, [0x57] = 0xE6, + [0x58] = 0xE7, [0x59] = 0xE8, [0x5A] = 0xE9, [0x5B] = 0xBA, + [0x5C] = 0xE0, [0x5D] = 0xBB, [0x5E] = 0xB0, [0x5F] = 0x6D, + [0x60] = 0x79, [0x61] = 0x81, [0x62] = 0x82, [0x63] = 0x83, + [0x64] = 0x84, [0x65] = 0x85, [0x66] = 0x86, [0x67] = 0x87, + [0x68] = 0x88, [0x69] = 0x89, [0x6A] = 0x91, [0x6B] = 0x92, + [0x6C] = 0x93, [0x6D] = 0x94, [0x6E] = 0x95, [0x6F] = 0x96, + [0x70] = 0x97, [0x71] = 0x98, [0x72] = 0x99, [0x73] = 0xA2, + [0x74] = 0xA3, [0x75] = 0xA4, [0x76] = 0xA5, [0x77] = 0xA6, + [0x78] = 0xA7, [0x79] = 0xA8, [0x7A] = 0xA9, [0x7B] = 0xC0, + [0x7C] = 0x4F, [0x7D] = 0xD0, [0x7E] = 0xA1, [0x7F] = 0x07, + [0x80] = 0x20, [0x81] = 0x21, [0x82] = 0x22, [0x83] = 0x23, + [0x84] = 0x24, [0x85] = 0x15, [0x86] = 0x06, [0x87] = 0x17, + [0x88] = 0x28, [0x89] = 0x29, [0x8A] = 0x2A, [0x8B] = 0x2B, + [0x8C] = 0x2C, [0x8D] = 0x09, [0x8E] = 0x0A, [0x8F] = 0x1B, + [0x90] = 0x30, [0x91] = 0x31, [0x92] = 0x1A, [0x93] = 0x33, + [0x94] = 0x34, [0x95] = 0x35, [0x96] = 0x36, [0x97] = 0x08, + [0x98] = 0x38, [0x99] = 0x39, [0x9A] = 0x3A, [0x9B] = 0x3B, + [0x9C] = 0x04, [0x9D] = 0x14, [0x9E] = 0x3E, [0x9F] = 0xFF, + [0xA0] = 0x41, [0xA1] = 0xAA, [0xA2] = 0x4A, [0xA3] = 0xB1, + [0xA4] = 0x9F, [0xA5] = 0xB2, 
[0xA6] = 0x6A, [0xA7] = 0xB5, + [0xA8] = 0xBD, [0xA9] = 0xB4, [0xAA] = 0x9A, [0xAB] = 0x8A, + [0xAC] = 0x5F, [0xAD] = 0xCA, [0xAE] = 0xAF, [0xAF] = 0xBC, + [0xB0] = 0x90, [0xB1] = 0x8F, [0xB2] = 0xEA, [0xB3] = 0xFA, + [0xB4] = 0xBE, [0xB5] = 0xA0, [0xB6] = 0xB6, [0xB7] = 0xB3, + [0xB8] = 0x9D, [0xB9] = 0xDA, [0xBA] = 0x9B, [0xBB] = 0x8B, + [0xBC] = 0xB7, [0xBD] = 0xB8, [0xBE] = 0xB9, [0xBF] = 0xAB, + [0xC0] = 0x64, [0xC1] = 0x65, [0xC2] = 0x62, [0xC3] = 0x66, + [0xC4] = 0x63, [0xC5] = 0x67, [0xC6] = 0x9E, [0xC7] = 0x68, + [0xC8] = 0x74, [0xC9] = 0x71, [0xCA] = 0x72, [0xCB] = 0x73, + [0xCC] = 0x78, [0xCD] = 0x75, [0xCE] = 0x76, [0xCF] = 0x77, + [0xD0] = 0xAC, [0xD1] = 0x69, [0xD2] = 0xED, [0xD3] = 0xEE, + [0xD4] = 0xEB, [0xD5] = 0xEF, [0xD6] = 0xEC, [0xD7] = 0xBF, + [0xD8] = 0x80, [0xD9] = 0xFD, [0xDA] = 0xFE, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xAD, [0xDE] = 0xAE, [0xDF] = 0x59, + [0xE0] = 0x44, [0xE1] = 0x45, [0xE2] = 0x42, [0xE3] = 0x46, + [0xE4] = 0x43, [0xE5] = 0x47, [0xE6] = 0x9C, [0xE7] = 0x48, + [0xE8] = 0x54, [0xE9] = 0x51, [0xEA] = 0x52, [0xEB] = 0x53, + [0xEC] = 0x58, [0xED] = 0x55, [0xEE] = 0x56, [0xEF] = 0x57, + [0xF0] = 0x8C, [0xF1] = 0x49, [0xF2] = 0xCD, [0xF3] = 0xCE, + [0xF4] = 0xCB, [0xF5] = 0xCF, [0xF6] = 0xCC, [0xF7] = 0xE1, + [0xF8] = 0x70, [0xF9] = 0xDD, [0xFA] = 0xDE, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0x8D, [0xFE] = 0x8E, [0xFF] = 0xDF +}; + +// conversion table from IBM037 to ISO-8859-1 +static const unsigned char table_cp037_iso8859_1[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x9C, [0x05] = 0x09, [0x06] = 0x86, [0x07] = 0x7F, + [0x08] = 0x97, [0x09] = 0x8D, [0x0A] = 0x8E, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x9D, [0x15] = 0x85, [0x16] = 0x08, [0x17] = 0x87, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x92, [0x1B] = 0x8F, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x80, [0x21] = 0x81, [0x22] = 0x82, [0x23] = 0x83, + [0x24] = 0x84, [0x25] = 0x0A, [0x26] = 0x17, [0x27] = 0x1B, + [0x28] = 0x88, [0x29] = 0x89, [0x2A] = 0x8A, [0x2B] = 0x8B, + [0x2C] = 0x8C, [0x2D] = 0x05, [0x2E] = 0x06, [0x2F] = 0x07, + [0x30] = 0x90, [0x31] = 0x91, [0x32] = 0x16, [0x33] = 0x93, + [0x34] = 0x94, [0x35] = 0x95, [0x36] = 0x96, [0x37] = 0x04, + [0x38] = 0x98, [0x39] = 0x99, [0x3A] = 0x9A, [0x3B] = 0x9B, + [0x3C] = 0x14, [0x3D] = 0x15, [0x3E] = 0x9E, [0x3F] = 0x1A, + [0x40] = 0x20, [0x41] = 0xA0, [0x42] = 0xE2, [0x43] = 0xE4, + [0x44] = 0xE0, [0x45] = 0xE1, [0x46] = 0xE3, [0x47] = 0xE5, + [0x48] = 0xE7, [0x49] = 0xF1, [0x4A] = 0xA2, [0x4B] = 0x2E, + [0x4C] = 0x3C, [0x4D] = 0x28, [0x4E] = 0x2B, [0x4F] = 0x7C, + [0x50] = 0x26, [0x51] = 0xE9, [0x52] = 0xEA, [0x53] = 0xEB, + [0x54] = 0xE8, [0x55] = 0xED, [0x56] = 0xEE, [0x57] = 0xEF, + [0x58] = 0xEC, [0x59] = 0xDF, [0x5A] = 0x21, [0x5B] = 0x24, + [0x5C] = 0x2A, [0x5D] = 0x29, [0x5E] = 0x3B, [0x5F] = 0xAC, + [0x60] = 0x2D, [0x61] = 0x2F, [0x62] = 0xC2, [0x63] = 0xC4, + [0x64] = 0xC0, [0x65] = 0xC1, [0x66] = 0xC3, [0x67] = 0xC5, + [0x68] = 0xC7, [0x69] = 0xD1, [0x6A] = 0xA6, [0x6B] = 0x2C, + [0x6C] = 0x25, [0x6D] = 0x5F, [0x6E] = 0x3E, [0x6F] = 0x3F, + [0x70] = 0xF8, [0x71] = 0xC9, [0x72] = 0xCA, [0x73] = 0xCB, + [0x74] = 0xC8, [0x75] = 0xCD, [0x76] = 0xCE, [0x77] = 0xCF, + [0x78] = 0xCC, [0x79] = 0x60, [0x7A] = 0x3A, [0x7B] = 0x23, + [0x7C] = 0x40, [0x7D] = 0x27, [0x7E] = 0x3D, [0x7F] = 0x22, + [0x80] = 0xD8, [0x81] = 0x61, 
[0x82] = 0x62, [0x83] = 0x63, + [0x84] = 0x64, [0x85] = 0x65, [0x86] = 0x66, [0x87] = 0x67, + [0x88] = 0x68, [0x89] = 0x69, [0x8A] = 0xAB, [0x8B] = 0xBB, + [0x8C] = 0xF0, [0x8D] = 0xFD, [0x8E] = 0xFE, [0x8F] = 0xB1, + [0x90] = 0xB0, [0x91] = 0x6A, [0x92] = 0x6B, [0x93] = 0x6C, + [0x94] = 0x6D, [0x95] = 0x6E, [0x96] = 0x6F, [0x97] = 0x70, + [0x98] = 0x71, [0x99] = 0x72, [0x9A] = 0xAA, [0x9B] = 0xBA, + [0x9C] = 0xE6, [0x9D] = 0xB8, [0x9E] = 0xC6, [0x9F] = 0xA4, + [0xA0] = 0xB5, [0xA1] = 0x7E, [0xA2] = 0x73, [0xA3] = 0x74, + [0xA4] = 0x75, [0xA5] = 0x76, [0xA6] = 0x77, [0xA7] = 0x78, + [0xA8] = 0x79, [0xA9] = 0x7A, [0xAA] = 0xA1, [0xAB] = 0xBF, + [0xAC] = 0xD0, [0xAD] = 0xDD, [0xAE] = 0xDE, [0xAF] = 0xAE, + [0xB0] = 0x5E, [0xB1] = 0xA3, [0xB2] = 0xA5, [0xB3] = 0xB7, + [0xB4] = 0xA9, [0xB5] = 0xA7, [0xB6] = 0xB6, [0xB7] = 0xBC, + [0xB8] = 0xBD, [0xB9] = 0xBE, [0xBA] = 0x5B, [0xBB] = 0x5D, + [0xBC] = 0xAF, [0xBD] = 0xA8, [0xBE] = 0xB4, [0xBF] = 0xD7, + [0xC0] = 0x7B, [0xC1] = 0x41, [0xC2] = 0x42, [0xC3] = 0x43, + [0xC4] = 0x44, [0xC5] = 0x45, [0xC6] = 0x46, [0xC7] = 0x47, + [0xC8] = 0x48, [0xC9] = 0x49, [0xCA] = 0xAD, [0xCB] = 0xF4, + [0xCC] = 0xF6, [0xCD] = 0xF2, [0xCE] = 0xF3, [0xCF] = 0xF5, + [0xD0] = 0x7D, [0xD1] = 0x4A, [0xD2] = 0x4B, [0xD3] = 0x4C, + [0xD4] = 0x4D, [0xD5] = 0x4E, [0xD6] = 0x4F, [0xD7] = 0x50, + [0xD8] = 0x51, [0xD9] = 0x52, [0xDA] = 0xB9, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xF9, [0xDE] = 0xFA, [0xDF] = 0xFF, + [0xE0] = 0x5C, [0xE1] = 0xF7, [0xE2] = 0x53, [0xE3] = 0x54, + [0xE4] = 0x55, [0xE5] = 0x56, [0xE6] = 0x57, [0xE7] = 0x58, + [0xE8] = 0x59, [0xE9] = 0x5A, [0xEA] = 0xB2, [0xEB] = 0xD4, + [0xEC] = 0xD6, [0xED] = 0xD2, [0xEE] = 0xD3, [0xEF] = 0xD5, + [0xF0] = 0x30, [0xF1] = 0x31, [0xF2] = 0x32, [0xF3] = 0x33, + [0xF4] = 0x34, [0xF5] = 0x35, [0xF6] = 0x36, [0xF7] = 0x37, + [0xF8] = 0x38, [0xF9] = 0x39, [0xFA] = 0xB3, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0xD9, [0xFE] = 0xDA, [0xFF] = 0x9F +}; + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ISO-8859-1//" +#define FROM_LOOP iso8859_1_to_cp037_z900 +#define TO_LOOP cp037_to_iso8859_1_z900 +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define MIN_NEEDED_FROM 1 +#define MIN_NEEDED_TO 1 + +/* The Z900 variant of troo forces us to always specify a test + character which ends the translation. So if we run into the + situation where the translation has been interrupted due to the + test character we translate the character by hand and jump back + into the instruction. */ + +#define TROO_LOOP(TABLE) \ + { \ + register const unsigned char test asm ("0") = 0; \ + register const unsigned char *pTable asm ("1") = TABLE; \ + register unsigned char *pOutput asm ("2") = outptr; \ + register uint64_t length asm ("3"); \ + const unsigned char* pInput = inptr; \ + uint64_t tmp; \ + \ + length = (inend - inptr < outend - outptr \ + ? inend - inptr : outend - outptr); \ + \ + asm volatile ("0: \n\t" \ + " troo %0,%1 \n\t" \ + " jz 1f \n\t" \ + " jo 0b \n\t" \ + " llgc %3,0(%1) \n\t" \ + " la %3,0(%3,%4) \n\t" \ + " mvc 0(1,%0),0(%3) \n\t" \ + " aghi %1,1 \n\t" \ + " aghi %0,1 \n\t" \ + " aghi %2,-1 \n\t" \ + " j 0b \n\t" \ + "1: \n" \ + \ + : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp) \ + : "a" (pTable), "d" (test) \ + : "cc"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + } + +/* First define the conversion function from ISO 8859-1 to CP037. 
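(Aside, not part of the patch: stripped of the restart protocol around the test character, the TROO_LOOP above computes the same thing as this portable sketch, a byte-for-byte translation through a 256-entry table, bounded by the smaller of the two buffers; the function name is illustrative:)

  #include <stddef.h>

  static size_t
  translate_one_to_one (const unsigned char *in, size_t inlen,
                        unsigned char *out, size_t outlen,
                        const unsigned char table[256])
  {
    size_t n = inlen < outlen ? inlen : outlen;
    for (size_t i = 0; i < n; ++i)
      out[i] = table[in[i]];
    return n;   /* number of bytes translated */
  }
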
*/ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY TROO_LOOP (table_iso8859_1_to_cp037) + +#include + + +/* Next, define the conversion function from CP037 to ISO 8859-1. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY TROO_LOOP (table_cp037_iso8859_1); + +#include + + +/* Now define the toplevel functions. */ +#include diff --git a/libc/sysdeps/s390/s390-64/utf16-utf32-z9.c b/libc/sysdeps/s390/s390-64/utf16-utf32-z9.c new file mode 100644 index 000000000..868dea68c --- /dev/null +++ b/libc/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -0,0 +1,325 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-32 big endian byte order mark. */ +#define BOM_UTF32 0x0000feffu + +/* UTF-16 big endian byte order mark. */ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 2 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf16_loop +#define TO_LOOP to_utf16_loop +#define FROM_DIRECTION (dir == from_utf16) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf16_data *) step->__data)->dir; \ + int emit_bom = ((struct utf16_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + if (dir == to_utf16) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } \ + else \ + { \ + /* Emit the UTF-32 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM_UTF32); \ + outbuf += 4; \ + } \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf16, + from_utf16 +}; + +struct utf16_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf16_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf16; + } + else if ((__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0) + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf16; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf16) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. */ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-16 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_FROM_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t u1 = get16 (inptr); \ + \ + if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff) \ + { \ + /* No surrogate. */ \ + put32 (outptr, u1); \ + inptr += 2; \ + } \ + else \ + { \ + /* It's a surrogate character. At least the first word says \ + it is. */ \ + if (__builtin_expect (inptr + 4 > inend, 0)) \ + { \ + /* We don't have enough input for another complete input \ + character. 
*/ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + inptr += 2; \ + uint16_t u2 = get16 (inptr); \ + if (__builtin_expect (u2 < 0xdc00, 0) \ + || __builtin_expect (u2 > 0xdfff, 0)) \ + { \ + /* This is no valid second word for a surrogate. */ \ + inptr -= 2; \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ + \ + put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \ + inptr += 2; \ + } \ + outptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +/* Conversion from UTF-32 internal/BE to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu42 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t c = get32 (inptr); \ + \ + if (__builtin_expect (c <= 0xd7ff, 1) \ + || (c >=0xdc00 && c <= 0xffff)) \ + { \ + /* Two UTF-16 chars. */ \ + put16 (outptr, c); \ + } \ + else if (__builtin_expect (c >= 0x10000, 1) \ + && __builtin_expect (c <= 0x10ffff, 1)) \ + { \ + /* Four UTF-16 chars. */ \ + uint16_t zabcd = ((c & 0x1f0000) >> 16) - 1; \ + uint16_t out; \ + \ + /* Generate a surrogate character. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + out = 0xd800; \ + out |= (zabcd & 0xff) << 6; \ + out |= (c >> 10) & 0x3f; \ + put16 (outptr, out); \ + outptr += 2; \ + \ + out = 0xdc00; \ + out |= c & 0x3ff; \ + put16 (outptr, out); \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + outptr += 2; \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +#include diff --git a/libc/sysdeps/s390/s390-64/utf8-utf16-z9.c b/libc/sysdeps/s390/s390-64/utf8-utf16-z9.c new file mode 100644 index 000000000..531d3ebd4 --- /dev/null +++ b/libc/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -0,0 +1,463 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-16 big endian byte order mark. 
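(Aside, not part of the patch: the 0xd7c0/0xdc00 arithmetic in the software loop above is the standard surrogate-pair decoding; 0xd7c0 is 0xd800 minus 0x40, which folds the +0x10000 offset of the supplementary planes into the high-half subtraction. A small self-checking sketch:)

  #include <assert.h>
  #include <stdint.h>

  static uint32_t
  combine_surrogates (uint16_t u1, uint16_t u2)
  {
    /* u1 in [0xd800, 0xdbff], u2 in [0xdc00, 0xdfff]. */
    return ((uint32_t) (u1 - 0xd7c0) << 10) + (u2 - 0xdc00);
  }

  int
  main (void)
  {
    /* U+1D11E MUSICAL SYMBOL G CLEF encodes as 0xd834 0xdd1e. */
    assert (combine_surrogates (0xd834, 0xdd1e) == 0x1d11e);
    return 0;
  }
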
*/ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 2 +#define MAX_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. */ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. */ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software implementation is based on the code in gconv_simple.c. 
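(Aside, not part of the patch: in the HARDWARE_CONVERT macro, the ipm instruction stores the PSW condition code in bits 28-31 of the result register, so after the shift by 28 the variable cc holds the convert instruction's condition code. The mapping the macro implements, written out as plain C using the constants from the installed gconv.h header:)

  #include <gconv.h>   /* __GCONV_OK, __GCONV_FULL_OUTPUT, __GCONV_ILLEGAL_INPUT */

  static int
  gconv_status_from_cc (unsigned int cc)
  {
    switch (cc)
      {
      case 1:  return __GCONV_FULL_OUTPUT;    /* output buffer exhausted */
      case 2:  return __GCONV_ILLEGAL_INPUT;  /* invalid input unit */
      default: return __GCONV_OK;             /* cc 0: done; cc 3 never
                                                 reaches here, it is retried
                                                 by the `jo 0b' branch */
      }
  }
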
*/ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu12 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint16_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 \ + or 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + if (cnt == 4) \ + { \ + /* For 4 byte UTF-8 chars two UTF-16 chars (high and \ + low) are needed. */ \ + uint16_t zabcd, high, low; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + /* See Principles of Operations cu12. */ \ + zabcd = (((inptr[0] & 0x7) << 2) | \ + ((inptr[1] & 0x30) >> 4)) - 1; \ + \ + /* z-bit must be zero after subtracting 1. */ \ + if (zabcd & 0x10) \ + STANDARD_FROM_LOOP_ERR_HANDLER (4) \ + \ + high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \ + high |= zabcd << 6; /* abcd bits */ \ + high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \ + high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \ + \ + low = (uint16_t)(0xdc << 8); /* low surrogate id */ \ + low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \ + low |= (inptr[2] & 0x3) << 6; /* mn bits */ \ + low |= inptr[3] & 0x3f; /* opqrst bits */ \ + \ + put16 (outptr, high); \ + outptr += 2; \ + put16 (outptr, low); \ + outptr += 2; \ + inptr += 4; \ + continue; \ + } \ + else \ + { \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint16_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + inptr += cnt; \ + \ + } \ + } \ + /* Now adjust the pointers and store the result. */ \ + *((uint16_t *) outptr) = ch; \ + outptr += sizeof (uint16_t); \ + } + +#define LOOP_NEED_FLAGS +#include + +/* Conversion from UTF-16 to UTF-8. 
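(Aside, not part of the patch: the cnt classification in the software loop above follows the usual UTF-8 lead-byte rules; 0xc0 and 0xc1 are excluded because they could only start overlong encodings. As a standalone sketch; note that the UTF-8/UTF-32 module later in this patch additionally accepts the legacy five- and six-byte forms in its software path:)

  #include <stdint.h>

  static int
  utf8_sequence_length (uint8_t lead)
  {
    if (lead < 0x80)
      return 1;                      /* ASCII */
    if (lead >= 0xc2 && lead < 0xe0)
      return 2;                      /* 0xc0/0xc1 would be overlong */
    if ((lead & 0xf0) == 0xe0)
      return 3;
    if ((lead & 0xf8) == 0xf0)
      return 4;
    return -1;                       /* continuation or invalid lead byte */
  }
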
*/ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is based on the functionality of the S/390 + hardware instruction (cu21) as described in the Principles of + Operation. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu21 %0, %1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_TO_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t c = get16 (inptr); \ + \ + if (__builtin_expect (c <= 0x007f, 1)) \ + { \ + /* Single byte UTF-8 char. */ \ + *outptr = c & 0xff; \ + outptr++; \ + } \ + else if (c >= 0x0080 && c <= 0x07ff) \ + { \ + /* Two byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= c >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= c & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (c >= 0x0800 && c <= 0xd7ff) \ + { \ + /* Three byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= c >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (c >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= c & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (c >= 0xd800 && c <= 0xdbff) \ + { \ + /* Four byte UTF-8 char. */ \ + uint16_t low, uvwxy; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + inptr += 2; \ + if (__builtin_expect (inptr + 2 > inend, 0)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + low = get16 (inptr); \ + \ + if ((low & 0xfc00) != 0xdc00) \ + { \ + inptr -= 2; \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + uvwxy = ((c >> 6) & 0xf) + 1; \ + outptr[0] = 0xf0; \ + outptr[0] |= uvwxy >> 2; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (uvwxy << 4) & 0x30; \ + outptr[1] |= (c >> 2) & 0x0f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (c & 0x03) << 4; \ + outptr[2] |= (low >> 6) & 0x0f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= low & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + inptr += 2; \ + } +#define LOOP_NEED_FLAGS +#include + +#include diff --git a/libc/sysdeps/s390/s390-64/utf8-utf32-z9.c b/libc/sysdeps/s390/s390-64/utf8-utf32-z9.c new file mode 100644 index 000000000..17ef8bc89 --- /dev/null +++ b/libc/sysdeps/s390/s390-64/utf8-utf32-z9.c @@ -0,0 +1,508 @@ +/* Conversion between UTF-8 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel + Based on the work by Ulrich Drepper , 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include +#include +#include + +/* UTF-32 big endian byte order mark. */ +#define BOM 0x0000feffu + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +/* These definitions apply to the UTF-8 to UTF-32 direction. The + software implementation for UTF-8 still supports multibyte + characters up to 6 bytes whereas the hardware variant does not. */ +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM); \ + outbuf += 4; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. */ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. 
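(Aside, not part of the patch: all three gconv_init functions in these new modules share the same shape. Reduced to its decision logic, and with illustrative names, the UTF-8/UTF-32 variant is roughly the following; a byte order mark is emitted only for the byte-order-neutral "UTF-32//" target, never for "UTF-32BE//" or INTERNAL:)

  #include <strings.h>   /* strcasecmp */

  enum direction { illegal_dir, to_utf8, from_utf8 };

  static enum direction
  classify_conversion (const char *from, const char *to, int *emit_bom)
  {
    *emit_bom = strcasecmp (to, "UTF-32//") == 0;

    if (strcasecmp (from, "ISO-10646/UTF8/") == 0
        && (strcasecmp (to, "UTF-32//") == 0
            || strcasecmp (to, "UTF-32BE//") == 0
            || strcasecmp (to, "INTERNAL") == 0))
      return from_utf8;

    if (strcasecmp (to, "ISO-10646/UTF8/") == 0
        && (strcasecmp (from, "UTF-32BE//") == 0
            || strcasecmp (from, "INTERNAL") == 0))
      return to_utf8;

    return illegal_dir;
  }
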
*/ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from gconv_simple.c. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu14 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint32_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint32_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + \ + /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ + If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ + have been represented with fewer than cnt bytes. 
*/ \ + if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ + { \ + /* This is an illegal encoding. */ \ + goto errout; \ + } \ + \ + inptr += cnt; \ + } \ + \ + /* Now adjust the pointers and store the result. */ \ + *((uint32_t *) outptr) = ch; \ + outptr += sizeof (uint32_t); \ + } +#define LOOP_NEED_FLAGS + +#define STORE_REST \ + { \ + /* We store the remaining bytes while converting them into the UCS4 \ + format. We can assume that the first byte in the buffer is \ + correct and that it requires a larger number of bytes than there \ + are in the input buffer. */ \ + wint_t ch = **inptrp; \ + size_t cnt, r; \ + \ + state->__count = inend - *inptrp; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + \ + /* The first byte is already consumed. */ \ + r = cnt - 1; \ + while (++(*inptrp) < inend) \ + { \ + ch <<= 6; \ + ch |= **inptrp & 0x3f; \ + --r; \ + } \ + \ + /* Shift for the so far missing bytes. */ \ + ch <<= r * 6; \ + \ + /* Store the number of bytes expected for the entire sequence. */ \ + state->__count |= cnt << 8; \ + \ + /* Store the value. */ \ + state->__value.__wch = ch; \ + } + +#define UNPACK_BYTES \ + { \ + static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ + wint_t wch = state->__value.__wch; \ + size_t ntotal = state->__count >> 8; \ + \ + inlen = state->__count & 255; \ + \ + bytebuf[0] = inmask[ntotal - 2]; \ + \ + do \ + { \ + if (--ntotal < inlen) \ + bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ + wch >>= 6; \ + } \ + while (ntotal > 1); \ + \ + bytebuf[0] |= wch; \ + } + +#define CLEAR_STATE \ + state->__count = 0 + +#include + +/* Conversion from UTF-32 internal/BE to UTF-8. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine mimics the S/390 cu41 instruction. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu41 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t wc = *((const uint32_t *) inptr); \ + \ + if (__builtin_expect (wc <= 0x7f, 1)) \ + { \ + /* Single UTF-8 char. */ \ + *outptr = (uint8_t)wc; \ + outptr++; \ + } \ + else if (wc <= 0x7ff) \ + { \ + /* Two UTF-8 chars. */ \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= wc >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= wc & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (wc <= 0xffff) \ + { \ + /* Three UTF-8 chars. */ \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. 
*/ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= wc >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= wc & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (wc <= 0x10ffff) \ + { \ + /* Four UTF-8 chars. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xf0; \ + outptr[0] |= wc >> 18; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 12) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (wc >> 6) & 0x3f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= wc & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include + +#include diff --git a/libc/sysdeps/unix/sysv/linux/configure b/libc/sysdeps/unix/sysv/linux/configure index 3966c3e42..4982ba51c 100644 --- a/libc/sysdeps/unix/sysv/linux/configure +++ b/libc/sysdeps/unix/sysv/linux/configure @@ -1,17 +1,6 @@ # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. inhibit_glue=yes diff --git a/libc/sysdeps/unix/sysv/linux/configure.in b/libc/sysdeps/unix/sysv/linux/configure.in index 5330e98c2..8f00407a8 100644 --- a/libc/sysdeps/unix/sysv/linux/configure.in +++ b/libc/sysdeps/unix/sysv/linux/configure.in @@ -1,19 +1,6 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - dnl We don't have to use -nostdinc. We just want one more directory - dnl to be used. - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. inhibit_glue=yes diff --git a/libc/sysdeps/unix/sysv/linux/eventfd.c b/libc/sysdeps/unix/sysv/linux/eventfd.c index 4cd557983..7f69ecdb8 100644 --- a/libc/sysdeps/unix/sysv/linux/eventfd.c +++ b/libc/sysdeps/unix/sysv/linux/eventfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
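[Editor's note: the cu41 software fallback above is a straight case split on the scalar value. As a plain-C illustration of that split (a sketch only: encode_utf8 and its buffer convention are hypothetical, and the real gconv loop additionally drives the conversion-state machinery):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical sketch of the UTF-32 -> UTF-8 case analysis in the
       TO_LOOP body above.  Returns bytes written, 0 if OUT is too small
       (cf. __GCONV_FULL_OUTPUT), or -1 for values above 0x10ffff
       (cf. STANDARD_TO_LOOP_ERR_HANDLER).  */
    static int
    encode_utf8 (uint32_t wc, unsigned char *out, size_t avail)
    {
      if (wc <= 0x7f)
        {
          if (avail < 1)
            return 0;
          out[0] = wc;                           /* one byte */
          return 1;
        }
      if (wc <= 0x7ff)
        {
          if (avail < 2)
            return 0;
          out[0] = 0xc0 | (wc >> 6);             /* two bytes */
          out[1] = 0x80 | (wc & 0x3f);
          return 2;
        }
      if (wc <= 0xffff)
        {
          if (avail < 3)
            return 0;
          out[0] = 0xe0 | (wc >> 12);            /* three bytes */
          out[1] = 0x80 | ((wc >> 6) & 0x3f);
          out[2] = 0x80 | (wc & 0x3f);
          return 3;
        }
      if (wc <= 0x10ffff)
        {
          if (avail < 4)
            return 0;
          out[0] = 0xf0 | (wc >> 18);            /* four bytes */
          out[1] = 0x80 | ((wc >> 12) & 0x3f);
          out[2] = 0x80 | ((wc >> 6) & 0x3f);
          out[3] = 0x80 | (wc & 0x3f);
          return 4;
        }
      return -1;                                 /* illegal input */
    }

Note the 0x10ffff ceiling: unlike the six-byte decode cases kept for error recovery, the encoder never emits sequences beyond plane 16.]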
The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,21 @@ #include #include #include +#include int eventfd (int count, int flags) { #ifdef __NR_eventfd2 - return INLINE_SYSCALL (eventfd2, 2, count, flags); -#else + int res = INLINE_SYSCALL (eventfd2, 2, count, flags); +# ifndef __ASSUME_EVENTFD2 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_EVENTFD2 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -43,5 +50,7 @@ eventfd (int count, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_eventfd2 +# error "__ASSUME_EVENTFD2 defined but not __NR_eventfd2" #endif } diff --git a/libc/sysdeps/unix/sysv/linux/i386/makecontext.S b/libc/sysdeps/unix/sysv/linux/i386/makecontext.S index ad2340555..5b98e64db 100644 --- a/libc/sysdeps/unix/sysv/linux/i386/makecontext.S +++ b/libc/sysdeps/unix/sysv/linux/i386/makecontext.S @@ -1,5 +1,5 @@ /* Create new context. - Copyright (C) 2001, 2002, 2005, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 2001,2002,2005,2007,2008,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 2001. @@ -105,17 +105,15 @@ L(exitcode): 1: popl %ebx addl $_GLOBAL_OFFSET_TABLE_+[.-1b], %ebx #endif - popl %eax /* This is the next context. */ - testl %eax, %eax + cmpl $0, (%esp) /* Check the next context. */ je 2f /* If it is zero exit. */ - pushl %eax call JUMPTARGET(__setcontext) /* If this returns (which can happen if the syscall fails) we'll exit the program with the return error value (-1). */ -2: pushl %eax - call HIDDEN_JUMPTARGET(exit) + movl %eax, (%esp) +2: call HIDDEN_JUMPTARGET(exit) /* The 'exit' call should never return. In case it does cause the process to terminate. */ hlt diff --git a/libc/sysdeps/unix/sysv/linux/i386/sysconf.c b/libc/sysdeps/unix/sysv/linux/i386/sysconf.c index efe1a639c..ff3cf9f7c 100644 --- a/libc/sysdeps/unix/sysv/linux/i386/sysconf.c +++ b/libc/sysdeps/unix/sysv/linux/i386/sysconf.c @@ -138,6 +138,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known[0])) diff --git a/libc/sysdeps/unix/sysv/linux/kernel-features.h b/libc/sysdeps/unix/sysv/linux/kernel-features.h index 456251579..ff065effb 100644 --- a/libc/sysdeps/unix/sysv/linux/kernel-features.h +++ b/libc/sysdeps/unix/sysv/linux/kernel-features.h @@ -516,6 +516,8 @@ # define __ASSUME_SOCK_CLOEXEC 1 # define __ASSUME_IN_NONBLOCK 1 # define __ASSUME_PIPE2 1 +# define __ASSUME_EVENTFD2 1 +# define __ASSUME_SIGNALFD4 1 #endif /* Support for the accept4 syscall was added in 2.6.28. */ diff --git a/libc/sysdeps/unix/sysv/linux/signalfd.c b/libc/sysdeps/unix/sysv/linux/signalfd.c index 9898f2923..c2d974a45 100644 --- a/libc/sysdeps/unix/sysv/linux/signalfd.c +++ b/libc/sysdeps/unix/sysv/linux/signalfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
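[Editor's note: eventfd above and signalfd below now share one shape: try the new flags-capable syscall, and fall back to the old one only when the kernel answers ENOSYS and __ASSUME_EVENTFD2 / __ASSUME_SIGNALFD4 do not already guarantee a 2.6.27+ kernel. A minimal sketch of that shape, with hypothetical names standing in for the INLINE_SYSCALL invocations:

    #include <errno.h>

    /* Hypothetical stand-ins for INLINE_SYSCALL (eventfd2, ...) and
       INLINE_SYSCALL (eventfd, ...).  */
    extern int new_syscall (int count, int flags);
    extern int old_syscall (int count);

    int
    call_with_fallback (int count, int flags)
    {
    #ifdef HAVE_NEW_SYSCALL            /* cf. __NR_eventfd2 */
      int res = new_syscall (count, flags);
    # ifndef ASSUME_NEW_SYSCALL        /* cf. __ASSUME_EVENTFD2 */
      /* Only a missing syscall sends us to the fallback.  */
      if (res != -1 || errno != ENOSYS)
    # endif
        return res;
    #endif

    #ifndef ASSUME_NEW_SYSCALL
      if (flags != 0)
        {
          /* The old syscall cannot encode flags.  */
          errno = EINVAL;
          return -1;
        }
      return old_syscall (count);
    #elif !defined HAVE_NEW_SYSCALL
    # error "ASSUME_NEW_SYSCALL defined but not HAVE_NEW_SYSCALL"
    #endif
    }

The #error branch mirrors the guard added at the end of eventfd (): the assume-macro must never be set on a configuration that lacks the syscall number.]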
The GNU C Library is free software; you can redistribute it and/or @@ -20,14 +20,21 @@ #include #include #include +#include int signalfd (int fd, const sigset_t *mask, int flags) { #ifdef __NR_signalfd4 - return INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); -#else + int res = INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); +# ifndef __ASSUME_SIGNALFD4 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_SIGNALFD4 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -44,5 +51,7 @@ signalfd (int fd, const sigset_t *mask, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_signalfd4 +# error "__ASSUME_SIGNALFD4 defined but not __NR_signalfd4" #endif } diff --git a/libc/sysdeps/unix/sysv/linux/sys/epoll.h b/libc/sysdeps/unix/sysv/linux/sys/epoll.h index 12de0bcfe..ca1d3d045 100644 --- a/libc/sysdeps/unix/sysv/linux/sys/epoll.h +++ b/libc/sysdeps/unix/sysv/linux/sys/epoll.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2006, 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -31,7 +31,7 @@ typedef __sigset_t sigset_t; #endif -/* Flags to be passed to epoll_create2. */ +/* Flags to be passed to epoll_create1. */ enum { EPOLL_CLOEXEC = 02000000, diff --git a/libc/sysdeps/x86_64/Makefile b/libc/sysdeps/x86_64/Makefile index da8209338..e8d0285e2 100644 --- a/libc/sysdeps/x86_64/Makefile +++ b/libc/sysdeps/x86_64/Makefile @@ -4,6 +4,7 @@ long-double-fcts = yes ifeq ($(subdir),csu) sysdep_routines += hp-timing elide-routines.os += hp-timing +gen-as-const-headers += link-defines.sym endif ifeq ($(subdir),gmon) @@ -18,6 +19,11 @@ ifeq ($(subdir),elf) sysdep-dl-routines += tlsdesc dl-tlsdesc sysdep_routines += tlsdesc dl-tlsdesc sysdep-rtld-routines += tlsdesc dl-tlsdesc + +tests: $(objpfx)tst-xmmymm.out +$(objpfx)tst-xmmymm.out: ../sysdeps/x86_64/tst-xmmymm.sh $(objpfx)ld.so + @echo "Checking ld.so for SSE register use. This will take a few seconds..." + $(SHELL) -e $< $(objpfx) > $@ endif ifeq ($(subdir),csu) diff --git a/libc/sysdeps/x86_64/bits/link.h b/libc/sysdeps/x86_64/bits/link.h index 5676b7875..643a293bb 100644 --- a/libc/sysdeps/x86_64/bits/link.h +++ b/libc/sysdeps/x86_64/bits/link.h @@ -65,10 +65,19 @@ __END_DECLS /* Registers for entry into PLT on x86-64. */ # if __GNUC_PREREQ (4,0) typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16))); +typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32))); # else typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__))); # endif +typedef union +{ +# if __GNUC_PREREQ (4,0) + La_x86_64_ymm ymm[2]; +# endif + La_x86_64_xmm xmm[4]; +} La_x86_64_vector __attribute__ ((aligned(16))); + typedef struct La_x86_64_regs { uint64_t lr_rdx; @@ -80,6 +89,7 @@ typedef struct La_x86_64_regs uint64_t lr_rbp; uint64_t lr_rsp; La_x86_64_xmm lr_xmm[8]; + La_x86_64_vector lr_vector[8]; } La_x86_64_regs; /* Return values for calls from PLT on x86-64. 
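(The structures carrying PLT arguments and return values both grow trailing La_x86_64_vector members so AVX-sized values survive the audit round trip; appending them after lr_xmm keeps the existing field offsets stable for audit modules built against the old layout.)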
*/ @@ -91,6 +101,8 @@ typedef struct La_x86_64_retval La_x86_64_xmm lrv_xmm1; long double lrv_st0; long double lrv_st1; + La_x86_64_vector lrv_vector0; + La_x86_64_vector lrv_vector1; } La_x86_64_retval; diff --git a/libc/sysdeps/x86_64/cacheinfo.c b/libc/sysdeps/x86_64/cacheinfo.c index 362687c18..75b81958d 100644 --- a/libc/sysdeps/x86_64/cacheinfo.c +++ b/libc/sysdeps/x86_64/cacheinfo.c @@ -25,6 +25,17 @@ #ifdef USE_MULTIARCH # include "multiarch/init-arch.h" + +# define is_intel __cpu_features.kind == arch_kind_intel +# define is_amd __cpu_features.kind == arch_kind_amd +# define max_cpuid __cpu_features.max_cpuid +#else + /* This spells out "GenuineIntel". */ +# define is_intel \ + ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 + /* This spells out "AuthenticAMD". */ +# define is_amd \ + ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif static const struct intel_02_cache_info @@ -100,6 +111,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0])) @@ -152,6 +166,12 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, /* Intel reused this value. For family 15, model 6 it specifies the 3rd level cache. Otherwise the 2nd level cache. */ + unsigned int family; + unsigned int model; +#ifdef USE_MULTIARCH + family = __cpu_features.family; + model = __cpu_features.model; +#else unsigned int eax; unsigned int ebx; unsigned int ecx; @@ -160,9 +180,10 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1)); - unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); - unsigned int model = ((((eax >>16) & 0xf) << 4) - + ((eax >> 4) & 0xf)); + family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); + model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf); +#endif + if (family == 15 && model == 6) { /* The level 3 cache is encoded for this model like @@ -394,21 +415,24 @@ long int attribute_hidden __cache_sysconf (int name) { +#ifdef USE_MULTIARCH + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); +#else /* Find out what brand of processor. */ - unsigned int eax; + unsigned int max_cpuid; unsigned int ebx; unsigned int ecx; unsigned int edx; asm volatile ("cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); +#endif - /* This spells out "GenuineIntel". */ - if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) - return handle_intel (name, eax); + if (is_intel) + return handle_intel (name, max_cpuid); - /* This spells out "AuthenticAMD". */ - if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) + if (is_amd) return handle_amd (name); // XXX Fill in more vendors. 
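[Editor's note: the is_intel / is_amd macros above are the cpuid leaf-0 vendor test spelled as raw register constants; leaf 0 also returns the highest supported leaf in EAX, which is what gets captured as max_cpuid. A standalone C sketch of the same probe (GCC-style inline asm on x86-64 assumed):

    #include <string.h>

    /* Minimal sketch of the cpuid leaf-0 vendor test behind is_intel /
       is_amd.  Returns 1 for Intel, 2 for AMD, 0 otherwise.  */
    static int
    cpu_vendor (void)
    {
      unsigned int eax, ebx, ecx, edx;
      char vendor[13];

      asm volatile ("cpuid"
                    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                    : "0" (0));

      /* The 12-byte vendor string is laid out EBX, EDX, ECX.  */
      memcpy (vendor, &ebx, 4);
      memcpy (vendor + 4, &edx, 4);
      memcpy (vendor + 8, &ecx, 4);
      vendor[12] = '\0';

      if (strcmp (vendor, "GenuineIntel") == 0)
        return 1;
      if (strcmp (vendor, "AuthenticAMD") == 0)
        return 2;
      return 0;
    }

Comparing the three registers directly against immediates, as the macros do, avoids assembling the string at all.]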
@@ -457,20 +481,11 @@ init_cacheinfo (void) #ifdef USE_MULTIARCH if (__cpu_features.kind == arch_kind_unknown) __init_cpu_features (); -# define is_intel __cpu_features.kind == arch_kind_intel -# define is_amd __cpu_features.kind == arch_kind_amd -# define max_cpuid __cpu_features.max_cpuid #else int max_cpuid; asm volatile ("cpuid" : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); - /* This spells out "GenuineIntel". */ -# define is_intel \ - ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 - /* This spells out "AuthenticAMD". */ -# define is_amd \ - ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif if (is_intel) diff --git a/libc/sysdeps/x86_64/dl-trampoline.S b/libc/sysdeps/x86_64/dl-trampoline.S index d8d9bc12a..49d239f07 100644 --- a/libc/sysdeps/x86_64/dl-trampoline.S +++ b/libc/sysdeps/x86_64/dl-trampoline.S @@ -17,7 +17,9 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#include #include +#include .text .globl _dl_runtime_resolve @@ -89,25 +91,85 @@ _dl_runtime_profile: /* Actively align the La_x86_64_regs structure. */ andq $0xfffffffffffffff0, %rsp - subq $192, %rsp # sizeof(La_x86_64_regs) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers + to detect if any xmm0-xmm7 registers are changed by audit + module. */ + subq $(LR_SIZE + XMM_SIZE*8), %rsp +# else + subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs) +# endif movq %rsp, 24(%rbx) - movq %rdx, (%rsp) # Fill the La_x86_64_regs structure. - movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %rcx, 24(%rsp) - movq %rsi, 32(%rsp) - movq %rdi, 40(%rsp) - movq %rbp, 48(%rsp) + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) + leaq 48(%rbx), %rax - movq %rax, 56(%rsp) - movaps %xmm0, 64(%rsp) - movaps %xmm1, 80(%rsp) - movaps %xmm2, 96(%rsp) - movaps %xmm3, 112(%rsp) - movaps %xmm4, 128(%rsp) - movaps %xmm5, 144(%rsp) - movaps %xmm7, 160(%rsp) + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatility for existing + audit modules. */ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +# ifdef HAVE_AVX_SUPPORT + .data +L(have_avx): + .zero 4 + .size L(have_avx), 4 + .previous + + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx1) + + /* This is to support AVX audit modules. 
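(The full 256-bit ymm0-ymm7 are saved with unaligned vmovdqu stores, and a second pristine copy of their xmm halves is stashed above LR_SIZE so the restore path can tell, via vpcmpeqq/vpmovmskb, whether the audit hooks touched the registers before deciding which saved values to reload.)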
*/ + vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) + vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. */ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) + +L(no_avx1): +# endif movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. movq 48(%rbx), %rdx # Load return address if needed. @@ -119,27 +181,87 @@ _dl_runtime_profile: movq %rax, %r11 # Save return value. movq 8(%rbx), %rax # Get back register content. - movq (%rsp), %rdx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movaps 64(%rsp), %xmm0 - movaps 80(%rsp), %xmm1 - movaps 96(%rsp), %xmm2 - movaps 112(%rsp), %xmm3 - movaps 128(%rsp), %xmm4 - movaps 144(%rsp), %xmm5 - movaps 160(%rsp), %xmm7 - + movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx2) + + /* Check if any xmm0-xmm7 registers are changed by audit + module. */ + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 + +L(no_avx2): +1: +# endif movq 16(%rbx), %r10 # Anything in framesize? testq %r10, %r10 - jns 1f + jns 3f /* There's nothing in the frame size, so there will be no call to the _dl_call_pltexit. 
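(A negative framesize returned by _dl_profile_fixup means no la_pltexit hook is registered, so the trampoline restores the argument registers and tail-jumps straight to the resolved function.)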
*/ - movq 24(%rsp), %rcx # Get back registers content. - movq 32(%rsp), %rsi - movq 40(%rsp), %rdi + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi movq %rbx, %rsp movq (%rsp), %rbx @@ -151,7 +273,7 @@ _dl_runtime_profile: cfi_adjust_cfa_offset(-48) jmp *%r11 # Jump to function address. -1: +3: cfi_adjust_cfa_offset(48) cfi_rel_offset(%rbx, 0) cfi_def_cfa_register(%rbx) @@ -161,7 +283,7 @@ _dl_runtime_profile: temporary buffer of the size specified by the 'framesize' returned from _dl_profile_fixup */ - leaq 56(%rbx), %rsi # stack + leaq LR_RSP_OFFSET(%rbx), %rsi # stack addq $8, %r10 andq $0xfffffffffffffff0, %r10 movq %r10, %rcx @@ -183,31 +305,80 @@ _dl_runtime_profile: _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, so we just need to allocate the sizeof(La_x86_64_retval) space on the stack, since the alignment has already been taken care of. */ - - subq $80, %rsp # sizeof(La_x86_64_retval) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + subq $(LRV_SIZE + XMM_SIZE*2), %rsp +# else + subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) +# endif movq %rsp, %rcx # La_x86_64_retval argument to %rcx. - movq %rax, (%rcx) # Fill in the La_x86_64_retval structure. - movq %rdx, 8(%rcx) - movaps %xmm0, 16(%rcx) - movaps %xmm1, 32(%rcx) - fstpt 48(%rcx) - fstpt 64(%rcx) + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx3) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) + vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. */ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) + +L(no_avx3): +# endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. movq 40(%rbx), %rsi # Copy args pushed by PLT in register. movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index call _dl_call_pltexit - movq (%rsp), %rax # Restore return registers. - movq 8(%rsp), %rdx - movaps 16(%rsp), %xmm0 - movaps 32(%rsp), %xmm1 - fldt 64(%rsp) - fldt 48(%rsp) + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx4) + + /* Check if xmm0/xmm1 registers are changed by audit module. 
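(This is the same vpcmpeqq/vpmovmskb comparison used for the argument registers on the pltenter path, applied here to the two vector return registers after _dl_call_pltexit has run.)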
*/ + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 + +1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 + +L(no_avx4): +1: +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) movq %rbx, %rsp - movq (%rsp), %rbx + movq (%rsp), %rbx cfi_restore(rbx) cfi_def_cfa_register(%rsp) diff --git a/libc/sysdeps/x86_64/elf/configure b/libc/sysdeps/x86_64/elf/configure index 24eff6284..79e45a758 100755 --- a/libc/sysdeps/x86_64/elf/configure +++ b/libc/sysdeps/x86_64/elf/configure @@ -47,3 +47,28 @@ cat >>confdefs.h <<\_ACEOF #define PI_STATIC_AND_HIDDEN 1 _ACEOF + +echo "$as_me:$LINENO: checking for AVX support" >&5 +echo $ECHO_N "checking for AVX support... $ECHO_C" >&6 +if test "${libc_cv_cc_avx+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +echo "${ECHO_T}$libc_cv_cc_avx" >&6 +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/libc/sysdeps/x86_64/elf/configure.in b/libc/sysdeps/x86_64/elf/configure.in index 9cb59d009..14d187530 100644 --- a/libc/sysdeps/x86_64/elf/configure.in +++ b/libc/sysdeps/x86_64/elf/configure.in @@ -32,3 +32,14 @@ fi dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) + +dnl Check if -mavx works. 
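[Editor's note: HAVE_AVX_SUPPORT, defined by the configure check here, only says the compiler accepts -mavx; whether AVX is actually present is decided at run time, which is what the cpuid sequence caching L(have_avx) in dl-trampoline.S does. A C rendering of that runtime probe (a sketch only; the real code keeps the flag in a .data word, and as of this revision it does not yet consult OSXSAVE/xgetbv):

    /* Sketch of the runtime test cached in L(have_avx) above:
       CPUID.1:ECX bit 28 signals AVX.  */
    static int have_avx;            /* 0 = unknown, 1 = yes, -1 = no */

    static int
    avx_usable (void)
    {
      if (have_avx == 0)
        {
          unsigned int eax, ebx, ecx, edx;
          asm volatile ("cpuid"
                        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                        : "0" (1));
          have_avx = (ecx & (1u << 28)) != 0 ? 1 : -1;
        }
      return have_avx > 0;
    }
]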
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl +if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi]) +if test $libc_cv_cc_avx = yes; then + AC_DEFINE(HAVE_AVX_SUPPORT) +fi diff --git a/libc/sysdeps/x86_64/link-defines.sym b/libc/sysdeps/x86_64/link-defines.sym new file mode 100644 index 000000000..1694d883a --- /dev/null +++ b/libc/sysdeps/x86_64/link-defines.sym @@ -0,0 +1,28 @@ +#include "link.h" +#include + +-- +VECTOR_SIZE sizeof (La_x86_64_vector) +XMM_SIZE sizeof (La_x86_64_xmm) + +LR_SIZE sizeof (struct La_x86_64_regs) +LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx) +LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8) +LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9) +LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx) +LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi) +LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi) +LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp) +LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp) +LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm) +LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector) + +LRV_SIZE sizeof (struct La_x86_64_retval) +LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax) +LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx) +LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0) +LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1) +LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1) +LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0) +LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1) diff --git a/libc/sysdeps/x86_64/memcmp.S b/libc/sysdeps/x86_64/memcmp.S new file mode 100644 index 000000000..a9fe13ae5 --- /dev/null +++ b/libc/sysdeps/x86_64/memcmp.S @@ -0,0 +1,359 @@ +/* memcmp with SSE2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
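(The remaining length in r10 is consumed one set bit at a time: a 1-, 2-, 4- and 8-byte compare for bits 0-3, then a single unaligned 16-byte SSE2 compare. Since rsi was earlier replaced by the difference of the two pointers, one register addresses both buffers throughout.)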
*/ +L(small): + testq $1, %r10 + jz L(s2b) + movzbl (%rdi), %eax + movzbl (%rdi, %rsi), %edx + subq $1, %r10 + je L(finz1) + addq $1, %rdi + subl %edx, %eax + jnz L(exit) +L(s2b): + testq $2, %r10 + jz L(s4b) + movzwl (%rdi), %eax + movzwl (%rdi, %rsi), %edx + subq $2, %r10 + je L(fin2_7) + addq $2, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s4b): + testq $4, %r10 + jz L(s8b) + movl (%rdi), %eax + movl (%rdi, %rsi), %edx + subq $4, %r10 + je L(fin2_7) + addq $4, %rdi + cmpl %edx, %eax + jnz L(fin2_7) +L(s8b): + testq $8, %r10 + jz L(s16b) + movq (%rdi), %rax + movq (%rdi, %rsi), %rdx + subq $8, %r10 + je L(fin2_7) + addq $8, %rdi + cmpq %rdx, %rax + jnz L(fin2_7) +L(s16b): + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + xorl %eax, %eax + subl $0xffff, %edx + jz L(finz) + bsfl %edx, %ecx + leaq (%rdi, %rcx), %rcx + movzbl (%rcx), %eax + movzbl (%rsi, %rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(finr1b): + movzbl (%rdi), %eax + movzbl (%rsi), %edx +L(finz1): + subl %edx, %eax +L(exit): + ret + + .p2align 4,, 4 +L(fin2_7): + cmpq %rdx, %rax + jz L(finz) + movq %rax, %r11 + subq %rdx, %r11 + bsfq %r11, %rcx + sarq $3, %rcx + salq $3, %rcx + sarq %cl, %rax + movzbl %al, %eax + sarq %cl, %rdx + movzbl %dl, %edx + subl %edx, %eax + ret + + .p2align 4,, 4 +L(finz): + xorl %eax, %eax + ret + + /* For blocks bigger than 32 bytes + 1. Advance one of the addr pointer to be 16B aligned. + 2. Treat the case of both addr pointers aligned to 16B + separately to avoid movdqu. + 3. Handle any blocks of greater than 64 consecutive bytes with + unrolling to reduce branches. + 4. At least one addr pointer is 16B aligned, use memory version + of pcmbeqb. + */ + .p2align 4,, 4 +L(gt32): + movq %rdx, %r11 + addq %rdi, %r11 + movq %rdi, %r8 + + andq $15, %r8 + jz L(16am) + /* Both pointers may be misaligned. */ + movdqu (%rdi), %xmm1 + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + subl $0xffff, %edx + jnz L(neq) + neg %r8 + leaq 16(%rdi, %r8), %rdi +L(16am): + /* Handle two 16B aligned pointers separately. */ + testq $15, %rsi + jz L(ATR) + testq $16, %rdi + jz L(A32) + movdqu (%rdi, %rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi +L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. 
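(Up to two more 16-byte blocks are peeled off here so rdi reaches a 64-byte boundary before the four-way-unrolled L(A64main) loop takes over.)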
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/ + .p2align 4,, 4 +END(memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) diff --git a/libc/sysdeps/x86_64/multiarch/Makefile b/libc/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb..b06640220 100644 --- a/libc/sysdeps/x86_64/multiarch/Makefile +++ b/libc/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,13 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c +ifeq (yes,$(config-cflags-sse4)) +sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +CFLAGS-strstr.c += -msse4 +CFLAGS-strcasestr.c += -msse4 +endif endif diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.c b/libc/sysdeps/x86_64/multiarch/init-arch.c index 29e687344..35fd19af0 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.c +++ b/libc/sysdeps/x86_64/multiarch/init-arch.c @@ -68,7 +68,13 @@ __init_cpu_features (void) __cpu_features.model += extended_model; } else if (__cpu_features.family == 0x06) - __cpu_features.model += extended_model; + { + __cpu_features.model += extended_model; + + if (__cpu_features.model == 0x1c) + /* Avoid SSSE3 on Atom since it is slow. */ + __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx &= ~(1 << 9); + } } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/libc/sysdeps/x86_64/multiarch/rawmemchr.S b/libc/sysdeps/x86_64/multiarch/rawmemchr.S index 93ca63163..d4f265f43 100644 --- a/libc/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/libc/sysdeps/x86_64/multiarch/rawmemchr.S @@ -77,6 +77,7 @@ __rawmemchr_sse42: # undef ENTRY # define ENTRY(name) \ .type __rawmemchr_sse2, @function; \ + .align 16; \ __rawmemchr_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/libc/sysdeps/x86_64/multiarch/rtld-strlen.S b/libc/sysdeps/x86_64/multiarch/rtld-strlen.S new file mode 100644 index 000000000..596e0549e --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/rtld-strlen.S @@ -0,0 +1 @@ +#include "../rtld-strlen.S" diff --git a/libc/sysdeps/x86_64/multiarch/stpcpy.S b/libc/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 000000000..b63d308ed --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/libc/sysdeps/x86_64/multiarch/stpncpy-c.c b/libc/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 000000000..2fde77dca --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/libc/sysdeps/x86_64/multiarch/stpncpy.S b/libc/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 000000000..ff89a8949 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/libc/sysdeps/x86_64/multiarch/strcasestr-c.c b/libc/sysdeps/x86_64/multiarch/strcasestr-c.c new file mode 100644 index 000000000..e6879531b --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcasestr-c.c @@ -0,0 +1,18 @@ +#include "init-arch.h" + +#define 
STRCASESTR __strcasestr_sse2 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strcasestr_sse2, __GI_strcasestr, __strcasestr_sse2); + +#include "string/strcasestr.c" + +extern char *__strcasestr_sse42 (const char *, const char *); + +#if 1 +libc_ifunc (__strcasestr, + HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2); +#else +libc_ifunc (__strcasestr, + 0 ? __strcasestr_sse42 : __strcasestr_sse2); +#endif diff --git a/libc/sysdeps/x86_64/multiarch/strcasestr.c b/libc/sysdeps/x86_64/multiarch/strcasestr.c new file mode 100644 index 000000000..064e3ef4f --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcasestr.c @@ -0,0 +1,3 @@ +#define USE_AS_STRCASESTR +#define STRSTR_SSE42 __strcasestr_sse42 +#include "strstr.c" diff --git a/libc/sysdeps/x86_64/multiarch/strcmp.S b/libc/sysdeps/x86_64/multiarch/strcmp.S index 2f4bf17d9..1a315737a 100644 --- a/libc/sysdeps/x86_64/multiarch/strcmp.S +++ b/libc/sysdeps/x86_64/multiarch/strcmp.S @@ -28,9 +28,9 @@ /* calculate left number to compare */ \ lea -16(%rcx, %r11), %r9; \ cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ + jb LABEL(strcmp_exitz_sse4_2); \ test %r9, %r9; \ - je LABEL(strcmp_exitz); \ + je LABEL(strcmp_exitz_sse4_2); \ mov %r9, %r11 #define STRCMP_SSE42 __strncmp_sse42 @@ -106,9 +106,9 @@ STRCMP_SSE42: */ #ifdef USE_AS_STRNCMP test %rdx, %rdx - je LABEL(strcmp_exitz) + je LABEL(strcmp_exitz_sse4_2) cmp $1, %rdx - je LABEL(Byte0) + je LABEL(Byte0_sse4_2) mov %rdx, %r11 #endif mov %esi, %ecx @@ -117,23 +117,21 @@ STRCMP_SSE42: and $0x3f, %rcx /* rsi alignment in cache line */ and $0x3f, %rax /* rdi alignment in cache line */ cmp $0x30, %ecx - ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */ cmp $0x30, %eax - ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ - movlpd (%rdi), %xmm1 - movlpd (%rsi), %xmm2 - movhpd 8(%rdi), %xmm1 - movhpd 8(%rsi), %xmm2 + ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes) /* If not, find different value or null char */ + jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */ #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) /* finish comparision */ + jbe LABEL(strcmp_exitz_sse4_2)/* finish comparision */ #endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ @@ -144,7 +142,7 @@ STRCMP_SSE42: * below to use. 
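 * (Every local label in this SSE4.2 body now carries a _sse4_2 suffix:
 * the plain SSE2 implementation in ../strcmp.S is included
 * unconditionally at the end of this file, and two copies of the same
 * L(...) names in one object would collide.)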
*/ .p2align 4 -LABEL(crosscache): +LABEL(crosscache_sse4_2): and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ mov $0xffff, %edx /* for equivalent offset */ @@ -152,15 +150,15 @@ LABEL(crosscache): and $0xf, %ecx /* offset of rsi */ and $0xf, %eax /* offset of rdi */ cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) + je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */ + ja LABEL(bigger_sse4_2) mov %edx, %r8d /* r8d is offset flag for exit tail */ xchg %ecx, %eax xchg %rsi, %rdi -LABEL(bigger): +LABEL(bigger_sse4_2): lea 15(%rax), %r9 sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 + lea LABEL(unaligned_table_sse4_2)(%rip), %r10 movslq (%r10, %r9,4), %r9 lea (%r10, %r9), %r10 jmp *%r10 /* jump to corresponding case */ @@ -171,7 +169,7 @@ LABEL(bigger): * n(0~15) n(0~15) 15(15+ n-n) ashr_0 */ .p2align 4 -LABEL(ashr_0): +LABEL(ashr_0_sse4_2): movdqa (%rsi), %xmm1 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ @@ -186,7 +184,7 @@ LABEL(ashr_0): * edx must be the same with r9d if in left byte (16-rcx) is equal to * the start from (16-rax) and no null char was seen. */ - jne LABEL(less32bytes) /* mismatch or null char */ + jne LABEL(less32bytes_sse4_2) /* mismatch or null char */ UPDATE_STRNCMP_COUNTER mov $16, %rcx mov $16, %r9 @@ -205,7 +203,7 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif movdqa (%rdi,%rdx), %xmm0 @@ -214,17 +212,17 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif jmp LABEL(ashr_0_use_sse4_2) .p2align 4 LABEL(ashr_0_use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif lea -16(%rdx, %rcx), %rcx movzbl (%rdi, %rcx), %eax @@ -241,7 +239,7 @@ LABEL(ashr_0_use_sse4_2_exit): * n(15) n -15 0(15 +(n-15) - n) ashr_1 */ .p2align 4 -LABEL(ashr_1): +LABEL(ashr_1_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -253,7 +251,7 @@ LABEL(ashr_1): shr %cl, %edx /* adjust 0xffff for offset */ shr %cl, %r9d /* adjust for 16-byte offset */ sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ + jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */ movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -281,7 +279,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -294,7 +292,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_1_use_sse4_2) @@ -320,7 +318,7 @@ LABEL(nibble_ashr_1_use_sse4_2): * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 */ .p2align 4 -LABEL(ashr_2): +LABEL(ashr_2_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -332,7 +330,7 @@ LABEL(ashr_2): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -360,7 +358,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe 
LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -373,7 +371,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_2_use_sse4_2) @@ -399,7 +397,7 @@ LABEL(nibble_ashr_2_use_sse4_2): * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 */ .p2align 4 -LABEL(ashr_3): +LABEL(ashr_3_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -411,7 +409,7 @@ LABEL(ashr_3): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -439,7 +437,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -452,7 +450,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_3_use_sse4_2) @@ -478,7 +476,7 @@ LABEL(nibble_ashr_3_use_sse4_2): * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 */ .p2align 4 -LABEL(ashr_4): +LABEL(ashr_4_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -490,7 +488,7 @@ LABEL(ashr_4): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -519,7 +517,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -532,7 +530,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_4_use_sse4_2) @@ -558,7 +556,7 @@ LABEL(nibble_ashr_4_use_sse4_2): * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 */ .p2align 4 -LABEL(ashr_5): +LABEL(ashr_5_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -570,7 +568,7 @@ LABEL(ashr_5): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -599,7 +597,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -613,7 +611,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_5_use_sse4_2) @@ -639,7 +637,7 @@ LABEL(nibble_ashr_5_use_sse4_2): * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 */ .p2align 4 -LABEL(ashr_6): +LABEL(ashr_6_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -651,7 +649,7 @@ LABEL(ashr_6): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -680,7 +678,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -693,7 +691,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_6_use_sse4_2) @@ -719,7 +717,7 @@ 
LABEL(nibble_ashr_6_use_sse4_2): * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 */ .p2align 4 -LABEL(ashr_7): +LABEL(ashr_7_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -731,7 +729,7 @@ LABEL(ashr_7): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -760,7 +758,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -773,7 +771,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_7_use_sse4_2) @@ -799,7 +797,7 @@ LABEL(nibble_ashr_7_use_sse4_2): * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 */ .p2align 4 -LABEL(ashr_8): +LABEL(ashr_8_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -811,7 +809,7 @@ LABEL(ashr_8): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -840,7 +838,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -853,7 +851,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_8_use_sse4_2) @@ -879,7 +877,7 @@ LABEL(nibble_ashr_8_use_sse4_2): * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 */ .p2align 4 -LABEL(ashr_9): +LABEL(ashr_9_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -891,7 +889,7 @@ LABEL(ashr_9): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -921,7 +919,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -934,7 +932,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_9_use_sse4_2) @@ -960,7 +958,7 @@ LABEL(nibble_ashr_9_use_sse4_2): * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 */ .p2align 4 -LABEL(ashr_10): +LABEL(ashr_10_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -972,7 +970,7 @@ LABEL(ashr_10): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1001,7 +999,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1014,7 +1012,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_10_use_sse4_2) @@ -1040,7 +1038,7 @@ LABEL(nibble_ashr_10_use_sse4_2): * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 */ .p2align 4 -LABEL(ashr_11): +LABEL(ashr_11_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1052,7 +1050,7 @@ LABEL(ashr_11): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz 
LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1081,7 +1079,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1094,7 +1092,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_11_use_sse4_2) @@ -1120,7 +1118,7 @@ LABEL(nibble_ashr_11_use_sse4_2): * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 */ .p2align 4 -LABEL(ashr_12): +LABEL(ashr_12_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1132,7 +1130,7 @@ LABEL(ashr_12): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1161,7 +1159,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1174,7 +1172,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_12_use_sse4_2) @@ -1200,7 +1198,7 @@ LABEL(nibble_ashr_12_use_sse4_2): * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 */ .p2align 4 -LABEL(ashr_13): +LABEL(ashr_13_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1212,7 +1210,7 @@ LABEL(ashr_13): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1242,7 +1240,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1255,7 +1253,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_13_use_sse4_2) @@ -1281,7 +1279,7 @@ LABEL(nibble_ashr_13_use_sse4_2): * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 */ .p2align 4 -LABEL(ashr_14): +LABEL(ashr_14_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1293,7 +1291,7 @@ LABEL(ashr_14): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1323,7 +1321,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1336,7 +1334,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_14_use_sse4_2) @@ -1362,7 +1360,7 @@ LABEL(nibble_ashr_14_use_sse4_2): * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 */ .p2align 4 -LABEL(ashr_15): +LABEL(ashr_15_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1374,7 +1372,7 @@ LABEL(ashr_15): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 @@ -1406,7 +1404,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ 
-1419,7 +1417,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_15_use_sse4_2) @@ -1441,194 +1439,53 @@ LABEL(nibble_ashr_use_sse4_2_exit): pcmpistri $0x1a,(%rsi,%rdx), %xmm0 .p2align 4 LABEL(use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add %rcx, %rdx lea -16(%rdi, %r9), %rdi movzbl (%rdi, %rdx), %eax movzbl (%rsi, %rdx), %edx test %r8d, %r8d - jz LABEL(use_sse4_2_ret) + jz LABEL(use_sse4_2_ret_sse4_2) xchg %eax, %edx -LABEL(use_sse4_2_ret): +LABEL(use_sse4_2_ret_sse4_2): sub %edx, %eax ret - .p2align 4 -LABEL(aftertail): - pcmpeqb %xmm3, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - not %edx - - .p2align 4 -LABEL(exit): - lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ -LABEL(less32bytes): +LABEL(less32bytes_sse4_2): lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ test %r8d, %r8d - jz LABEL(ret) + jz LABEL(ret_sse4_2) xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ .p2align 4 -LABEL(ret): -LABEL(less16bytes): - /* - * Check to see if BSF is fast on this processor. If not, use a different - * exit tail. - */ +LABEL(ret_sse4_2): +LABEL(less16bytes_sse4_2): bsf %rdx, %rdx /* find and store bit index in %rdx */ #ifdef USE_AS_STRNCMP sub %rdx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif - xor %ecx, %ecx /* clear %ecx */ - xor %eax, %eax /* clear %eax */ - - movb (%rsi, %rdx), %cl - movb (%rdi, %rdx), %al + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax sub %ecx, %eax ret -LABEL(strcmp_exitz): +LABEL(strcmp_exitz_sse4_2): xor %eax, %eax ret .p2align 4 -LABEL(Byte0): - /* - * never need to handle byte 0 for strncmpy -#ifdef USE_AS_STRNCMP - sub $0, %r11 - jbe LABEL(strcmp_exitz) -#endif - */ +LABEL(Byte0_sse4_2): movzx (%rsi), %ecx movzx (%rdi), %eax - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte1): - -#ifdef USE_AS_STRNCMP - sub $1, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 1(%rsi), %ecx - movzx 1(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte2): - -#ifdef USE_AS_STRNCMP - sub $2, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 2(%rsi), %ecx - movzx 2(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte3): - -#ifdef USE_AS_STRNCMP - sub $3, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 3(%rsi), %ecx - movzx 3(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte4): - -#ifdef USE_AS_STRNCMP - sub $4, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 4(%rsi), %ecx - movzx 4(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte5): - -#ifdef USE_AS_STRNCMP - sub $5, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 5(%rsi), %ecx - movzx 5(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte6): - -#ifdef USE_AS_STRNCMP - sub $6, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 6(%rsi), %ecx - movzx 6(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(next_8_bytes): - add $8, %rdi - add $8, %rsi -#ifdef USE_AS_STRNCMP - sub $8, %r11 - jbe LABEL(strcmp_exitz) -#endif - test $0x01, %dh - jnz LABEL(Byte0) - - test $0x02, %dh - jnz LABEL(Byte1) - - test $0x04, %dh - jnz LABEL(Byte2) - - test $0x08, %dh - jnz LABEL(Byte3) - - test $0x10, %dh - jnz LABEL(Byte4) - - test $0x20, %dh 
- jnz LABEL(Byte5) - - test $0x40, %dh - jnz LABEL(Byte6) - -#ifdef USE_AS_STRNCMP - sub $7, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 7(%rsi), %ecx - movzx 7(%rdi), %eax - sub %ecx, %eax ret cfi_endproc @@ -1636,29 +1493,30 @@ LABEL(next_8_bytes): /* Put all SSE 4.2 functions together. */ .section .rodata.sse4.2,"a",@progbits - .p2align 4 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) + .p2align 3 +LABEL(unaligned_table_sse4_2): + .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2) # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ + .align 16; \ STRCMP_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END @@ -1672,6 +1530,4 @@ LABEL(unaligned_table): .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 #endif -#ifndef USE_AS_STRNCMP #include "../strcmp.S" -#endif diff --git a/libc/sysdeps/x86_64/multiarch/strcpy.S b/libc/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 000000000..7e400a914 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1911 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define __GI_STRCPY __GI_stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCPY_SSE2(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret +END(STRCPY) + + .section .text.ssse3,"ax",@progbits +STRCPY_SSSE3: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to copy up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCPY + test %rdx, %rdx + jz LABEL(strncpy_exitz) + mov %rdx, %r8 +#else + xor %edx, %edx +#endif + mov %esi, %ecx + and $0xfffffffffffffff0, %rsi /* force rsi to 16-byte alignment */ + and $15, %ecx + mov %rdi, %rax /* save the return value (the original rdi) */ + + + pxor %xmm0, %xmm0 /* clear %xmm0 */ + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes at (%rsi) with %xmm0 to find a null char */ + pmovmskb %xmm0, %edx /* move the byte mask of %xmm0 to edx */ + shr %cl, %edx /* keep only the bits at or after the true start of the string */ + test %edx, %edx /* edx is 0 if there is no null char from rsi+%rcx on */ + jnz LABEL(less16bytes) + +#ifdef USE_AS_STRNCPY + lea -16(%r8,%rcx), %r11 + cmp $0, %r11 + jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes.
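+ In C terms this early-out is roughly the following sketch, with n standing for the strncpy bound in %r8 and src_off for the source misalignment in %rcx (illustrative names, not the generated code): + + if (n + src_off <= 16) + goto less16bytes; + + i.e. the bound already expires inside the first 16-byte chunk, so the byte-granular tail code can finish the whole copy.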
*/ +#endif + + mov %rcx, %r9 + or %edi, %ecx + and $15, %ecx + lea -16(%r9), %r10 + jz LABEL(ashr_0) /* ecx is 0 only if both rsi and rdi are 16-byte aligned */ + + neg %r10 /* r10 = 16 - rsi offset, the bytes left in the first aligned chunk, kept for unaligned_exit */ + + pxor %xmm0, %xmm0 /* clear %xmm0, it may have been clobbered by the unaligned operation */ + pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes at 16(%rsi) with %xmm0 to find a null char */ + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(less32bytes) + /* + * at least 16 bytes are available to fill in destination rdi + */ +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(less32bytes_strncpy_truncation) +#endif + mov (%rsi, %r9), %rdx + mov %rdx, (%rdi) + mov 8(%rsi, %r9), %rdx + mov %rdx, 8(%rdi) + + /* + * destination rdi can now be aligned to 16; re-calculate rsi to jump to the + * corresponding case + * rcx is the offset of rsi + * rax is the offset of rdi + */ + + and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ + mov %rax, %rdx /* rax holds the original rdi */ + xor %rdi, %rdx /* equal to and $15, %rdx */ +#ifdef USE_AS_STRNCPY + add %rdx, %r8 +#endif + + add $16, %rdi /* next 16 bytes for rdi */ + sub %rdx, %r9 + + lea 16(%r9, %rsi), %rsi /* re-calculate rsi by (16 - rdx) + rcx */ + mov %esi, %ecx /* store the offset of rsi */ + and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ + + and $15, %ecx /* ecx must be 0 if rdx is equal to rcx */ + jz LABEL(ashr_0) + + lea -16(%rcx), %r10 + mov %rcx, %r9 + neg %r10 + lea LABEL(unaligned_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + + /* + * The following cases will be handled by ashr_0 & ashr_0_start + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * 0 0 0 ashr_0 + * n(1~15) n(1~15) 0 ashr_0_start + * + */ + .p2align 5 +LABEL(ashr_0): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ + movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */ + add $16, %rsi + add $16, %rdi + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes at (%rsi) with %xmm0 to find a null char */ + pmovmskb %xmm0, %edx /* move the byte mask of %xmm0 to edx */ + + test %edx, %edx /* edx is 0 if there is no null char in rsi */ + jnz LABEL(aligned_16bytes) + +LABEL(ashr_0_loop): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jz LABEL(ashr_0_loop) + + jmp LABEL(aligned_exit) + .p2align 4 + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n - 15
15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe 
LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) 
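+ /* In intrinsics form, one iteration of the ashr loops is roughly the + following sketch (illustrative names s_aligned, d, i, prev, and a C + goto standing in for the branches -- not the compiled source): + + __m128i next = _mm_load_si128 ((__m128i *) (s_aligned + i + 16)); + if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (next, _mm_setzero_si128 ()))) + goto unaligned_exit; -- a null byte lies in the next chunk + __m128i merged = _mm_alignr_epi8 (next, prev, 10); -- palignr $10 + _mm_store_si128 ((__m128i *) (d + i), merged); + i += 16; prev = next; + + palignr concatenates the two adjacent source chunks and shifts right + by the source misalignment, so every 16-byte store to the destination + is aligned even though the source is not. Note also that pcmpeqb + writes its result over %xmm0, but when no null byte is found the mask + is all zero, so %xmm0 needs no re-clearing between iterations. */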
+#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + .p2align 4 + +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef 
USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) + + /* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + +/* + * + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz 
LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) + +/* + * + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) + +/* + * + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) + +/* + * + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY 
+ cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) + + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 stores original offset of rsi*/ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled*/ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /*locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /*locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): +#endif + bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) + + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %ecx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx + mov %al, (%rdi, %rcx) + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): +#ifdef USE_AS_STPCPY + cmpb $1, (%rdx) + sbb $-1, %rdx +#endif + mov %rdx, %rax + ret +#endif + .p2align 4 +LABEL(tail_0): + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_1): + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_2): + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_3): + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_4): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_5): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_6): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_7): + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_8): + + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_9): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_10): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_11): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_12): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_13): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_14): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov 
$15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + +LABEL(tail_15): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_16): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_17): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_18): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_19): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_20): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_21): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_22): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_23): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_24): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) 
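+ /* tail_24 copies its 25 bytes with four wide moves -- 8+8+8+4, the + final dword load/store overlapping the third qword at offset 21 -- + instead of a byte loop. A hedged C equivalent (illustrative names, + not the real code): + + memcpy (d, s, 8); memcpy (d + 8, s + 8, 8); + memcpy (d + 16, s + 16, 8); memcpy (d + 21, s + 21, 4); + + The same overlapping-store pattern is used by all the tail_N stubs. */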
+#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_25): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_26): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_27): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_28): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_29): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + + .p2align 4 +LABEL(tail_30): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $31, %cl + sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_31): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + cfi_endproc + .size STRCPY_SSSE3, .-STRCPY_SSSE3 + + .p2align 4 + .section .rodata.ssse3,"a",@progbits +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - 
LABEL(tail_table) + .int LABEL(tail_9) - LABEL(tail_table) + .int LABEL(tail_10) - LABEL(tail_table) + .int LABEL(tail_11) - LABEL(tail_table) + .int LABEL(tail_12) - LABEL(tail_table) + .int LABEL(tail_13) - LABEL(tail_table) + .int LABEL(tail_14) - LABEL(tail_table) + .int LABEL(tail_15) - LABEL(tail_table) + .int LABEL(tail_16) - LABEL(tail_table) + .int LABEL(tail_17) - LABEL(tail_table) + .int LABEL(tail_18) - LABEL(tail_table) + .int LABEL(tail_19) - LABEL(tail_table) + .int LABEL(tail_20) - LABEL(tail_table) + .int LABEL(tail_21) - LABEL(tail_table) + .int LABEL(tail_22) - LABEL(tail_table) + .int LABEL(tail_23) - LABEL(tail_table) + .int LABEL(tail_24) - LABEL(tail_table) + .int LABEL(tail_25) - LABEL(tail_table) + .int LABEL(tail_26) - LABEL(tail_table) + .int LABEL(tail_27) - LABEL(tail_table) + .int LABEL(tail_28) - LABEL(tail_table) + .int LABEL(tail_29) - LABEL(tail_table) + .int LABEL(tail_30) - LABEL(tail_table) + .int LABEL(tail_31) - LABEL(tail_table) + + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/libc/sysdeps/x86_64/multiarch/strcspn-c.c b/libc/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 000000000..4512267d3 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,312 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16-byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16-byte data element has the byte A at the offset X. + 2. The first 16-byte data element has EOS and doesn't have the byte A. + 3. The first 16-byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe, which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +#ifndef STRCSPN_SSE2 +# define STRCSPN_SSE2 __strcspn_sse2 +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_SSE2 (const char *, const char *); + + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + if (*a == 0) + RETURN (NULL, strlen (s)); + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return STRCSPN_SSE2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1.
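+ _mm_alignr_epi8 (mask1, mask0, offset) treats mask1:mask0 as one + 32-byte value and shifts it right by offset bytes, so the result is + the 16 bytes of the set exactly as if they had been loaded from the + unaligned address a -- a single palignr instead of an unaligned load.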
*/ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. 
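+ _mm_cmpistri (value, value, 0x3a) is the idiom used throughout this + file for strlen within a 16-byte chunk: mode 0x3a selects EQUAL_EACH + with masked negative polarity, so comparing value against itself + reports the index of the first byte at or past the terminator, or 16 + when the chunk contains no null byte (a reading of the mode bits, + stated here as an assumption).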
*/ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/libc/sysdeps/x86_64/multiarch/strcspn.S b/libc/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 000000000..cc75ab70e --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,82 @@ +/* Multiple versions of strcspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in the static library, since we + need strpbrk before initialization has happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCSPN_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcspn calls through a PLT. + The speedup we get from using SSE4.2 instructions is likely eaten away + by the indirect call in the PLT.
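+ External callers still go through the STRCSPN IFUNC entry above: ld.so + runs the @gnu_indirect_function resolver once, at relocation time, and + binds the slot to __strcspn_sse42 or __strcspn_sse2 according to the + SSE4.2 bit (ECX bit 20 of CPUID leaf 1, the bit the testl checks). + Redirecting __GI_STRCSPN to STRCSPN_SSE2 keeps libc-internal calls + free of that indirection.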
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/libc/sysdeps/x86_64/multiarch/strlen.S b/libc/sysdeps/x86_64/multiarch/strlen.S index 79e6a977e..82b03ccc2 100644 --- a/libc/sysdeps/x86_64/multiarch/strlen.S +++ b/libc/sysdeps/x86_64/multiarch/strlen.S @@ -77,6 +77,7 @@ __strlen_sse42: # undef ENTRY # define ENTRY(name) \ .type __strlen_sse2, @function; \ + .align 16; \ __strlen_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/libc/sysdeps/x86_64/multiarch/strncmp-c.c b/libc/sysdeps/x86_64/multiarch/strncmp-c.c deleted file mode 100644 index d4f74a418..000000000 --- a/libc/sysdeps/x86_64/multiarch/strncmp-c.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef SHARED -#define STRNCMP __strncmp_sse2 -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strncmp_sse2, __GI_strncmp, __strncmp_sse2); -#endif - -#include "strncmp.c" diff --git a/libc/sysdeps/x86_64/multiarch/strncpy-c.c b/libc/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 000000000..296c32cb5 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/libc/sysdeps/x86_64/multiarch/strncpy.S b/libc/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 000000000..327a4ce44 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,3 @@ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/libc/sysdeps/x86_64/multiarch/strpbrk-c.c b/libc/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 000000000..c58dcb560 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,4 @@ +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/libc/sysdeps/x86_64/multiarch/strpbrk.S b/libc/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 000000000..ed5bca6a9 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,3 @@ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/libc/sysdeps/x86_64/multiarch/strspn-c.c b/libc/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 000000000..5b99f0d38 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,284 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1.
*/ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/libc/sysdeps/x86_64/multiarch/strspn.S b/libc/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 000000000..4183a2cf6 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,63 @@ +/* Multiple versions of strspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __strspn_sse2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strspn calls through a PLT. + The speedup we get from using SSE4.2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_sse2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#include "../strspn.S" diff --git a/libc/sysdeps/x86_64/multiarch/strstr-c.c b/libc/sysdeps/x86_64/multiarch/strstr-c.c new file mode 100644 index 000000000..cff99b71e --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strstr-c.c @@ -0,0 +1,12 @@ +#include "init-arch.h" + +#define STRSTR __strstr_sse2 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); + +#include "string/strstr.c" + +extern char *__strstr_sse42 (const char *, const char *); + +libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2); diff --git a/libc/sysdeps/x86_64/multiarch/strstr.c b/libc/sysdeps/x86_64/multiarch/strstr.c new file mode 100644 index 000000000..76d5ad16d --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strstr.c @@ -0,0 +1,464 @@ +/* strstr with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> + +#ifndef STRSTR_SSE42 +# define STRSTR_SSE42 __strstr_sse42 +#endif + +#ifdef USE_AS_STRCASESTR +# include <ctype.h> +# include <locale/localeinfo.h> + +# define LOADBYTE(C) tolower (C) +# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2)) +#else +# define LOADBYTE(C) (C) +# define CMPBYTE(C1, C2) ((C1) == (C2)) +#endif + +/* We use 0xe ordered-compare: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ORDER + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to do the scanning and string comparison requirements of + sub-string match. In the scanning phase, we process Cflag and ECX + index to locate the first fragment match; once the first fragment + match position has been identified, we do comparison of subsequent + string fragments until we can conclude false or true match; when + concluding a false match, we may need to repeat scanning process + from next relevant offset in the target string. + + In the scanning phase we have 4 cases: + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2a 16 0 0 1 + 2b 16 0 1 0 + 2c 16 0 1 1 + + 1. No ordered-comparison match, both 16B fragments are valid, so + continue to next fragment. + 2. No ordered-comparison match, there is EOS in either fragment, + 2a. Zflg = 0, Sflg = 1, we continue + 2b. Zflg = 1, Sflg = 0, we conclude no match and return. + 2c. Zflg = 1, Sflg = 1, length determines match or no match + + In the string comparison phase, the 1st fragment match is fixed up + to produce ECX = 0. Subsequent fragment compare of nonzero index + and no match concludes a false match. + + case ECX CFlag ZFlag SFlag + 3 X 1 0 0/1 + 4a 0 1 0 0 + 4b 0 1 0 1 + 4c 0 < X 1 0 0/1 + 5 16 0 1 0 + + 3. An initial ordered-comparison fragment match, we fix up to do + subsequent string comparison + 4a. Continuation of fragment comparison of a string compare. + 4b. EOS reached in the reference string, we conclude true match and + return + 4c. String compare failed if index is nonzero, we need to go back to + scanning + 5. Failed string compare, go back to scanning + */ + +/* Fix-up of removal of unneeded data due to 16B aligned load + parameters: + value: 16B data loaded from 16B aligned address. + offset: Offset of target data address relative to 16B aligned load + address. + */ + +static __inline__ __m128i +__m128i_shift_right (__m128i value, int offset) +{ + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + return value; +}
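Editorial note: __m128i_shift_right above exists because _mm_srli_si128 takes its byte count as a compile-time immediate, which forces the 15-way switch. A scalar sketch of what every case computes (illustrative only, not the glibc code):

#include <string.h>

/* out[i] = in[i + offset] for i + offset < 16, zero otherwise --
   the byte-wise effect of _mm_srli_si128 (value, offset).  */
static void
bytes_shift_right (unsigned char out[16], const unsigned char in[16],
		   int offset)
{
  memset (out, 0, 16);
  memcpy (out, in + offset, 16 - offset);
}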
/* Simple replacement of movdqu to address 4KB boundary cross issue. + If EOS occurs within less than 16B before 4KB boundary, we don't + cross to next page. */ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu (const unsigned char * p) +{ + int offset = ((size_t) p & (16 - 1)); + + if (offset && (int) ((size_t) p & 0xfff) > 0xff0) + { + __m128i a = _mm_load_si128 ((__m128i *) (p - offset)); + __m128i zero = _mm_setzero_si128 (); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero)); + if ((bmsk >> offset) != 0) + return __m128i_shift_right (a, offset); + } + return _mm_loadu_si128 ((__m128i *) p); +} + +#ifdef USE_AS_STRCASESTR + +/* Similar to __m128i_strloadu. Convert to lower case for POSIX/C + locale. */ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu_tolower_posix (const unsigned char * p) +{ + __m128i frag = __m128i_strloadu (p); + + /* Convert frag to lower case for POSIX/C locale. */ + __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41); + __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0); + __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44); + __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1); + mask2 = _mm_sub_epi8 (mask2, u2ldelta); + return _mm_blendv_epi8 (frag, mask2, mask1); +} + +/* Similar to __m128i_strloadu. Convert to lower case for non-POSIX/C + locale. */ + +static __m128i +__attribute__ ((section (".text.sse4.2"))) +__m128i_strloadu_tolower (const unsigned char * p) +{ + union + { + char b[16]; + __m128i x; + } u; + + for (int i = 0; i < 16; i++) + if (p[i] == 0) + { + u.b[i] = 0; + break; + } + else + u.b[i] = tolower (p[i]); + + return u.x; +} +#endif + +/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP + algorithm) overlap for a fully populated 16B vector. + Input parameter: 1st 16Byte loaded from the reference string of a + strstr function. + We don't use KMP algorithm if reference string is less than 16B. + */ + +static int +__inline__ __attribute__ ((__always_inline__)) +KMP16Bovrlap (__m128i s2) +{ + __m128i b = _mm_unpacklo_epi8 (s2, s2); + __m128i a = _mm_unpacklo_epi8 (b, b); + a = _mm_shuffle_epi32 (a, 0); + b = _mm_srli_si128 (s2, sizeof (char)); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a)); + + /* _BitScanForward(&k1, bmsk); */ + int k1; + __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk)); + if (!bmsk) + return 16; + else if (bmsk == 0x7fff) + return 1; + else if (!k1) + { + /* There are at least two distinct chars in s2. If bytes 0 and 1 + are identical and the distinct value lies farther down, we can + deduce that the next byte offset to restart a full compare is + no earlier than byte 3. */ + return 3; + } + else + { + /* Byte 1 does not duplicate byte 0. */ + return k1 + 1; + } +}
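Editorial note: KMP16Bovrlap computes, with SSE shuffles, how far the search may safely skip forward after a failed match — the first position at which the fragment's first byte recurs. A scalar sketch of the same quantity, valid under the function's own precondition that the 16 bytes contain no NUL (illustrative, not the glibc code):

static int
overlap16 (const unsigned char s2[16])
{
  int k;
  for (k = 1; k < 16; k++)
    if (s2[k] == s2[0])
      break;
  if (k == 16)
    return 16;		/* first byte never recurs: skip the whole 16B */
  if (k == 1)
    {
      /* s2[1] == s2[0]: a run.  If all 16 bytes are identical only a
	 shift of 1 is safe; otherwise the code deduces 3.  */
      for (int i = 2; i < 16; i++)
	if (s2[i] != s2[0])
	  return 3;
      return 1;
    }
  return k;		/* first recurrence of s2[0] */
}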
char * +__attribute__ ((section (".text.sse4.2"))) +STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2) +{ +#define p1 s1 + const unsigned char *p2 = s2; + + if (p2[0] == '\0') + return (char *) p1; + + if (p1[0] == '\0') + return NULL; + + /* Check if p1 is only 1 byte long. */ + if (p1[1] == '\0') + return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL; + +#ifdef USE_AS_STRCASESTR + __m128i (*strloadu) (const unsigned char *); + + if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0) + strloadu = __m128i_strloadu_tolower_posix; + else + strloadu = __m128i_strloadu_tolower; +#else +# define strloadu __m128i_strloadu +#endif + + /* p1 > 1 byte long. Load up to 16 bytes of fragment. */ + __m128i frag1 = strloadu (p1); + + __m128i frag2; + if (p2[1] != '\0') + /* p2 is > 1 byte long. */ + frag2 = strloadu (p2); + else + frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0); + + /* Unsigned bytes, equal order, does frag2 have a null? */ + int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + int cmp = _mm_cmpistri (frag2, frag1, 0x0c); + int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + if (cmp_s & cmp_c) + { + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, + _mm_setzero_si128 ())); + int len; + __asm ("bsfl %[bmsk], %[len]" + : [len] "=r" (len) : [bmsk] "r" (bmsk)); + p1 += cmp; + if ((len + cmp) <= 16) + return (char *) p1; + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + if ((len + cmp) <= 16) + return (char *) p1 + cmp; + } + + if (cmp_s) + { + /* Adjust addr for 16B alignment in ensuing loop. */ + while (!cmp_z) + { + p1 += cmp; + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp + once already, this time cmp will be zero and we can exit. */ + if ((!cmp) & cmp_c) + break; + } + + if (!cmp_c) + return NULL; + + /* Since s2 is less than 16 bytes, cmp_c is a definitive + determination of full match. */ + return (char *) p1 + cmp; + } + + /* General case, s2 is at least 16 bytes or more. + First, the common case of false-match at first byte of p2. */ + const unsigned char *pt = NULL; + int kmp_fwd = 0; +re_trace: + while (!cmp_c) + { + /* frag1 has null. */ + if (cmp_z) + return NULL; + + /* frag1 has no null, advance 16 bytes. */ + p1 += 16; + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + /* Unsigned bytes, equal order, is there a partial match? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + } + + /* Next, handle initial positive match as first byte of p2. We have + a partial fragment match, make full determination until we reach + the end of s2. */ + if (!cmp) + { + if (cmp_z) + return (char *) p1; + + pt = p1; + p1 += 16; + p2 += 16; + /* Load up to 16 bytes of fragment. */ + frag2 = strloadu (p2); + } + else + { + /* Adjust 16B alignment. */ + p1 += cmp; + pt = p1; + } + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + + /* Unsigned bytes, equal order, does frag2 have a null? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + while (!(cmp | cmp_z | cmp_s)) + { + p1 += 16; + p2 += 16; + /* Load up to 16 bytes of fragment. */ + frag2 = strloadu (p2); + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + /* Unsigned bytes, equal order, does frag2 have a null? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c); + } + + /* Full determination yielded a false result, retrace s1 to next + starting position.
+ Zflg 1 0 1 0/1 + Sflg 0 1 1 0/1 + cmp na 0 0 >0 + action done done continue continue if s2 < s1 + false match retrace s1 else false + */ + + if (cmp_s & !cmp) + return (char *) pt; + if (cmp_z) + { + if (!cmp_s) + return NULL; + + /* Handle both zero and sign flag set and s1 is shorter in + length. */ + __m128i zero = _mm_setzero_si128 (); + int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2)); + int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1)); + int len; + int len1; + __asm ("bsfl %[bmsk], %[len]" + : [len] "=r" (len) : [bmsk] "r" (bmsk)); + __asm ("bsfl %[bmsk1], %[len1]" + : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1)); + if (len >= len1) + return NULL; + } + else if (!cmp) + return (char *) pt; + + /* Otherwise, we have to retrace and continue. Default of multiple + paths that need to retrace from next byte in s1. */ + p2 = s2; + frag2 = strloadu (p2); + + if (!kmp_fwd) + kmp_fwd = KMP16Bovrlap (frag2); + + /* KMP algorithm predicted overlap needs to be corrected for + partial fragment compare. */ + p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd); + + /* Since s2 is at least 16 bytes long, we're certain there is no + match. */ + if (p1[0] == '\0') + return NULL; + + /* Load up to 16 bytes of fragment. */ + frag1 = strloadu (p1); + + /* Unsigned bytes, equal order, is there a partial match? */ + cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c); + cmp = _mm_cmpistri (frag2, frag1, 0x0c); + cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c); + goto re_trace; +} diff --git a/libc/sysdeps/x86_64/rtld-memcmp.c b/libc/sysdeps/x86_64/rtld-memcmp.c new file mode 100644 index 000000000..2ee40328b --- /dev/null +++ b/libc/sysdeps/x86_64/rtld-memcmp.c @@ -0,0 +1 @@ +#include <string/memcmp.c>
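Editorial note: rtld-strchr.S below (and rtld-strlen.S further down in this patch) scans eight bytes at a time using the 0xfefefefefefefeff "magic" constant its comments describe at length. A scalar C model of that zero-byte test (illustrative; assumes the same 64-bit two's-complement arithmetic as the assembly):

#include <stdint.h>

/* Nonzero iff some byte of WORD is zero.  Mirrors the assembly:
   adding the magic constant carries out of every byte when no byte
   is zero; a zero byte swallows the carry, which either kills the
   final carry flag (the jnc test) or leaves a "hole" bit unset
   (the xor/or/inc test).  */
static int
has_zero_byte (uint64_t word)
{
  const uint64_t magic = 0xfefefefefefefeffULL;
  uint64_t sum = word + magic;
  if (sum >= word)		/* no carry out of bit 63 */
    return 1;
  return ((sum ^ word) | magic) + 1 != 0;
}

The strchr code first XORs the word with a broadcast of C, so the same test finds occurrences of C rather than NUL.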
diff --git a/libc/sysdeps/x86_64/rtld-strchr.S b/libc/sysdeps/x86_64/rtld-strchr.S new file mode 100644 index 000000000..893469797 --- /dev/null +++ b/libc/sysdeps/x86_64/rtld-strchr.S @@ -0,0 +1,291 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2002, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (BP_SYM (strchr)) + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This is done for two reasons: + 1. aligned 64-bit memory access is faster + and (more important) + 2. in the main loop we process 64 bits in one step although + we don't know the end of the string. But accessing at + 8-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 8). */ + + movq %rdi, %rdx + andl $7, %edx /* Mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + neg %edx + addl $8, %edx /* Align to 8 bytes. */ + + /* Search the first bytes directly. */ +0: movb (%rax), %cl /* load byte */ + cmpb %cl,%sil /* compare byte. */ + je 6f /* target found */ + testb %cl,%cl /* is byte NUL? */ + je 7f /* yes => return NULL */ + incq %rax /* increment pointer */ + decl %edx + jnz 0b + + +1: + /* At the moment %rsi contains C. What we need for the + algorithm is C in all bytes of the register. Avoid + operations on 16 bit words because these require a + prefix byte (and one more cycle). */ + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r9 + movzbl %sil,%edx + imul %rdx,%r9 + + movq $0xfefefefefefefeff, %r8 /* Save magic. */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of QUADWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24, etc. If one of bits 54-63 is set, there will be a carry + into bit 64 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + .p2align 4 +4: + /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + + /* Second unroll.
*/ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => restart loop */ + + +7: /* Return NULL. */ + xorl %eax, %eax + retq + + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. Note that we XORed %rcx + with the byte we're looking for, therefore the tests below look + reversed. */ + + + .p2align 4 /* Align, it's a jump target. */ +3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + subq $8,%rax /* correct pointer increment. */ + testb %cl, %cl /* is first byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is third byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is fourth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is fourth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is fifth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is fifth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is sixth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is seventh byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is seventh byte NUL? */ + je 7b /* yes => return NULL */ + + /* It must be in the eighth byte and it cannot be NUL. */ + incq %rax + +6: + nop + retq +END (BP_SYM (strchr)) + +weak_alias (BP_SYM (strchr), BP_SYM (index)) +libc_hidden_builtin_def (strchr)
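Editorial note: combining the broadcast-XOR above with the zero-byte test sketched before rtld-strchr.S gives a compact scalar model of the word-wise strchr loop (illustrative; the real code aligns to 8 bytes first so its loads cannot fault, and a common borrow-based zero test stands in here for the magic-constant one):

#include <stdint.h>
#include <string.h>

static const char *
wordwise_strchr (const char *s, int c)
{
  const uint64_t ones = 0x0101010101010101ULL;
  const uint64_t high = 0x8080808080808080ULL;
  const uint64_t cccc = (uint64_t) (unsigned char) c * ones;

  for (;; s += 8)
    {
      uint64_t w;
      memcpy (&w, s, 8);	/* assume the read stays in bounds */
      uint64_t zero = (w - ones) & ~w & high;	       /* NUL bytes */
      uint64_t match = ((w ^ cccc) - ones) & ~(w ^ cccc) & high;
      if (zero | match)		/* C or NUL somewhere in this word */
	for (int i = 0; i < 8; i++)
	  {
	    if (s[i] == (char) c)
	      return s + i;
	    if (s[i] == '\0')
	      return NULL;
	  }
    }
}

As in the assembly, checking C before NUL at each byte position is what makes strchr (s, '\0') return the terminator's address.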
diff --git a/libc/sysdeps/x86_64/rtld-strlen.S b/libc/sysdeps/x86_64/rtld-strlen.S new file mode 100644 index 000000000..fd950edaa --- /dev/null +++ b/libc/sysdeps/x86_64/rtld-strlen.S @@ -0,0 +1,139 @@ +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2002, 2003 Free Software Foundation, Inc. + Based on i486 version contributed by Ulrich Drepper . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (strlen) + movq %rdi, %rcx /* Duplicate source pointer. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => return */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + +1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ + + .p2align 4 /* Align loop. */ +4: /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL?
*/ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ +2: + subq %rdi, %rax /* compute difference to string start */ + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/libc/sysdeps/x86_64/strcmp.S b/libc/sysdeps/x86_64/strcmp.S index 119b88e40..340a64ba3 100644 --- a/libc/sysdeps/x86_64/strcmp.S +++ b/libc/sysdeps/x86_64/strcmp.S @@ -1,8 +1,10 @@ /* Highly optimized version for x86-64. - Copyright (C) 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Based on i686 version contributed by Ulrich Drepper , 1999. + Updated with SSE2 support contributed by Intel Corporation. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -24,8 +26,35 @@ #include "bp-sym.h" #include "bp-asm.h" - .text -ENTRY (BP_SYM (strcmp)) +#undef UPDATE_STRNCMP_COUNTER + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#else +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# endif +#endif + + .text +ENTRY (BP_SYM (STRCMP)) +#ifdef NOT_IN_libc +/* Simple version since we can't use SSE registers in ld.so. */ L(oop): movb (%rdi), %al cmpb (%rsi), %al jne L(neq) @@ -41,5 +70,1914 @@ L(neq): movl $1, %eax movl $-1, %ecx cmovbl %ecx, %eax ret -END (BP_SYM (strcmp)) -libc_hidden_builtin_def (strcmp) +END (BP_SYM (STRCMP)) +#else /* NOT_IN_libc */ +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same as r9d if the bytes from (16 - rcx) on are + * equal to those from (16 - rax) on and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + jmp LABEL(loop_ashr_0)
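Editorial note: each gobble_ashr_N loop in the cases that follow rebuilds an unaligned 16-byte window of the first string from two aligned loads with psrldq/pslldq/por. The byte-level effect, as a scalar sketch (illustrative only):

#include <stdint.h>

/* out = (prev >> shift) | (curr << (16 - shift)) at byte granularity:
   the last (16 - shift) bytes of PREV followed by the first SHIFT
   bytes of CURR -- what psrldq $shift / pslldq $(16-shift) / por
   compute each iteration.  */
static void
merge16 (uint8_t out[16], const uint8_t prev[16], const uint8_t curr[16],
	 int shift)
{
  for (int i = 0; i < 16; i++)
    out[i] = (i + shift < 16) ? prev[i + shift] : curr[i + shift - 16];
}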
+ +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char */ + +#ifdef USE_AS_STRNCMP + cmp $14, %r11 + jbe LABEL(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* subtract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once we find a null char, determine if there is a string mismatch + * before the null char. + */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail)
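Editorial note: the %r10 bookkeeping each ashr_N case sets up ("lea N(%rdi), %r10; and $0xfff; sub $0x1000") is a page-cross guard: it goes positive exactly when the next aligned 16-byte load of the shifted stream would touch the following 4K page, diverting to the byte-wise nibble_ashr_N path. A scalar model (illustrative, under the same 4K-page assumption):

#include <stdint.h>

/* After ITER completed 16-byte steps, would the next aligned load of
   the stream at P + SHIFT reach into the next 4K page?  Mirrors
   "add $16, %r10; jg nibble_ashr_N".  */
static int
crosses_page (const char *p, int shift, int iter)
{
  long r10 = (long) (((uintptr_t) (p + shift)) & 0xfff) - 0x1000;
  return r10 + 16L * (iter + 1) > 0;
}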
+ +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %r11 + jbe LABEL(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble.
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %r11 + jbe LABEL(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %r11 + jbe LABEL(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %r11 + jbe LABEL(ashr_5_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %r11 + jbe LABEL(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+/*
+ * The following cases will be handled by ashr_7
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(9~15)             n - 9            6(15 +(n - 9) - n)     ashr_7
+ */
+        .p2align 4
+LABEL(ashr_7):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $9, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $7, %r9d       /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     7(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_7):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_7)
+
+LABEL(gobble_ashr_7):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $7, %xmm3
+        pslldq  $9, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_7)   /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $7, %xmm3
+        pslldq  $9, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_7)
+
+        .p2align 4
+LABEL(nibble_ashr_7):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xff80, %edx
+        jnz     LABEL(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $8, %r11
+        jbe     LABEL(ashr_7_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_7)
+
+        .p2align 4
+LABEL(ashr_7_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $7, %xmm0
+        psrldq  $7, %xmm3
+        jmp     LABEL(aftertail)
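
The %r10 setup that precedes every loop_ashr_N (lea N(%rdi), %r10; and $0xfff; sub $0x1000) records the negated distance from the shifted source pointer to the end of its 4K page. Each 16-byte step then adds 16, so %r10 turning positive (jg) marks exactly the point where the next block would come from the following, possibly unmapped, page, and the loop must first check the tail ("nibble") for a terminating null. A scalar model of that bookkeeping (illustrative only; names invented):

    #include <stdint.h>

    /* r10 starts as (p & 0xfff) - 0x1000, the negated number of bytes
       left in p's 4K page; after adding 16 per consumed block it goes
       positive once the consumption would spill into the next page. */
    static int
    next_block_crosses_page (uintptr_t p, long blocks_done)
    {
      long r10 = (long) (p & 0xfff) - 0x1000;  /* and $0xfff; sub $0x1000 */
      r10 += 16 * (blocks_done + 1);           /* add $16, %r10 per block */
      return r10 > 0;                          /* jg LABEL(nibble_ashr_N) */
    }
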
+/*
+ * The following cases will be handled by ashr_8
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(8~15)             n - 8            7(15 +(n - 8) - n)     ashr_8
+ */
+        .p2align 4
+LABEL(ashr_8):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $8, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $8, %r9d       /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     8(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_8):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_8)
+
+LABEL(gobble_ashr_8):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $8, %xmm3
+        pslldq  $8, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_8)   /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $8, %xmm3
+        pslldq  $8, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_8)
+
+        .p2align 4
+LABEL(nibble_ashr_8):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xff00, %edx
+        jnz     LABEL(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $7, %r11
+        jbe     LABEL(ashr_8_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_8)
+
+        .p2align 4
+LABEL(ashr_8_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $8, %xmm0
+        psrldq  $8, %xmm3
+        jmp     LABEL(aftertail)
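
At the top of every ashr_N case the first 16 bytes of both operands are compared with pcmpeqb and the resulting bit masks are shifted right by %cl before being subtracted. %ecx appears to hold the source's offset within its aligned block (it is computed before the dispatch, outside this excerpt), so the shifts simply discard mask bits for byte lanes that precede the actual start of the data; only the surviving lanes can signal a real mismatch or null. A scalar sketch under that assumption (names invented):

    /* Nonzero when, ignoring the first `skip` lanes, some byte differs
       or terminates the string -- mirroring "shr %cl, %edx;
       shr %cl, %r9d; sub %r9d, %edx; jnz less32bytes". */
    static int
    mismatch_or_null (unsigned null_mask, unsigned eq_minus_null_mask,
                      unsigned skip)
    {
      return (int) ((null_mask >> skip) - (eq_minus_null_mask >> skip));
    }
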
+/*
+ * The following cases will be handled by ashr_9
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(7~15)             n - 7            8(15 +(n - 7) - n)     ashr_9
+ */
+        .p2align 4
+LABEL(ashr_9):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $7, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $9, %r9d       /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     9(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_9):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_9)
+
+LABEL(gobble_ashr_9):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $9, %xmm3
+        pslldq  $7, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_9)   /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $9, %xmm3
+        pslldq  $7, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3   /* store for next cycle */
+        jmp     LABEL(loop_ashr_9)
+
+        .p2align 4
+LABEL(nibble_ashr_9):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xfe00, %edx
+        jnz     LABEL(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $6, %r11
+        jbe     LABEL(ashr_9_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_9)
+
+        .p2align 4
+LABEL(ashr_9_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $9, %xmm0
+        psrldq  $9, %xmm3
+        jmp     LABEL(aftertail)
+/*
+ * The following cases will be handled by ashr_10
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(6~15)             n - 6            9(15 +(n - 6) - n)     ashr_10
+ */
+        .p2align 4
+LABEL(ashr_10):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $6, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $10, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     10(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_10):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_10)
+
+LABEL(gobble_ashr_10):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $10, %xmm3
+        pslldq  $6, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_10)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $10, %xmm3
+        pslldq  $6, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_10)
+
+        .p2align 4
+LABEL(nibble_ashr_10):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xfc00, %edx
+        jnz     LABEL(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $5, %r11
+        jbe     LABEL(ashr_10_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_10)
+
+        .p2align 4
+LABEL(ashr_10_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $10, %xmm0
+        psrldq  $10, %xmm3
+        jmp     LABEL(aftertail)
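
The pcmpeqb/pcmpeqb/psubb/pmovmskb sequence inside each gobble loop folds the null check and the equality check into one mask: after the psubb, a lane's top bit is set only when the bytes are equal and non-null, so a single "sub $0xffff, %edx; jnz exit" detects either a difference or end of string. A scalar model (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Returns 1 when all 16 byte lanes are equal and non-null, i.e.
       when the assembly's mask equals 0xffff and the loop continues. */
    static int
    lanes_all_equal_nonnull (const uint8_t *a, const uint8_t *b)
    {
      unsigned mask = 0;
      for (int i = 0; i < 16; ++i)
        {
          int eq   = a[i] == b[i];  /* pcmpeqb %xmm2, %xmm1 */
          int null = a[i] == 0;     /* pcmpeqb %xmm1, %xmm0 */
          /* psubb makes the lane 0xff (top bit set) only for
             eq && !null; pmovmskb collects those top bits.  */
          mask |= (unsigned) (eq && !null) << i;
        }
      return mask == 0xffff;        /* "sub $0xffff, %edx; jnz exit" */
    }
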
+/*
+ * The following cases will be handled by ashr_11
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(5~15)             n - 5            10(15 +(n - 5) - n)    ashr_11
+ */
+        .p2align 4
+LABEL(ashr_11):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $5, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $11, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     11(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_11):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_11)
+
+LABEL(gobble_ashr_11):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $11, %xmm3
+        pslldq  $5, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_11)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $11, %xmm3
+        pslldq  $5, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_11)
+
+        .p2align 4
+LABEL(nibble_ashr_11):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xf800, %edx
+        jnz     LABEL(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $4, %r11
+        jbe     LABEL(ashr_11_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_11)
+
+        .p2align 4
+LABEL(ashr_11_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $11, %xmm0
+        psrldq  $11, %xmm3
+        jmp     LABEL(aftertail)
+/*
+ * The following cases will be handled by ashr_12
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(4~15)             n - 4            11(15 +(n - 4) - n)    ashr_12
+ */
+        .p2align 4
+LABEL(ashr_12):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $4, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $12, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     12(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_12):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_12)
+
+LABEL(gobble_ashr_12):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $12, %xmm3
+        pslldq  $4, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_12)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $12, %xmm3
+        pslldq  $4, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_12)
+
+        .p2align 4
+LABEL(nibble_ashr_12):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xf000, %edx
+        jnz     LABEL(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $3, %r11
+        jbe     LABEL(ashr_12_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_12)
+
+        .p2align 4
+LABEL(ashr_12_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $12, %xmm0
+        psrldq  $12, %xmm3
+        jmp     LABEL(aftertail)
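
The constant tested in each nibble_ashr_N block follows one pattern: it is 0xffff shifted left by N and truncated to 16 bits (0xffe0 for ashr_5, 0xf000 in the ashr_12 case above, down to 0x8000 for ashr_15), because only the 16-N bytes of %xmm3 that have not yet been merged into a previous window can still hold the terminating null. The strncmp guard next to it compares %r11 against 15-N for the same reason. A one-line sketch of the constant (illustrative only):

    /* Mask of the byte lanes still unconsumed in the ashr_N tail. */
    static unsigned
    nibble_test_mask (unsigned n)      /* n = 1 .. 15 */
    {
      return (0xffffu << n) & 0xffffu; /* e.g. n=12 -> 0xf000 */
    }
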
+/*
+ * The following cases will be handled by ashr_13
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(3~15)             n - 3            12(15 +(n - 3) - n)    ashr_13
+ */
+        .p2align 4
+LABEL(ashr_13):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $3, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $13, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     13(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_13):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_13)
+
+LABEL(gobble_ashr_13):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $13, %xmm3
+        pslldq  $3, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_13)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $13, %xmm3
+        pslldq  $3, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_13)
+
+        .p2align 4
+LABEL(nibble_ashr_13):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xe000, %edx
+        jnz     LABEL(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $2, %r11
+        jbe     LABEL(ashr_13_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_13)
+
+        .p2align 4
+LABEL(ashr_13_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $13, %xmm0
+        psrldq  $13, %xmm3
+        jmp     LABEL(aftertail)
+/*
+ * The following cases will be handled by ashr_14
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(2~15)             n - 2            13(15 +(n - 2) - n)    ashr_14
+ */
+        .p2align 4
+LABEL(ashr_14):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $2, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $14, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     14(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_14):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_14)
+
+LABEL(gobble_ashr_14):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $14, %xmm3
+        pslldq  $2, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_14)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $14, %xmm3
+        pslldq  $2, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_14)
+
+        .p2align 4
+LABEL(nibble_ashr_14):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0xc000, %edx
+        jnz     LABEL(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+        cmp     $1, %r11
+        jbe     LABEL(ashr_14_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_14)
+
+        .p2align 4
+LABEL(ashr_14_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $14, %xmm0
+        psrldq  $14, %xmm3
+        jmp     LABEL(aftertail)
+/*
+ * The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)  rax(offset of rdi)   relative offset    corresponding case
+ *      n(1~15)             n - 1            14(15 +(n - 1) - n)    ashr_15
+ */
+        .p2align 4
+LABEL(ashr_15):
+        pxor    %xmm0, %xmm0
+        movdqa  (%rdi), %xmm2
+        movdqa  (%rsi), %xmm1
+        pcmpeqb %xmm1, %xmm0
+        pslldq  $1, %xmm2
+        pcmpeqb %xmm1, %xmm2
+        psubb   %xmm0, %xmm2
+        pmovmskb %xmm2, %r9d
+        shr     %cl, %edx
+        shr     %cl, %r9d
+        sub     %r9d, %edx
+        jnz     LABEL(less32bytes)
+
+        movdqa  (%rdi), %xmm3
+
+        UPDATE_STRNCMP_COUNTER
+
+        pxor    %xmm0, %xmm0
+        mov     $16, %rcx      /* index for loads */
+        mov     $15, %r9d      /* byte position left over from less32bytes case */
+        /*
+         * Setup %r10 value allows us to detect crossing a page boundary.
+         * When %r10 goes positive we have crossed a page boundary and
+         * need to do a nibble.
+         */
+        lea     15(%rdi), %r10
+        and     $0xfff, %r10   /* offset into 4K page */
+
+        sub     $0x1000, %r10  /* subtract 4K pagesize */
+
+        .p2align 4
+LABEL(loop_ashr_15):
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_15)
+
+LABEL(gobble_ashr_15):
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $15, %xmm3
+        pslldq  $1, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+
+        add     $16, %r10
+        jg      LABEL(nibble_ashr_15)  /* cross page boundary */
+
+        movdqa  (%rsi, %rcx), %xmm1
+        movdqa  (%rdi, %rcx), %xmm2
+        movdqa  %xmm2, %xmm4
+
+        psrldq  $15, %xmm3
+        pslldq  $1, %xmm2
+        por     %xmm3, %xmm2
+
+        pcmpeqb %xmm1, %xmm0
+        pcmpeqb %xmm2, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        sub     $0xffff, %edx
+        jnz     LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+        sub     $16, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+
+        add     $16, %rcx
+        movdqa  %xmm4, %xmm3
+        jmp     LABEL(loop_ashr_15)
+
+        .p2align 4
+LABEL(nibble_ashr_15):
+        pcmpeqb %xmm3, %xmm0   /* check nibble for null char */
+        pmovmskb %xmm0, %edx
+        test    $0x8000, %edx
+        jnz     LABEL(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+        test    %r11, %r11
+        je      LABEL(ashr_15_exittail)
+#endif
+
+        pxor    %xmm0, %xmm0
+        sub     $0x1000, %r10
+        jmp     LABEL(gobble_ashr_15)
+
+        .p2align 4
+LABEL(ashr_15_exittail):
+        movdqa  (%rsi, %rcx), %xmm1
+        psrldq  $15, %xmm3
+        psrldq  $15, %xmm0
+
+        .p2align 4
+LABEL(aftertail):
+        pcmpeqb %xmm3, %xmm1
+        psubb   %xmm0, %xmm1
+        pmovmskb %xmm1, %edx
+        not     %edx
+
+        .p2align 4
+LABEL(exit):
+        lea     -16(%r9, %rcx), %rax   /* locate the exact offset for rdi */
+LABEL(less32bytes):
+        lea     (%rdi, %rax), %rdi     /* locate the exact address for first operand(rdi) */
+        lea     (%rsi, %rcx), %rsi     /* locate the exact address for second operand(rsi) */
+        test    %r8d, %r8d
+        jz      LABEL(ret)
+        xchg    %rsi, %rdi             /* recover original order according to flag(%r8d) */
+
+        .p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+        bsf     %rdx, %rdx             /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+        sub     %rdx, %r11
+        jbe     LABEL(strcmp_exitz)
+#endif
+        movzbl  (%rsi, %rdx), %ecx
+        movzbl  (%rdi, %rdx), %eax
+
+        sub     %ecx, %eax
+        ret
+
+LABEL(strcmp_exitz):
+        xor     %eax, %eax
+        ret
+
+        .p2align 4
+LABEL(Byte0):
+        movzx   (%rsi), %ecx
+        movzx   (%rdi), %eax
+
+        sub     %ecx, %eax
+        ret
+END (BP_SYM (STRCMP))
+
+        .section .rodata,"a",@progbits
+        .p2align 3
+LABEL(unaligned_table):
+        .int    LABEL(ashr_1) - LABEL(unaligned_table)
+        .int    LABEL(ashr_2) - LABEL(unaligned_table)
+        .int    LABEL(ashr_3) - LABEL(unaligned_table)
+        .int    LABEL(ashr_4) - LABEL(unaligned_table)
+        .int    LABEL(ashr_5) - LABEL(unaligned_table)
+        .int    LABEL(ashr_6) - LABEL(unaligned_table)
+        .int    LABEL(ashr_7) - LABEL(unaligned_table)
+        .int    LABEL(ashr_8) - LABEL(unaligned_table)
+        .int    LABEL(ashr_9) - LABEL(unaligned_table)
+        .int    LABEL(ashr_10) - LABEL(unaligned_table)
+        .int    LABEL(ashr_11) - LABEL(unaligned_table)
+        .int    LABEL(ashr_12) - LABEL(unaligned_table)
+        .int    LABEL(ashr_13) - LABEL(unaligned_table)
+        .int    LABEL(ashr_14) - LABEL(unaligned_table)
+        .int    LABEL(ashr_15) - LABEL(unaligned_table)
+        .int    LABEL(ashr_0) - LABEL(unaligned_table)
+#endif /* NOT_IN_libc */
+libc_hidden_builtin_def (STRCMP)
diff --git a/libc/sysdeps/x86_64/strncmp.S b/libc/sysdeps/x86_64/strncmp.S
new file mode 100644
index 000000000..0af34e7f1
--- /dev/null
+++ b/libc/sysdeps/x86_64/strncmp.S
@@ -0,0 +1,3 @@
+#define STRCMP strncmp
+#define USE_AS_STRNCMP
+#include "strcmp.S"
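
Two design points worth noting in the code above. The unaligned_table stores 32-bit offsets relative to the table itself (.int LABEL(ashr_N) - LABEL(unaligned_table)) rather than absolute addresses, which keeps the dispatch position-independent and free of run-time relocations. And strncmp.S shows the single-source pattern glibc uses throughout: the whole strncmp implementation is strcmp.S recompiled with STRCMP and USE_AS_STRNCMP defined. A C analogue of that pattern (a minimal sketch; the function body and the name strcmp_sketch are invented, only the macro scheme mirrors the patch):

    #include <stddef.h>

    /* A second file would do:
         #define STRCMP strncmp_sketch
         #define USE_AS_STRNCMP
         #include "this_file.c"                                        */
    #ifndef STRCMP
    # define STRCMP strcmp_sketch
    #endif

    #ifdef USE_AS_STRNCMP
    int STRCMP (const char *a, const char *b, size_t n)
    #else
    int STRCMP (const char *a, const char *b)
    #endif
    {
    #ifdef USE_AS_STRNCMP
      if (n == 0)
        return 0;                 /* mirrors the strcmp_exitz path */
    #endif
      while (*a && *a == *b)
        {
    #ifdef USE_AS_STRNCMP
          if (--n == 0)
            return 0;
    #endif
          ++a, ++b;
        }
      return (unsigned char) *a - (unsigned char) *b;
    }
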
diff --git a/libc/sysdeps/x86_64/tst-xmmymm.sh b/libc/sysdeps/x86_64/tst-xmmymm.sh
new file mode 100755
index 000000000..a576e7da0
--- /dev/null
+++ b/libc/sysdeps/x86_64/tst-xmmymm.sh
@@ -0,0 +1,79 @@
+#! /bin/bash
+objpfx="$1"
+
+tmp=$(mktemp ${objpfx}tst-xmmymm.XXXXXX)
+trap 'rm -f "$tmp"' 1 2 3 15
+
+# List of object files we have to test
+rtldobjs=$(readelf -W -wi ${objpfx}dl-allobjs.os |
+           awk '/^ "$tmp"
+declare -a objects
+objects=($(cat "$tmp"))
+
+objs="dl-runtime.os"
+tocheck="dl-runtime.os"
+
+while test -n "$objs"; do
+  this="$objs"
+  objs=""
+
+  for f in $this; do
+    undef=$(nm -u "$objpfx"../*/"$f" | awk '{print $2}')
+    if test -n "$undef"; then
+      for s in $undef; do
+        for obj in ${objects[*]} "_GLOBAL_OFFSET_TABLE_"; do
+          if test "$obj" = "$s"; then
+            continue 2
+          fi
+        done
+        for o in $rtldobjs; do
+          ro=$(echo "$objpfx"../*/"$o")
+          if nm -g --defined-only "$ro" | egrep -qs " $s\$"; then
+            if ! (echo "$tocheck $objs" | fgrep -qs "$o"); then
+              echo "$o needed for $s"
+              objs="$objs $o"
+            fi
+            break;
+          fi
+        done
+      done
+    fi
+  done
+  tocheck="$tocheck$objs"
+done
+
+echo
+echo
+echo "object files needed: $tocheck"
+
+cp /dev/null "$tmp"
+for f in $tocheck; do
+  objdump -d "$objpfx"../*/"$f" |
+  awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' |
+  while read fct; do
+    if test "$fct" != "_dl_runtime_profile"; then
+      echo "function $fct in $f modifies xmm/ymm" >> "$tmp"
+      result=1
+    fi
+  done
+done
+
+if test -s "$tmp"; then
+  echo
+  echo
+  cat "$tmp"
+  result=1
+else
+  result=0
+fi
+
+rm "$tmp"
+exit $result
--
cgit v1.2.1