diff options
Diffstat (limited to 'libc/sysdeps/x86_64')
22 files changed, 768 insertions, 1625 deletions
diff --git a/libc/sysdeps/x86_64/fpu/libm-test-ulps b/libc/sysdeps/x86_64/fpu/libm-test-ulps index b828774c7..f190ed881 100644 --- a/libc/sysdeps/x86_64/fpu/libm-test-ulps +++ b/libc/sysdeps/x86_64/fpu/libm-test-ulps @@ -2390,6 +2390,9 @@ ifloat: 1 Test "j0 (0x1.d7ce3ap+107) == 2.775523647291230802651040996274861694514e-17": float: 2 ifloat: 2 +Test "j0 (0x1p16382) == -1.2193782500509000574176799046642541129387e-2466": +ildouble: 1 +ldouble: 1 Test "j0 (10.0) == -0.245935764451348335197760862485328754": double: 2 float: 1 @@ -2420,6 +2423,9 @@ ldouble: 1 Test "j1 (0x1.ff00000000002p+840) == 1.846591691699331493194965158699937660696e-127": double: 1 idouble: 1 +Test "j1 (0x1p16382) == 8.0839224448726336195866026476176740513439e-2467": +ildouble: 1 +ldouble: 1 Test "j1 (10.0) == 0.0434727461688614366697487680258592883": float: 2 ifloat: 2 @@ -3073,6 +3079,9 @@ double: 1 float: 1 idouble: 1 ifloat: 1 +Test "y0 (0x1p16382) == 8.0839224448726336195866026476176740513439e-2467": +ildouble: 1 +ldouble: 1 Test "y0 (1.0) == 0.0882569642156769579829267660235151628": double: 2 float: 1 @@ -3117,6 +3126,9 @@ ldouble: 1 Test "y1 (0x1p-10) == -6.5190099301063115047395187618929589514382e+02": double: 1 idouble: 1 +Test "y1 (0x1p16382) == 1.2193782500509000574176799046642541129387e-2466": +ildouble: 1 +ldouble: 1 Test "y1 (1.5) == -0.412308626973911295952829820633445323": float: 1 ifloat: 1 diff --git a/libc/sysdeps/x86_64/memset.S b/libc/sysdeps/x86_64/memset.S index f3a4d448d..b393efe44 100644 --- a/libc/sysdeps/x86_64/memset.S +++ b/libc/sysdeps/x86_64/memset.S @@ -23,7 +23,7 @@ #define __STOS_UPPER_BOUNDARY $65536 .text -#if !defined NOT_IN_libc && !defined USE_MULTIARCH +#if !defined NOT_IN_libc ENTRY(__bzero) mov %rsi,%rdx /* Adjust parameter. */ xorl %esi,%esi /* Fill with 0s. */ diff --git a/libc/sysdeps/x86_64/multiarch/Makefile b/libc/sysdeps/x86_64/multiarch/Makefile index dd6c27d0b..86787ee6e 100644 --- a/libc/sysdeps/x86_64/multiarch/Makefile +++ b/libc/sysdeps/x86_64/multiarch/Makefile @@ -10,14 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \ + strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ - memcmp-ssse3 + strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/libc/sysdeps/x86_64/multiarch/bzero.S b/libc/sysdeps/x86_64/multiarch/bzero.S deleted file mode 100644 index 88e96ea8e..000000000 --- a/libc/sysdeps/x86_64/multiarch/bzero.S +++ /dev/null @@ -1,28 +0,0 @@ -/* bzero. x86-64 version. - Copyright (C) 2010-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - .text -ENTRY(__bzero) - mov %rsi,%rdx /* Adjust parameter. */ - xorl %esi,%esi /* Fill with 0s. */ - jmp __libc_memset /* Branch to IFUNC memset. */ -END(__bzero) -weak_alias (__bzero, bzero) diff --git a/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 643cb2dd0..05315fdd7 100644 --- a/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -61,17 +61,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) - /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ - IFUNC_IMPL (i, name, __memset_chk, - IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2) - IFUNC_IMPL_ADD (array, i, __memset_chk, 1, - __memset_chk_x86_64)) - - /* Support sysdeps/x86_64/multiarch/memset.S. */ - IFUNC_IMPL (i, name, memset, - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) - IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64)) - /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */ IFUNC_IMPL (i, name, rawmemchr, IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE4_2, @@ -187,11 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strnlen.S. */ - IFUNC_IMPL (i, name, strnlen, - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf) - IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) - /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ IFUNC_IMPL (i, name, strpbrk, IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2, @@ -262,14 +246,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) - /* Support sysdeps/x86_64/multiarch/strlen.S. */ - IFUNC_IMPL (i, name, strlen, - IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2) - IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) - /* Support sysdeps/x86_64/multiarch/strncmp.S. */ IFUNC_IMPL (i, name, strncmp, IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2, diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.c b/libc/sysdeps/x86_64/multiarch/init-arch.c index 992cbfb75..7daaf4609 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.c +++ b/libc/sysdeps/x86_64/multiarch/init-arch.c @@ -58,11 +58,6 @@ __init_cpu_features (void) get_common_indeces (&family, &model); - /* Intel processors prefer SSE instruction for memory/string - routines if they are available. */ - __cpu_features.feature[index_Prefer_SSE_for_memop] - |= bit_Prefer_SSE_for_memop; - unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; unsigned int extended_family = (eax >> 20) & 0xff; unsigned int extended_model = (eax >> 12) & 0xf0; @@ -125,12 +120,6 @@ __init_cpu_features (void) ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; - /* AMD processors prefer SSE instructions for memory/string routines - if they are available, otherwise they prefer integer instructions. */ - if ((ecx & 0x200)) - __cpu_features.feature[index_Prefer_SSE_for_memop] - |= bit_Prefer_SSE_for_memop; - unsigned int eax; __cpuid (0x80000000, eax, ebx, ecx, edx); if (eax >= 0x80000001) diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.h b/libc/sysdeps/x86_64/multiarch/init-arch.h index 0aece18de..28edbf7d0 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.h +++ b/libc/sysdeps/x86_64/multiarch/init-arch.h @@ -18,7 +18,6 @@ #define bit_Fast_Rep_String (1 << 0) #define bit_Fast_Copy_Backward (1 << 1) #define bit_Slow_BSF (1 << 2) -#define bit_Prefer_SSE_for_memop (1 << 3) #define bit_Fast_Unaligned_Load (1 << 4) #define bit_Prefer_PMINUB_for_stringop (1 << 5) #define bit_AVX_Usable (1 << 6) @@ -58,7 +57,6 @@ # define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE -# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE # define index_AVX_Usable FEATURE_INDEX_1*FEATURE_SIZE @@ -157,7 +155,6 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_Fast_Rep_String FEATURE_INDEX_1 # define index_Fast_Copy_Backward FEATURE_INDEX_1 # define index_Slow_BSF FEATURE_INDEX_1 -# define index_Prefer_SSE_for_memop FEATURE_INDEX_1 # define index_Fast_Unaligned_Load FEATURE_INDEX_1 # define index_AVX_Usable FEATURE_INDEX_1 # define index_FMA_Usable FEATURE_INDEX_1 @@ -169,7 +166,6 @@ extern const struct cpu_features *__get_cpu_features (void) # define HAS_FAST_REP_STRING HAS_ARCH_FEATURE (Fast_Rep_String) # define HAS_FAST_COPY_BACKWARD HAS_ARCH_FEATURE (Fast_Copy_Backward) # define HAS_SLOW_BSF HAS_ARCH_FEATURE (Slow_BSF) -# define HAS_PREFER_SSE_FOR_MEMOP HAS_ARCH_FEATURE (Prefer_SSE_for_memop) # define HAS_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (Fast_Unaligned_Load) # define HAS_AVX HAS_ARCH_FEATURE (AVX_Usable) # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) diff --git a/libc/sysdeps/x86_64/multiarch/memset-x86-64.S b/libc/sysdeps/x86_64/multiarch/memset-x86-64.S deleted file mode 100644 index 551d105d2..000000000 --- a/libc/sysdeps/x86_64/multiarch/memset-x86-64.S +++ /dev/null @@ -1,19 +0,0 @@ -#include <sysdep.h> - -#ifndef NOT_IN_libc -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memset_chk_x86_64, @function; \ - .globl __memset_chk_x86_64; \ - .p2align 4; \ - __memset_chk_x86_64: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memset_chk_x86_64, .-__memset_chk_x86_64 - -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) -# define memset __memset_x86_64 -# include "../memset.S" -#endif diff --git a/libc/sysdeps/x86_64/multiarch/memset.S b/libc/sysdeps/x86_64/multiarch/memset.S deleted file mode 100644 index 7f673faa7..000000000 --- a/libc/sysdeps/x86_64/multiarch/memset.S +++ /dev/null @@ -1,79 +0,0 @@ -/* Multiple versions of memset - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc -ENTRY(memset) - .type memset, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __memset_x86_64(%rip), %rax - testl $bit_Prefer_SSE_for_memop, __cpu_features+FEATURE_OFFSET+index_Prefer_SSE_for_memop(%rip) - jz 2f - leaq __memset_sse2(%rip), %rax -2: ret -END(memset) - -/* Define internal IFUNC memset for bzero. */ - .globl __libc_memset - .hidden __libc_memset - __libc_memset = memset - -# define USE_SSE2 1 - -# undef ENTRY -# define ENTRY(name) \ - .type __memset_sse2, @function; \ - .globl __memset_sse2; \ - .p2align 4; \ - __memset_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memset_sse2, .-__memset_sse2 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memset_chk_sse2, @function; \ - .globl __memset_chk_sse2; \ - .p2align 4; \ - __memset_chk_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memset_chk_sse2, .-__memset_chk_sse2 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal memset calls through a PLT. - The speedup we get from using GPR instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2 -# endif - -# undef strong_alias -# define strong_alias(original, alias) -#endif - -#include "../memset.S" diff --git a/libc/sysdeps/x86_64/multiarch/memset_chk.S b/libc/sysdeps/x86_64/multiarch/memset_chk.S deleted file mode 100644 index 55e263542..000000000 --- a/libc/sysdeps/x86_64/multiarch/memset_chk.S +++ /dev/null @@ -1,44 +0,0 @@ -/* Multiple versions of __memset_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - -/* Define multiple versions only for the definition in lib. */ -#ifndef NOT_IN_libc -# ifdef SHARED -ENTRY(__memset_chk) - .type __memset_chk, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __memset_chk_x86_64(%rip), %rax - testl $bit_Prefer_SSE_for_memop, __cpu_features+FEATURE_OFFSET+index_Prefer_SSE_for_memop(%rip) - jz 2f - leaq __memset_chk_sse2(%rip), %rax -2: ret -END(__memset_chk) - -strong_alias (__memset_chk, __memset_zero_constant_len_parameter) - .section .gnu.warning.__memset_zero_constant_len_parameter - .string "memset used with constant zero length parameter; this could be due to transposed parameters" -# else -# include "../memset_chk.S" -# endif -#endif diff --git a/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S index 72bb60994..028c6d3d7 100644 --- a/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S +++ b/libc/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -34,10 +34,236 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2-pminub.S" -# undef RETURN +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + + .p2align 4 L(StartStrcpyPart): lea (%r9, %rax), %rdi mov %rsi, %rcx diff --git a/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S b/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S index fea9d11b4..8101b91e5 100644 --- a/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/libc/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -33,11 +33,321 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2-no-bsf.S" -# undef RETURN +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 L(StartStrcpyPart): mov %rsi, %rcx lea (%rdi, %rax), %rdx diff --git a/libc/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/libc/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S deleted file mode 100644 index ff2ab7004..000000000 --- a/libc/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S +++ /dev/null @@ -1,685 +0,0 @@ -/* strlen SSE2 without bsf - Copyright (C) 2010-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* only for strlen case we don't use optimized version for STATIC build just for SHARED */ - -#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc - -# ifndef USE_AS_STRCAT - -# include <sysdep.h> - -# define RETURN ret - -# ifndef STRLEN -# define STRLEN __strlen_sse2_no_bsf -# endif - - atom_text_section -ENTRY (STRLEN) -# endif - xor %eax, %eax -# ifdef USE_AS_STRNLEN - mov %rsi, %r8 - sub $4, %rsi - jbe L(len_less4_prolog) -# endif - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less8_prolog) -# endif - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less12_prolog) -# endif - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - -# ifdef USE_AS_STRNLEN - sub $4, %rsi - jbe L(len_less16_prolog) -# endif - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - -# ifdef USE_AS_STRNLEN - and $15, %rdi - add %rdi, %rsi - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - -# ifdef USE_AS_STRNLEN - mov %rax, %rdx - and $63, %rdx - add %rdx, %rsi -# endif - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): -# ifdef USE_AS_STRNLEN - sub $64, %rsi - jbe L(len_less64) -# endif - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_16): - lea -48(%rax), %rax -L(aligned_64_exit): -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - RETURN - -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - RETURN - -# ifdef USE_AS_STRNLEN - - .p2align 4 -L(len_less64): - pxor %xmm0, %xmm0 - add $64, %rsi - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - sub $16, %rsi - jbe L(return_start_len) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - lea 16(%rax), %rax - test %edx, %edx - jnz L(strnlen_exit) - - mov %r8, %rax - ret - - .p2align 4 -L(strnlen_exit): - sub %rcx, %rax - - test %dl, %dl - jz L(strnlen_exit_high) - mov %dl, %cl - and $15, %cl - jz L(strnlen_exit_8) - test $0x01, %dl - jnz L(exit_tail0) - test $0x02, %dl - jnz L(strnlen_exit_tail1) - test $0x04, %dl - jnz L(strnlen_exit_tail2) - sub $4, %rsi - jb L(return_start_len) - lea 3(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_8): - test $0x10, %dl - jnz L(strnlen_exit_tail4) - test $0x20, %dl - jnz L(strnlen_exit_tail5) - test $0x40, %dl - jnz L(strnlen_exit_tail6) - sub $8, %rsi - jb L(return_start_len) - lea 7(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_high): - mov %dh, %ch - and $15, %ch - jz L(strnlen_exit_high_8) - test $0x01, %dh - jnz L(strnlen_exit_tail8) - test $0x02, %dh - jnz L(strnlen_exit_tail9) - test $0x04, %dh - jnz L(strnlen_exit_tail10) - sub $12, %rsi - jb L(return_start_len) - lea 11(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_high_8): - test $0x10, %dh - jnz L(strnlen_exit_tail12) - test $0x20, %dh - jnz L(strnlen_exit_tail13) - test $0x40, %dh - jnz L(strnlen_exit_tail14) - sub $16, %rsi - jb L(return_start_len) - lea 15(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail1): - sub $2, %rsi - jb L(return_start_len) - lea 1(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail2): - sub $3, %rsi - jb L(return_start_len) - lea 2(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail4): - sub $5, %rsi - jb L(return_start_len) - lea 4(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail5): - sub $6, %rsi - jb L(return_start_len) - lea 5(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail6): - sub $7, %rsi - jb L(return_start_len) - lea 6(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail8): - sub $9, %rsi - jb L(return_start_len) - lea 8(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail9): - sub $10, %rsi - jb L(return_start_len) - lea 9(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail10): - sub $11, %rsi - jb L(return_start_len) - lea 10(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail12): - sub $13, %rsi - jb L(return_start_len) - lea 12(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail13): - sub $14, %rsi - jb L(return_start_len) - lea 13(%eax), %eax - ret - - .p2align 4 -L(strnlen_exit_tail14): - sub $15, %rsi - jb L(return_start_len) - lea 14(%eax), %eax - ret - - .p2align 4 -L(return_start_len): - mov %r8, %rax - ret - -/* for prolog only */ - - .p2align 4 -L(len_less4_prolog): - add $4, %rsi - jz L(exit_tail0) - - cmpb $0, (%rdi) - jz L(exit_tail0) - cmp $1, %esi - je L(exit_tail1) - - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmp $2, %esi - je L(exit_tail2) - - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmp $3, %esi - je L(exit_tail3) - - cmpb $0, 3(%rdi) - jz L(exit_tail3) - mov $4, %eax - ret - - .p2align 4 -L(len_less8_prolog): - add $4, %rsi - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmp $1, %esi - je L(exit_tail5) - - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmp $2, %esi - je L(exit_tail6) - - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmp $3, %esi - je L(exit_tail7) - - cmpb $0, 7(%rdi) - jz L(exit_tail7) - mov $8, %eax - ret - - .p2align 4 -L(len_less12_prolog): - add $4, %rsi - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmp $1, %esi - je L(exit_tail9) - - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmp $2, %esi - je L(exit_tail10) - - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmp $3, %esi - je L(exit_tail11) - - cmpb $0, 11(%rdi) - jz L(exit_tail11) - mov $12, %eax - ret - - .p2align 4 -L(len_less16_prolog): - add $4, %rsi - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmp $1, %esi - je L(exit_tail13) - - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmp $2, %esi - je L(exit_tail14) - - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmp $3, %esi - je L(exit_tail15) - - cmpb $0, 15(%rdi) - jz L(exit_tail15) - mov $16, %eax - ret -# endif - - .p2align 4 -L(exit_tail1): - add $1, %eax - RETURN - - .p2align 4 -L(exit_tail2): - add $2, %eax - RETURN - - .p2align 4 -L(exit_tail3): - add $3, %eax - RETURN - - .p2align 4 -L(exit_tail4): - add $4, %eax - RETURN - - .p2align 4 -L(exit_tail5): - add $5, %eax - RETURN - - .p2align 4 -L(exit_tail6): - add $6, %eax - RETURN - - .p2align 4 -L(exit_tail7): - add $7, %eax - RETURN - - .p2align 4 -L(exit_tail8): - add $8, %eax - RETURN - - .p2align 4 -L(exit_tail9): - add $9, %eax - RETURN - - .p2align 4 -L(exit_tail10): - add $10, %eax - RETURN - - .p2align 4 -L(exit_tail11): - add $11, %eax - RETURN - - .p2align 4 -L(exit_tail12): - add $12, %eax - RETURN - - .p2align 4 -L(exit_tail13): - add $13, %eax - RETURN - - .p2align 4 -L(exit_tail14): - add $14, %eax - RETURN - - .p2align 4 -L(exit_tail15): - add $15, %eax -# ifndef USE_AS_STRCAT - RETURN -END (STRLEN) -# endif -#endif diff --git a/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S deleted file mode 100644 index cc4bb57e9..000000000 --- a/libc/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S +++ /dev/null @@ -1,259 +0,0 @@ -/* strlen SSE2 - Copyright (C) 2011-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT) - -# ifndef USE_AS_STRCAT - -# include <sysdep.h> - -# define RETURN ret - - .section .text.sse2,"ax",@progbits -ENTRY (__strlen_sse2_pminub) - -# endif - xor %rax, %rax - mov %edi, %ecx - and $0x3f, %ecx - pxor %xmm0, %xmm0 - cmp $0x30, %ecx - ja L(next) - movdqu (%rdi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit_less16) - mov %rdi, %rax - and $-16, %rax - jmp L(align16_start) -L(next): - mov %rdi, %rax - and $-16, %rax - pcmpeqb (%rax), %xmm0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - pmovmskb %xmm0, %edx - and %r10d, %edx - jnz L(exit) -L(align16_start): - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - pcmpeqb 16(%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - pcmpeqb 80(%rax), %xmm0 - add $64, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit64) - - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 80(%rax), %xmm0 - add $80, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm1 - add $16, %rax - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm2 - add $16, %rax - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit) - - test $0x3f, %rax - jz L(align64_loop) - - pcmpeqb 16(%rax), %xmm3 - add $16, %rax - pmovmskb %xmm3, %edx - test %edx, %edx - jnz L(exit) - - add $16, %rax - .p2align 4 - L(align64_loop): - movaps (%rax), %xmm4 - pminub 16(%rax), %xmm4 - movaps 32(%rax), %xmm5 - pminub 48(%rax), %xmm5 - add $64, %rax - pminub %xmm4, %xmm5 - pcmpeqb %xmm0, %xmm5 - pmovmskb %xmm5, %edx - test %edx, %edx - jz L(align64_loop) - - - pcmpeqb -64(%rax), %xmm0 - sub $80, %rax - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) - - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) - - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) - - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax - RETURN - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_less16): - bsf %rdx, %rdx - add %rdx, %rax - RETURN - .p2align 4 -L(exit16): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $16, %rax - RETURN - .p2align 4 -L(exit32): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $32, %rax - RETURN - .p2align 4 -L(exit48): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $48, %rax - RETURN - .p2align 4 -L(exit64): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $64, %rax -# ifndef USE_AS_STRCAT - RETURN - -END (__strlen_sse2_pminub) -# endif -#endif diff --git a/libc/sysdeps/x86_64/multiarch/strlen-sse4.S b/libc/sysdeps/x86_64/multiarch/strlen-sse4.S deleted file mode 100644 index 8d685df0c..000000000 --- a/libc/sysdeps/x86_64/multiarch/strlen-sse4.S +++ /dev/null @@ -1,84 +0,0 @@ -/* strlen with SSE4 - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if defined SHARED && !defined NOT_IN_libc - -#include <sysdep.h> - - .section .text.sse4.2,"ax",@progbits -ENTRY (__strlen_sse42) - pxor %xmm1, %xmm1 - movl %edi, %ecx - movq %rdi, %r8 - andq $~15, %rdi - xor %edi, %ecx - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %edx - shrl %cl, %edx - shll %cl, %edx - andl %edx, %edx - jnz L(less16bytes) - pxor %xmm1, %xmm1 - - .p2align 4 -L(more64bytes_loop): - pcmpistri $0x08, 16(%rdi), %xmm1 - jz L(more32bytes) - - pcmpistri $0x08, 32(%rdi), %xmm1 - jz L(more48bytes) - - pcmpistri $0x08, 48(%rdi), %xmm1 - jz L(more64bytes) - - add $64, %rdi - pcmpistri $0x08, (%rdi), %xmm1 - jnz L(more64bytes_loop) - leaq (%rdi,%rcx), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more32bytes): - leaq 16(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more48bytes): - leaq 32(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(more64bytes): - leaq 48(%rdi,%rcx, 1), %rax - subq %r8, %rax - ret - - .p2align 4 -L(less16bytes): - subq %r8, %rdi - bsfl %edx, %eax - addq %rdi, %rax - ret - -END (__strlen_sse42) - -#endif diff --git a/libc/sysdeps/x86_64/multiarch/strlen.S b/libc/sysdeps/x86_64/multiarch/strlen.S deleted file mode 100644 index ab29ceff2..000000000 --- a/libc/sysdeps/x86_64/multiarch/strlen.S +++ /dev/null @@ -1,68 +0,0 @@ -/* Multiple versions of strlen(str) -- determine the length of the string STR. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in libc and for - the DSO. In static binaries we need strlen before the initialization - happened. */ -#if defined SHARED && !defined NOT_IN_libc - .text -ENTRY(strlen) - .type strlen, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strlen_sse2_pminub(%rip), %rax - testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) - jnz 2f - leaq __strlen_sse2(%rip), %rax - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strlen_sse42(%rip), %rax - ret -2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 3f - leaq __strlen_sse2_no_bsf(%rip), %rax -3: ret -END(strlen) - -# undef ENTRY -# define ENTRY(name) \ - .type __strlen_sse2, @function; \ - .align 16; \ - .globl __strlen_sse2; \ - .hidden __strlen_sse2; \ - __strlen_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strlen_sse2, .-__strlen_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strlen calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strlen; __GI_strlen = __strlen_sse2 -#endif - -#include "../strlen.S" diff --git a/libc/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/libc/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S deleted file mode 100644 index 248328d99..000000000 --- a/libc/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNLEN -#define STRLEN __strnlen_sse2_no_bsf -#include "strlen-sse2-no-bsf.S" diff --git a/libc/sysdeps/x86_64/multiarch/strnlen.S b/libc/sysdeps/x86_64/multiarch/strnlen.S deleted file mode 100644 index 124f8458a..000000000 --- a/libc/sysdeps/x86_64/multiarch/strnlen.S +++ /dev/null @@ -1,57 +0,0 @@ -/* multiple version of strnlen - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in libc. */ -#ifndef NOT_IN_libc - - .text -ENTRY(__strnlen) - .type __strnlen, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: leaq __strnlen_sse2(%rip), %rax - testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) - jz 2f - leaq __strnlen_sse2_no_bsf(%rip), %rax -2: ret -END(__strnlen) - -# undef ENTRY -# define ENTRY(name) \ - .type __strnlen_sse2, @function; \ - .align 16; \ - .globl __strnlen_sse2; \ - .hidden __strnlen_sse2; \ - __strnlen_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2 - -# undef libc_hidden_def -# define libc_hidden_def(name) \ - .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2 -#endif - -#include "../strnlen.S" diff --git a/libc/sysdeps/x86_64/preconfigure b/libc/sysdeps/x86_64/preconfigure index ca9de7584..d5abba882 100644 --- a/libc/sysdeps/x86_64/preconfigure +++ b/libc/sysdeps/x86_64/preconfigure @@ -1,123 +1,3 @@ - -# as_fn_set_status STATUS -# ----------------------- -# Set $? to STATUS, without forking. -as_fn_set_status () -{ - return $1 -} # as_fn_set_status - -# as_fn_exit STATUS -# ----------------- -# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. -as_fn_exit () -{ - set +e - as_fn_set_status $1 - exit $1 -} # as_fn_exit -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - - - as_lineno_1=$LINENO as_lineno_1a=$LINENO - as_lineno_2=$LINENO as_lineno_2a=$LINENO - eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && - test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { - # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - - -# ac_fn_c_try_compile LINENO -# -------------------------- -# Try to compile conftest.$ac_ext, and return whether this succeeded. -ac_fn_c_try_compile () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - rm -f conftest.$ac_objext - if { { ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_compile") 2>conftest.err - ac_status=$? - if test -s conftest.err; then - grep -v '^ *+' conftest.err >conftest.er1 - cat conftest.er1 >&5 - mv -f conftest.er1 conftest.err - fi - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then : - ac_retval=0 -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=1 -fi - eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno - as_fn_set_status $ac_retval - -} # ac_fn_c_try_compile # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local preconfigure fragment for sysdeps/x86_64 diff --git a/libc/sysdeps/x86_64/strcat.S b/libc/sysdeps/x86_64/strcat.S index 287ffd24c..8bea6fb5d 100644 --- a/libc/sysdeps/x86_64/strcat.S +++ b/libc/sysdeps/x86_64/strcat.S @@ -21,6 +21,7 @@ #include <sysdep.h> #include "asm-syntax.h" +/* Will be removed when new strcpy implementation gets merged. */ .text ENTRY (strcat) diff --git a/libc/sysdeps/x86_64/strlen.S b/libc/sysdeps/x86_64/strlen.S index 4bdca0a45..eeb109221 100644 --- a/libc/sysdeps/x86_64/strlen.S +++ b/libc/sysdeps/x86_64/strlen.S @@ -1,6 +1,5 @@ -/* strlen(str) -- determine the length of the string STR. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. +/* SSE2 version of strlen. + Copyright (C) 2012-2013 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,83 +18,222 @@ #include <sysdep.h> +/* Long lived register in strlen(s), strnlen(s, n) are: - .text + %xmm11 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + +.text ENTRY(strlen) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +#define FIND_ZERO \ + pcmpeqb (%rax), %xmm8; \ + pcmpeqb 16(%rax), %xmm9; \ + pcmpeqb 32(%rax), %xmm10; \ + pcmpeqb 48(%rax), %xmm11; \ + pmovmskb %xmm8, %esi; \ + pmovmskb %xmm9, %edx; \ + pmovmskb %xmm10, %r8d; \ + pmovmskb %xmm11, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +#ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %rsi, %rsi + jne L(n_nonzero) xor %rax, %rax - mov %edi, %ecx - and $0x3f, %ecx - pxor %xmm0, %xmm0 - cmp $0x30, %ecx - ja L(next) - movdqu (%rdi), %xmm1 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit_less16) - mov %rdi, %rax - and $-16, %rax - jmp L(align16_start) -L(next): - mov %rdi, %rax - and $-16, %rax - pcmpeqb (%rax), %xmm0 - mov $-1, %esi - sub %rax, %rcx - shl %cl, %esi - pmovmskb %xmm0, %edx - and %esi, %edx - jnz L(exit) -L(align16_start): - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - .p2align 4 -L(align16_loop): - pcmpeqb 16(%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz L(exit16) + ret +L(n_nonzero): - pcmpeqb 32(%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - jnz L(exit32) +/* Initialize long lived registers. */ - pcmpeqb 48(%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - jnz L(exit48) + add %rdi, %rsi + mov %rsi, %r10 + and $-64, %r10 + mov %rsi, %r11 +#endif - pcmpeqb 64(%rax), %xmm3 - pmovmskb %xmm3, %edx - lea 64(%rax), %rax + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +#ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +#else +# define STRNLEN_PROLOG andq $-64, %rax; +#endif + +/* Ignore bits in mask that come before start of string. */ +#define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + ret + +#ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +#else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm12 + pcmpeqb %xmm8, %xmm12 + pmovmskb %xmm12, %edx test %edx, %edx - jz L(align16_loop) -L(exit): - sub %rdi, %rax -L(exit_less16): - bsf %rdx, %rdx - add %rdx, %rax + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + pcmpeqb 16(%rax), %xmm9 + pcmpeqb 32(%rax), %xmm10 + pcmpeqb 48(%rax), %xmm11 + pmovmskb %xmm9, %edx + pmovmskb %xmm10, %r8d + pmovmskb %xmm11, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +#endif + + /* When no zero byte is found xmm9-11 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +#ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax ret +#endif + .p2align 4 +L(loop_init): + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 +#ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm8 + pminub 16(%rax), %xmm8 + pminub 32(%rax), %xmm8 + pminub 48(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + .p2align 4 -L(exit16): - sub %rdi, %rax - bsf %rdx, %rdx - lea 16(%rdx,%rax), %rax +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm8, %xmm8 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + .p2align 4 -L(exit32): - sub %rdi, %rax - bsf %rdx, %rdx - lea 32(%rdx,%rax), %rax +L(exit): + pxor %xmm8, %xmm8 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + +#else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm8 + pminub 80(%rax), %xmm8 + pminub 96(%rax), %xmm8 + pminub 112(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm8 + pminub 16(%rax), %xmm8 + pminub 32(%rax), %xmm8 + pminub 48(%rax), %xmm8 + pcmpeqb %xmm11, %xmm8 + pmovmskb %xmm8, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + .p2align 4 -L(exit48): - sub %rdi, %rax - bsf %rdx, %rdx - lea 48(%rdx,%rax), %rax +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm8, %xmm8 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax ret + +#endif + END(strlen) libc_hidden_builtin_def (strlen) diff --git a/libc/sysdeps/x86_64/strnlen.S b/libc/sysdeps/x86_64/strnlen.S index 6e5350306..d3c43ac48 100644 --- a/libc/sysdeps/x86_64/strnlen.S +++ b/libc/sysdeps/x86_64/strnlen.S @@ -1,63 +1,6 @@ -/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN. - Copyright (C) 2010-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - This file is part of the GNU C Library. +#define AS_STRNLEN +#define strlen __strnlen +#include "strlen.S" - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - - .text -ENTRY(__strnlen) - movq %rsi, %rax - testq %rsi, %rsi - jz 3f - pxor %xmm2, %xmm2 - movq %rdi, %rcx - movq %rdi, %r8 - movq $16, %r9 - andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %r10d - subq %rdi, %rcx - shll %cl, %r10d - subq %rcx, %r9 - pmovmskb %xmm2, %edx - andl %r10d, %edx - jnz 1f - subq %r9, %rsi - jbe 3f - -2: movdqa 16(%rdi), %xmm0 - leaq 16(%rdi), %rdi - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jnz 1f - subq $16, %rsi - jnbe 2b -3: ret - -1: subq %r8, %rdi - bsfl %edx, %edx - addq %rdi, %rdx - cmpq %rdx, %rax - cmovnbq %rdx, %rax - ret -END(__strnlen) -weak_alias (__strnlen, strnlen) -libc_hidden_def (strnlen) +weak_alias (__strnlen, strnlen); +libc_hidden_builtin_def (strnlen) |