diff options
Diffstat (limited to 'libc/sysdeps/x86_64')
-rw-r--r-- | libc/sysdeps/x86_64/dl-trampoline.S | 2 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/fpu/fraiseexcpt.c | 2 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/fpu/libm-test-ulps | 75 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/Makefile | 6 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c | 7 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/rawmemchr.S | 103 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 210 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/strcmp-sse42.S | 2 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/strcmp.S | 7 | ||||
-rw-r--r-- | libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 4 |
10 files changed, 300 insertions, 118 deletions
diff --git a/libc/sysdeps/x86_64/dl-trampoline.S b/libc/sysdeps/x86_64/dl-trampoline.S index 5770c64bf..a25e390a7 100644 --- a/libc/sysdeps/x86_64/dl-trampoline.S +++ b/libc/sysdeps/x86_64/dl-trampoline.S @@ -119,7 +119,7 @@ _dl_runtime_profile: movq %rax, LR_RSP_OFFSET(%rsp) /* We always store the XMM registers even if AVX is available. - This is to provide backward binary compatility for existing + This is to provide backward binary compatibility for existing audit modules. */ movaps %xmm0, (LR_XMM_OFFSET)(%rsp) movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) diff --git a/libc/sysdeps/x86_64/fpu/fraiseexcpt.c b/libc/sysdeps/x86_64/fpu/fraiseexcpt.c index 9a251e101..e5f553adf 100644 --- a/libc/sysdeps/x86_64/fpu/fraiseexcpt.c +++ b/libc/sysdeps/x86_64/fpu/fraiseexcpt.c @@ -30,7 +30,7 @@ __feraiseexcept (int excepts) /* First: invalid exception. */ if ((FE_INVALID & excepts) != 0) { - /* One example of a invalid operation is 0.0 / 0.0. */ + /* One example of an invalid operation is 0.0 / 0.0. 
*/ float f = 0.0; __asm__ __volatile__ ("divss %0, %0 " : : "x" (f)); diff --git a/libc/sysdeps/x86_64/fpu/libm-test-ulps b/libc/sysdeps/x86_64/fpu/libm-test-ulps index d02618a0a..6fbfa64ae 100644 --- a/libc/sysdeps/x86_64/fpu/libm-test-ulps +++ b/libc/sysdeps/x86_64/fpu/libm-test-ulps @@ -6222,11 +6222,39 @@ idouble: 1 Test "gamma (-0.5)": ildouble: 1 ldouble: 1 +Test "gamma (-0x1p-10)": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 +Test "gamma (-0x1p-15)": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +Test "gamma (-0x1p-20)": +double: 1 +idouble: 1 +Test "gamma (-0x1p-30)": +ildouble: 1 +ldouble: 1 +Test "gamma (-0x1p-5)": +double: 1 +idouble: 1 Test "gamma (0.7)": double: 1 float: 1 idouble: 1 ifloat: 1 +Test "gamma (0x1p-10)": +float: 1 +ifloat: 1 +Test "gamma (0x1p-30)": +double: 1 +idouble: 1 +Test "gamma (0x1p-40)": +ildouble: 1 +ldouble: 1 Test "gamma (1.2)": double: 1 float: 2 @@ -6403,6 +6431,11 @@ idouble: 2 ifloat: 2 ildouble: 1 ldouble: 1 +Test "jn (2, 0x1p127)": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 Test "jn (2, 2.4048255576957729)": double: 2 float: 1 @@ -6486,11 +6519,39 @@ ldouble: 2 Test "lgamma (-0.5)": ildouble: 1 ldouble: 1 +Test "lgamma (-0x1p-10)": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 +Test "lgamma (-0x1p-15)": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +Test "lgamma (-0x1p-20)": +double: 1 +idouble: 1 +Test "lgamma (-0x1p-30)": +ildouble: 1 +ldouble: 1 +Test "lgamma (-0x1p-5)": +double: 1 +idouble: 1 Test "lgamma (0.7)": double: 1 float: 1 idouble: 1 ifloat: 1 +Test "lgamma (0x1p-10)": +float: 1 +ifloat: 1 +Test "lgamma (0x1p-30)": +double: 1 +idouble: 1 +Test "lgamma (0x1p-40)": +ildouble: 1 +ldouble: 1 Test "lgamma (1.2)": double: 1 float: 2 @@ -7728,6 +7789,16 @@ double: 3 float: 1 idouble: 3 ifloat: 1 +Test "yn (2, 0x1.ffff62p+99)": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 +Test "yn (2, 0x1p127)": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 Test "yn (3, 0.125)": double: 1 idouble: 1 @@ -8428,9 +8499,9 @@ 
ldouble: 2 Function: "yn": double: 3 -float: 2 +float: 3 idouble: 3 -ifloat: 2 +ifloat: 3 ildouble: 4 ldouble: 4 diff --git a/libc/sysdeps/x86_64/multiarch/Makefile b/libc/sysdeps/x86_64/multiarch/Makefile index 203d16eed..5ab950a53 100644 --- a/libc/sysdeps/x86_64/multiarch/Makefile +++ b/libc/sysdeps/x86_64/multiarch/Makefile @@ -6,8 +6,10 @@ endif ifeq ($(subdir),string) -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \ +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ + strcmp-sse2-unaligned strncmp-ssse3 \ + strend-sse4 memcmp-sse4 memcpy-ssse3 \ + memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ diff --git a/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 28d35793c..f8756d7af 100644 --- a/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/libc/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -61,12 +61,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) - /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */ - IFUNC_IMPL (i, name, rawmemchr, - IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_SSE4_2, - __rawmemchr_sse42) - IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) - /* Support sysdeps/x86_64/multiarch/stpncpy.S. 
*/ IFUNC_IMPL (i, name, stpncpy, IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3, @@ -124,6 +118,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcmp, IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcpy.S. */ diff --git a/libc/sysdeps/x86_64/multiarch/rawmemchr.S b/libc/sysdeps/x86_64/multiarch/rawmemchr.S deleted file mode 100644 index 50de38ffb..000000000 --- a/libc/sysdeps/x86_64/multiarch/rawmemchr.S +++ /dev/null @@ -1,103 +0,0 @@ -/* Multiple versions of rawmemchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Ulrich Drepper <drepper@redhat.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <init-arch.h> - - -/* Define multiple versions only for the definition in lib. 
*/ -#ifndef NOT_IN_libc - .text -ENTRY(rawmemchr) - .type rawmemchr, @gnu_indirect_function - cmpl $0, __cpu_features+KIND_OFFSET(%rip) - jne 1f - call __init_cpu_features -1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) - jnz 2f - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __rawmemchr_sse42(%rip), %rax - ret -2: leaq __rawmemchr_sse2(%rip), %rax - ret - -END(rawmemchr) -strong_alias (rawmemchr, __rawmemchr) - - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __rawmemchr_sse42, @function - .globl __rawmemchr_sse42 - .hidden __rawmemchr_sse42 -__rawmemchr_sse42: - cfi_startproc - CALL_MCOUNT - movd %esi, %xmm1 - movq %rdi, %rcx - pxor %xmm2, %xmm2 - andq $~15, %rdi - orl $0xffffffff, %esi - pshufb %xmm2, %xmm1 - movdqa (%rdi), %xmm0 - subq %rdi, %rcx - pcmpeqb %xmm1, %xmm0 - shl %cl, %esi - pmovmskb %xmm0, %ecx - movl $16, %eax - movl $16, %edx - andl %esi, %ecx - jnz 1f - -2: pcmpestri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnc 2b - - leaq (%rdi,%rcx), %rax - ret - -1: bsfl %ecx, %eax - addq %rdi, %rax - ret - cfi_endproc - .size __rawmemchr_sse42, .-__rawmemchr_sse42 - - -# undef ENTRY -# define ENTRY(name) \ - .type __rawmemchr_sse2, @function; \ - .align 16; \ - .globl __rawmemchr_sse2; \ - .hidden __rawmemchr_sse2; \ - __rawmemchr_sse2: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal rawmemchr calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. 
*/ -# define libc_hidden_builtin_def(name) \ - .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2 -#endif - -#include "../rawmemchr.S" diff --git a/libc/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/libc/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 000000000..eed843297 --- /dev/null +++ b/libc/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,210 @@ +/* strcmp with unaligned loads + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#define ALIGN(x) .p2align x + +ENTRY ( __strcmp_sse2_unaligned) + movl %edi, %eax + xorl %edx, %edx + pxor %xmm7, %xmm7 + orl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pminub %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + testq %rax, %rax + je L(next_48_bytes) +L(return): + bsfq %rax, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret + + ALIGN (4) +L(next_48_bytes): + movdqu 16(%rdi), %xmm6 + movdqu 16(%rsi), %xmm3 + movdqu 32(%rdi), %xmm5 + pcmpeqb %xmm6, %xmm3 + movdqu 32(%rsi), %xmm2 + pminub %xmm6, %xmm3 + pcmpeqb %xmm1, %xmm3 + movdqu 48(%rdi), %xmm4 + pcmpeqb %xmm5, %xmm2 + pmovmskb %xmm3, %edx + movdqu 48(%rsi), %xmm0 + pminub %xmm5, %xmm2 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm2, %eax + salq $16, %rdx + pminub %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %rax + orq %rdx, %rax + pmovmskb %xmm0, %ecx + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(return) +L(main_loop_header): + leaq 64(%rdi), %rdx + movl $4096, %ecx + pxor %xmm9, %xmm9 + andq $-64, %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax + addq %rsi, %rdx + movq %rdx, %rsi + andl $4095, %esi + subq %rsi, %rcx + shrq $6, %rcx + movq %rcx, %rsi + jmp L(loop_start) + + ALIGN (4) +L(loop): + addq $64, %rax + addq $64, %rdx +L(loop_start): + testq %rsi, %rsi + leaq -1(%rsi), %rsi + je L(loop_cross_page) +L(back_to_loop): + movdqu (%rdx), %xmm0 + movdqu 16(%rdx), %xmm1 + movdqa (%rax), %xmm2 + movdqa 16(%rax), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqu 32(%rdx), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqu 48(%rdx), %xmm6 + pminub %xmm3, %xmm1 + movdqa 32(%rax), %xmm2 + pminub %xmm1, %xmm0 + movdqa 48(%rax), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + pminub %xmm5, %xmm0 + pminub %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm0 + 
pmovmskb %xmm0, %ecx + testl %ecx, %ecx + je L(loop) + pcmpeqb %xmm7, %xmm5 + movdqu (%rdx), %xmm0 + pcmpeqb %xmm7, %xmm1 + movdqa (%rax), %xmm2 + pcmpeqb %xmm2, %xmm0 + pminub %xmm2, %xmm0 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rcx + orq %rdi, %rcx + salq $48, %rsi + orq %rsi, %rcx + bsfq %rcx, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + ALIGN (4) +L(loop_cross_page): + xor %r10, %r10 + movq %rdx, %r9 + and $63, %r9 + subq %r9, %r10 + + movdqa (%rdx, %r10), %xmm0 + movdqa 16(%rdx, %r10), %xmm1 + movdqu (%rax, %r10), %xmm2 + movdqu 16(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqa 32(%rdx, %r10), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqa 48(%rdx, %r10), %xmm6 + pminub %xmm3, %xmm1 + movdqu 32(%rax, %r10), %xmm2 + movdqu 48(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + + pcmpeqb %xmm7, %xmm0 + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pcmpeqb %xmm7, %xmm6 + + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rdi + orq %rcx, %rdi + salq $48, %rsi + orq %rsi, %rdi + movq %r9, %rcx + movq $63, %rsi + shrq %cl, %rdi + test %rdi, %rdi + je L(back_to_loop) + bsfq %rdi, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + ALIGN (4) +L(cross_page_loop): + cmpb %cl, %al + jne L(different) + addq $1, %rdx + cmpq $64, %rdx + je L(main_loop_header) +L(cross_page): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + testb %al, %al + jne L(cross_page_loop) + xorl %eax, %eax +L(different): + subl %ecx, %eax + ret +END (__strcmp_sse2_unaligned) diff --git a/libc/sysdeps/x86_64/multiarch/strcmp-sse42.S b/libc/sysdeps/x86_64/multiarch/strcmp-sse42.S index a503e9211..c84f1c2b3 100644 
--- a/libc/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/libc/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -206,7 +206,7 @@ LABEL(touppermask): jnz LABEL(less16bytes)/* If not, find different value or null char */ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L sub $16, %r11 - jbe LABEL(strcmp_exitz)/* finish comparision */ + jbe LABEL(strcmp_exitz)/* finish comparison */ #endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ diff --git a/libc/sysdeps/x86_64/multiarch/strcmp.S b/libc/sysdeps/x86_64/multiarch/strcmp.S index 1d4d71183..c5dcd1aa5 100644 --- a/libc/sysdeps/x86_64/multiarch/strcmp.S +++ b/libc/sysdeps/x86_64/multiarch/strcmp.S @@ -66,6 +66,7 @@ # define STRCMP_SSE2 __strncasecmp_l_sse2 # define __GI_STRCMP __GI___strncasecmp_l #else +# define USE_AS_STRCMP # define UPDATE_STRNCMP_COUNTER # ifndef STRCMP # define STRCMP strcmp @@ -88,11 +89,17 @@ ENTRY(STRCMP) jne 1f call __init_cpu_features 1: +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) /* Fast_Unaligned_Load is a derived tuning flag, so it is indexed from FEATURE_OFFSET, not CPUID_OFFSET. */ + jnz 3f +#else testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) jnz 2f leaq STRCMP_SSE42(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) jnz 3f +#endif 2: leaq STRCMP_SSSE3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jnz 3f diff --git a/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S index cd56e5637..7710173c6 100644 --- a/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +++ b/libc/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -93,7 +93,7 @@ ENTRY (STRCPY) movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ movdqu %xmm1, (%rdi) -/* If source adress alignment != destination adress alignment */ +/* If source address alignment != destination address alignment */ .p2align 4 L(Unalign16Both): 
sub %rcx, %rdi @@ -289,7 +289,7 @@ L(Unaligned64Leave): BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) # endif -/* If source adress alignment == destination adress alignment */ +/* If source address alignment == destination address alignment */ L(SourceStringAlignmentLess32): pxor %xmm0, %xmm0 |