diff options
author | joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d> | 2010-01-26 11:27:38 +0000 |
---|---|---|
committer | joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d> | 2010-01-26 11:27:38 +0000 |
commit | 75a8dd396d88f79e8e2660fefe5bd93cbff71f55 (patch) | |
tree | e7ff701c0888ad2e6d527fd0b3302bcc3663d3dd /libc/sysdeps | |
parent | 42bc5058cebfefa9329005e1b3bc0525f7f7b67b (diff) | |
download | eglibc2-75a8dd396d88f79e8e2660fefe5bd93cbff71f55.tar.gz |
Merge changes between r9569 and r9736 from /fsf/trunk.
git-svn-id: svn://svn.eglibc.org/trunk@9737 7b3dc134-2b1b-0410-93df-9e9f96275f8d
Diffstat (limited to 'libc/sysdeps')
73 files changed, 7311 insertions, 330 deletions
diff --git a/libc/sysdeps/generic/ldsodefs.h b/libc/sysdeps/generic/ldsodefs.h index e18e60f73..230c39a63 100644 --- a/libc/sysdeps/generic/ldsodefs.h +++ b/libc/sysdeps/generic/ldsodefs.h @@ -1,5 +1,5 @@ /* Run-time dynamic linker data structures for loaded ELF shared objects. - Copyright (C) 1995-2006, 2007, 2008, 2009 Free Software Foundation, Inc. + Copyright (C) 1995-2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -1015,7 +1015,8 @@ extern void *_dl_sysdep_read_whole_file (const char *file, size_t *sizep, extern ElfW(Addr) _dl_sysdep_start (void **start_argptr, void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, - ElfW(Addr) *user_entry)) + ElfW(Addr) *user_entry, + ElfW(auxv_t) *auxv)) attribute_hidden; extern void _dl_sysdep_start_cleanup (void) diff --git a/libc/sysdeps/generic/netinet/ip.h b/libc/sysdeps/generic/netinet/ip.h index 38bd7556d..a837b9814 100644 --- a/libc/sysdeps/generic/netinet/ip.h +++ b/libc/sysdeps/generic/netinet/ip.h @@ -189,7 +189,26 @@ struct ip_timestamp #define IPTOS_DSCP_EF 0xb8 /* - * Definitions for IP type of service (ip_tos) + * In RFC 2474, Section 4.2.2.1, the Class Selector Codepoints subsume + * the old ToS Precedence values. + */ + +#define IPTOS_CLASS_MASK 0xe0 +#define IPTOS_CLASS(class) ((tos) & IPTOS_CLASS_MASK) +#define IPTOS_CLASS_CS0 0x00 +#define IPTOS_CLASS_CS1 0x20 +#define IPTOS_CLASS_CS2 0x40 +#define IPTOS_CLASS_CS3 0x60 +#define IPTOS_CLASS_CS4 0x80 +#define IPTOS_CLASS_CS5 0xa0 +#define IPTOS_CLASS_CS6 0xc0 +#define IPTOS_CLASS_CS7 0xe0 + +#define IPTOS_CLASS_DEFAULT IPTOS_CLASS_CS0 + +/* + * Definitions for IP type of service (ip_tos) [deprecated; use DSCP + * and CS definitions above instead.] */ #define IPTOS_TOS_MASK 0x1E #define IPTOS_TOS(tos) ((tos) & IPTOS_TOS_MASK) @@ -200,18 +219,18 @@ struct ip_timestamp #define IPTOS_MINCOST IPTOS_LOWCOST /* - * Definitions for IP precedence (also in ip_tos) (hopefully unused) + * Definitions for IP precedence (also in ip_tos) [also deprecated.] */ -#define IPTOS_PREC_MASK 0xe0 -#define IPTOS_PREC(tos) ((tos) & IPTOS_PREC_MASK) -#define IPTOS_PREC_NETCONTROL 0xe0 -#define IPTOS_PREC_INTERNETCONTROL 0xc0 -#define IPTOS_PREC_CRITIC_ECP 0xa0 -#define IPTOS_PREC_FLASHOVERRIDE 0x80 -#define IPTOS_PREC_FLASH 0x60 -#define IPTOS_PREC_IMMEDIATE 0x40 -#define IPTOS_PREC_PRIORITY 0x20 -#define IPTOS_PREC_ROUTINE 0x00 +#define IPTOS_PREC_MASK IPTOS_CLASS_MASK +#define IPTOS_PREC(tos) IPTOS_CLASS(tos) +#define IPTOS_PREC_NETCONTROL IPTOS_CLASS_CS7 +#define IPTOS_PREC_INTERNETCONTROL IPTOS_CLASS_CS6 +#define IPTOS_PREC_CRITIC_ECP IPTOS_CLASS_CS5 +#define IPTOS_PREC_FLASHOVERRIDE IPTOS_CLASS_CS4 +#define IPTOS_PREC_FLASH IPTOS_CLASS_CS3 +#define IPTOS_PREC_IMMEDIATE IPTOS_CLASS_CS2 +#define IPTOS_PREC_PRIORITY IPTOS_CLASS_CS1 +#define IPTOS_PREC_ROUTINE IPTOS_CLASS_CS0 /* * Definitions for options. diff --git a/libc/sysdeps/i386/i686/bcopy.S b/libc/sysdeps/i386/i686/bcopy.S new file mode 100644 index 000000000..15ef9419a --- /dev/null +++ b/libc/sysdeps/i386/i686/bcopy.S @@ -0,0 +1,3 @@ +#define USE_AS_BCOPY +#define memmove bcopy +#include <sysdeps/i386/i686/memmove.S> diff --git a/libc/sysdeps/i386/i686/cacheinfo.c b/libc/sysdeps/i386/i686/cacheinfo.c index 82e4cd223..f8b7f521c 100644 --- a/libc/sysdeps/i386/i686/cacheinfo.c +++ b/libc/sysdeps/i386/i686/cacheinfo.c @@ -1,3 +1,4 @@ +#define __x86_64_data_cache_size __x86_data_cache_size #define __x86_64_data_cache_size_half __x86_data_cache_size_half #define __x86_64_shared_cache_size __x86_shared_cache_size #define __x86_64_shared_cache_size_half __x86_shared_cache_size_half diff --git a/libc/sysdeps/i386/i686/memcpy.S b/libc/sysdeps/i386/i686/memcpy.S index 0b2da1ea2..86ee082be 100644 --- a/libc/sysdeps/i386/i686/memcpy.S +++ b/libc/sysdeps/i386/i686/memcpy.S @@ -32,11 +32,11 @@ .text #if defined PIC && !defined NOT_IN_libc -ENTRY (__memcpy_chk) +ENTRY_CHK (__memcpy_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memcpy_chk) +END_CHK (__memcpy_chk) #endif ENTRY (BP_SYM (memcpy)) ENTER diff --git a/libc/sysdeps/i386/i686/memmove.S b/libc/sysdeps/i386/i686/memmove.S index b93b5c729..981f14f4e 100644 --- a/libc/sysdeps/i386/i686/memmove.S +++ b/libc/sysdeps/i386/i686/memmove.S @@ -26,18 +26,27 @@ #define PARMS LINKAGE+4 /* one spilled register */ #define RTN PARMS -#define DEST RTN+RTN_SIZE -#define SRC DEST+PTR_SIZE -#define LEN SRC+PTR_SIZE .text -#if defined PIC && !defined NOT_IN_libc -ENTRY (__memmove_chk) + +#ifdef USE_AS_BCOPY +# define SRC RTN+RTN_SIZE +# define DEST SRC+PTR_SIZE +# define LEN DEST+PTR_SIZE +#else +# define DEST RTN+RTN_SIZE +# define SRC DEST+PTR_SIZE +# define LEN SRC+PTR_SIZE + +# if defined PIC && !defined NOT_IN_libc +ENTRY_CHK (__memmove_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memmove_chk) +END_CHK (__memmove_chk) +# endif #endif + ENTRY (BP_SYM (memmove)) ENTER @@ -69,8 +78,10 @@ ENTRY (BP_SYM (memmove)) movsl movl %edx, %esi cfi_restore (esi) +#ifndef USE_AS_BCOPY movl DEST(%esp), %eax RETURN_BOUNDED_POINTER (DEST(%esp)) +#endif popl %edi cfi_adjust_cfa_offset (-4) @@ -101,8 +112,10 @@ ENTRY (BP_SYM (memmove)) movsl movl %edx, %esi cfi_restore (esi) +#ifndef USE_AS_BCOPY movl DEST(%esp), %eax RETURN_BOUNDED_POINTER (DEST(%esp)) +#endif cld popl %edi @@ -112,4 +125,6 @@ ENTRY (BP_SYM (memmove)) LEAVE RET_PTR END (BP_SYM (memmove)) +#ifndef USE_AS_BCOPY libc_hidden_builtin_def (memmove) +#endif diff --git a/libc/sysdeps/i386/i686/mempcpy.S b/libc/sysdeps/i386/i686/mempcpy.S index 6437e4a5d..c10686fb3 100644 --- a/libc/sysdeps/i386/i686/mempcpy.S +++ b/libc/sysdeps/i386/i686/mempcpy.S @@ -32,11 +32,11 @@ .text #if defined PIC && !defined NOT_IN_libc -ENTRY (__mempcpy_chk) +ENTRY_CHK (__mempcpy_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END (__mempcpy_chk) +END_CHK (__mempcpy_chk) #endif ENTRY (BP_SYM (__mempcpy)) ENTER diff --git a/libc/sysdeps/i386/i686/memset.S b/libc/sysdeps/i386/i686/memset.S index dfa1aa701..b343af7b6 100644 --- a/libc/sysdeps/i386/i686/memset.S +++ b/libc/sysdeps/i386/i686/memset.S @@ -40,11 +40,11 @@ .text #if defined PIC && !defined NOT_IN_libc && !BZERO_P -ENTRY (__memset_chk) +ENTRY_CHK (__memset_chk) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memset_chk) +END_CHK (__memset_chk) #endif ENTRY (BP_SYM (memset)) ENTER diff --git a/libc/sysdeps/i386/i686/multiarch/Makefile b/libc/sysdeps/i386/i686/multiarch/Makefile index e1553b284..fbad9ae73 100644 --- a/libc/sysdeps/i386/i686/multiarch/Makefile +++ b/libc/sysdeps/i386/i686/multiarch/Makefile @@ -4,6 +4,10 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) +sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ + memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ + memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ + memset-sse2-rep bzero-sse2-rep ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S new file mode 100644 index 000000000..cbc8b420e --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S new file mode 100644 index 000000000..36aac44b9 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3 +#include "memcpy-ssse3.S" diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy.S b/libc/sysdeps/i386/i686/multiarch/bcopy.S new file mode 100644 index 000000000..8671bf684 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bcopy.S @@ -0,0 +1,89 @@ +/* Multiple versions of bcopy + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(bcopy) + .type bcopy, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __bcopy_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __bcopy_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __bcopy_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(bcopy) +# else + .text +ENTRY(bcopy) + .type bcopy, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __bcopy_ia32, %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal __bcopy_ssse3, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __bcopy_ssse3_rep, %eax +2: ret +END(bcopy) +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type __bcopy_ia32, @function; \ + .p2align 4; \ + __bcopy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 + +#endif + +#include "../bcopy.S" diff --git a/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S new file mode 100644 index 000000000..507b288bb --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2_rep __bzero_sse2_rep +#include "memset-sse2-rep.S" diff --git a/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S b/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S new file mode 100644 index 000000000..8d04512e4 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2 __bzero_sse2 +#include "memset-sse2.S" diff --git a/libc/sysdeps/i386/i686/multiarch/bzero.S b/libc/sysdeps/i386/i686/multiarch/bzero.S new file mode 100644 index 000000000..8c740a42d --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/bzero.S @@ -0,0 +1,97 @@ +/* Multiple versions of bzero + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__bzero) + .type __bzero, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __bzero_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __bzero_sse2@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __bzero_sse2_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__bzero) +# else + .text +ENTRY(__bzero) + .type __bzero, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __bzero_ia32, %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features + jz 2f + leal __bzero_sse2, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __bzero_sse2_rep, %eax +2: ret +END(__bzero) +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type __bzero_ia32, @function; \ + .p2align 4; \ + __bzero_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bzero_ia32, .-__bzero_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI___bzero; __GI___bzero = __bzero_ia32 +# endif +#endif + +#include "../bzero.S" diff --git a/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym index e2021cdf8..eb1538abc 100644 --- a/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym +++ b/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym @@ -13,5 +13,8 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) FAMILY_OFFSET offsetof (struct cpu_features, family) MODEL_OFFSET offsetof (struct cpu_features, model) +FEATURE_OFFSET offsetof (struct cpu_features, feature) +FEATURE_SIZE sizeof (unsigned int) COMMON_CPUID_INDEX_1 +FEATURE_INDEX_1 diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S new file mode 100644 index 000000000..b26037d27 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -0,0 +1,1785 @@ +/* memcpy with SSSE3 and REP string. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> + +#if !defined NOT_IN_libc \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_rep +# define MEMCPY_CHK __memcpy_chk_ssse3_rep +#endif + +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + ALIGN (4) + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#else +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.ssse3,"ax",@progbits +#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +#ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jge L(memmove_bwd) + jmp L(bk_write_less32bytes_2) +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +#endif + cmp $48, %ecx + jge L(48bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dl, %al + jl L(bk_write) +#endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +#endif + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(48bytesormore): + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + add $16, %edx + movl %edi, %esi + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + + mov %eax, %edi + jge L(large_page) + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + +L(shl_0_gobble): + +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi +# else + mov __x86_data_cache_size_half, %edi +# endif +#endif + mov %edi, %esi + shr $3, %esi + sub %esi, %edi + cmp %edi, %ecx + jge L(shl_0_gobble_mem_start) + lea -128(%ecx), %ecx + ALIGN (4) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jge L(shl_0_gobble_cache_loop) +L(shl_0_gobble_cache_loop_tail): + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jl L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jl L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_0_gobble_mem_start): + cmp %al, %dl + je L(copy_page_by_rep) + lea -128(%ecx), %ecx +L(shl_0_gobble_mem_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + prefetchnta 0x1c0(%edx) + prefetchnta 0x280(%edx) + + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jge L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jl L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jl L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -1(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -2(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -3(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -4(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -5(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -6(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -7(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -8(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -9(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -10(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -11(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -12(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_12_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -13(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -14(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -15(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) +L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + lea 16(%eax), %eax + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + cmp %al, %dl + je L(copy_page_by_rep) +L(large_page_loop_init): + POP (%esi) + lea -0x90(%ecx), %ecx + POP (%edi) +L(large_page_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + lfence + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jl L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(copy_page_by_rep): + mov %eax, %esi + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep movsl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movzwl (%esi), %eax + movw %ax, (%edi) + add $2, %esi + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movzbl (%esi), %eax + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%esi) + POP (%edi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) +L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl %ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + lea (%ecx,%edx,1),%edx + lea (%ecx,%esi,1),%esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jge L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jl L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jle L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jl L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jge L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S new file mode 100644 index 000000000..749c82d37 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -0,0 +1,1737 @@ +/* memcpy with SSSE3 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> + +#if !defined NOT_IN_libc \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +#endif + +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + ALIGN (4) + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#else +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.ssse3,"ax",@progbits +#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +#ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jge L(memmove_bwd) + jmp L(bk_write_less32bytes_2) +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +#endif + cmp $48, %ecx + jge L(48bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dl, %al + jl L(bk_write) +#endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +#endif + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(48bytesormore): + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + add $16, %edx + movl %edi, %esi + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + + mov %eax, %edi + jge L(large_page) + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + POP (%esi) + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jl L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + +L(shl_0_gobble): + +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +#endif + + POP (%edi) + lea -128(%ecx), %ecx + jge L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jge L(shl_0_gobble_cache_loop) +L(shl_0_gobble_cache_loop_tail): + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jl L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jl L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%eax) + prefetcht0 0x280(%eax) + prefetcht0 0x1c0(%edx) + + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jge L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jl L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jl L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -1(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -2(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -3(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -4(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -5(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -6(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -7(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -8(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -9(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -10(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -11(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -12(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_12_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -13(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -14(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + lea -15(%eax), %eax + movaps (%eax), %xmm1 + xor %edi, %edi + lea -32(%ecx), %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jl L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) +L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + lea 16(%eax), %eax + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + POP (%esi) + lea -0x90(%ecx), %ecx + POP (%edi) +L(large_page_loop): + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jl L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) +L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl %ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + lea (%ecx,%edx,1),%edx + lea (%ecx,%esi,1),%esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jge L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jl L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jle L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jl L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jge L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy.S b/libc/sysdeps/i386/i686/multiarch/memcpy.S new file mode 100644 index 000000000..bf1c7cc2d --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memcpy.S @@ -0,0 +1,90 @@ +/* Multiple versions of memcpy + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && !defined NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(memcpy) + .type memcpy, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memcpy_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcpy_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcpy_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(memcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcpy_ia32, @function; \ + .p2align 4; \ + __memcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memcpy_chk_ia32, @function; \ + .globl __memcpy_chk_ia32; \ + .p2align 4; \ + __memcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 +#endif + +#include "../memcpy.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S b/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S new file mode 100644 index 000000000..171ac8ade --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S @@ -0,0 +1,64 @@ +/* Multiple versions of __memcpy_chk + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. + */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memcpy_chk_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcpy_chk_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S new file mode 100644 index 000000000..d202fc4a1 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_rep +#define MEMCPY_CHK __memmove_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S new file mode 100644 index 000000000..295430b1e --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memmove.S b/libc/sysdeps/i386/i686/multiarch/memmove.S new file mode 100644 index 000000000..e0529c012 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memmove.S @@ -0,0 +1,117 @@ +/* Multiple versions of memmove + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(memmove) + .type memmove, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memmove_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memmove_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memmove_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(memmove) + +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .p2align 4; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# else + .text +ENTRY(memmove) + .type memmove, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __memmove_ia32, %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal __memmove_ssse3, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __memmove_ssse3_rep, %eax +2: ret +END(memmove) + +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .globl __memmove_ia32; \ + .p2align 4; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# endif + +# undef END +# define END(name) \ + cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memmove_chk_ia32, @function; \ + .globl __memmove_chk_ia32; \ + .p2align 4; \ + __memmove_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memmove; __GI_memmove = __memmove_ia32 +# endif +#endif + +#include "../memmove.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memmove_chk.S b/libc/sysdeps/i386/i686/multiarch/memmove_chk.S new file mode 100644 index 000000000..e33f2a31b --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memmove_chk.S @@ -0,0 +1,112 @@ +/* Multiple versions of __memmove_chk + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memmove_chk_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memmove_chk_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__memmove_chk) +# else + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __memmove_chk_ia32, %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features + jz 2f + leal __memmove_chk_ssse3, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __memmove_chk_ssse3_rep, %eax +2: ret +END(__memmove_chk) + + .type __memmove_chk_ssse3, @function + .p2align 4; +__memmove_chk_ssse3: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3 + cfi_endproc + .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 + + .type __memmove_chk_ssse3_rep, @function + .p2align 4; +__memmove_chk_ssse3_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3_rep + cfi_endproc + .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep + + .type __memmove_chk_ia32, @function + .p2align 4; +__memmove_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ia32 + cfi_endproc + .size __memmove_chk_ia32, .-__memmove_chk_ia32 +# endif +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S new file mode 100644 index 000000000..5357b33e1 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3_rep +#define MEMCPY_CHK __mempcpy_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S new file mode 100644 index 000000000..822d98e95 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3 +#define MEMCPY_CHK __mempcpy_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy.S b/libc/sysdeps/i386/i686/multiarch/mempcpy.S new file mode 100644 index 000000000..df830d2e6 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/mempcpy.S @@ -0,0 +1,93 @@ +/* Multiple versions of mempcpy + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. */ +#if defined SHARED && !defined NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __mempcpy_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __mempcpy_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __mempcpy_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__mempcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __mempcpy_ia32, @function; \ + .p2align 4; \ + __mempcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __mempcpy_chk_ia32, @function; \ + .globl __mempcpy_chk_ia32; \ + .p2align 4; \ + __mempcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 + +# undef libc_hidden_def +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 +# define libc_hidden_builtin_def(name) \ + .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 +#endif + +#include "../mempcpy.S" diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S new file mode 100644 index 000000000..828fb5e60 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S @@ -0,0 +1,64 @@ +/* Multiple versions of __mempcpy_chk + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __mempcpy_chk_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __mempcpy_chk_ssse3@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S new file mode 100644 index 000000000..84afffeb6 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -0,0 +1,821 @@ +/* memset with SSE2 and REP string. + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjuested EDX. Go. */ \ + jmp *%ebx + + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + ALIGN (4) + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2_rep) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2_rep) +#endif +ENTRY (__memset_sse2_rep) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. */ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + punpcklbw %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jge L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): + PUSH (%edi) +#ifdef DATA_CACHE_SIZE + PUSH (%ebx) + mov $DATA_CACHE_SIZE, %ebx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_data_cache_size, %ebx +# endif +#endif + mov %ebx, %edi + shr $4, %ebx + sub %ebx, %edi +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif +/* + * When data size approximate the end of L1 cache, + * fast string will prefetch and combine data efficiently. + */ + cmp %edi, %ecx + jae L(128bytesormore_nt) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jl L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jge L(128bytesormore_normal) + +L(128bytesless_normal): + POP (%edi) + lea 128(%ecx), %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore_nt): + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep stosl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movw %ax, (%edi) + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%edi) + SETRTNVAL + RETURN + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2_rep) + +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memset-sse2.S b/libc/sysdeps/i386/i686/multiarch/memset-sse2.S new file mode 100644 index 000000000..b2b979193 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -0,0 +1,867 @@ +/* memset with SSE2 + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjuested EDX. Go. */ \ + jmp *%ebx + + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + ALIGN (4) + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2) +#endif +ENTRY (__memset_sse2) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. */ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + punpcklbw %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jge L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): +#ifdef SHARED_CACHE_SIZE + PUSH (%ebx) + mov $SHARED_CACHE_SIZE, %ebx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_shared_cache_size, %ebx +# endif +#endif + cmp %ebx, %ecx + jae L(128bytesormore_nt_start) + + +#ifdef DATA_CACHE_SIZE + POP (%ebx) + cmp $DATA_CACHE_SIZE, %ecx +#else +# ifdef SHARED + call __i686.get_pc_thunk.bx + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx +# else + POP (%ebx) + cmp __x86_data_cache_size, %ecx +# endif +#endif + + jae L(128bytes_L2_normal) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jl L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jge L(128bytesormore_normal) + +L(128bytesless_normal): + lea 128(%ecx), %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytes_L2_normal): + prefetcht0 0x380(%edx) + prefetcht0 0x3c0(%edx) + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $128, %edx + cmp $128, %ecx + jge L(128bytes_L2_normal) + +L(128bytesless_L2_normal): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + +L(128bytesormore_nt_start): + sub %ebx, %ecx + ALIGN (4) +L(128bytesormore_shared_cache_loop): + prefetcht0 0x3c0(%edx) + prefetcht0 0x380(%edx) + sub $0x80, %ebx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ebx + jge L(128bytesormore_shared_cache_loop) + cmp $0x80, %ecx + jb L(shared_cache_loop_end) + ALIGN (4) +L(128bytesormore_nt): + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm0, 0x10(%edx) + movntdq %xmm0, 0x20(%edx) + movntdq %xmm0, 0x30(%edx) + movntdq %xmm0, 0x40(%edx) + movntdq %xmm0, 0x50(%edx) + movntdq %xmm0, 0x60(%edx) + movntdq %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ecx + jge L(128bytesormore_nt) + sfence +L(shared_cache_loop_end): +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2) + +#endif diff --git a/libc/sysdeps/i386/i686/multiarch/memset.S b/libc/sysdeps/i386/i686/multiarch/memset.S new file mode 100644 index 000000000..34dddcef7 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memset.S @@ -0,0 +1,112 @@ +/* Multiple versions of memset + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(memset) + .type memset, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memset_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memset_sse2@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memset_sse2_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(memset) +# else + .text +ENTRY(memset) + .type memset, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __memset_ia32, %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features + jz 2f + leal __memset_sse2, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __memset_sse2_rep, %eax +2: ret +END(memset) +# endif + +# undef ENTRY +# define ENTRY(name) \ + .type __memset_ia32, @function; \ + .globl __memset_ia32; \ + .p2align 4; \ + __memset_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memset_ia32, .-__memset_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memset_chk_ia32, @function; \ + .globl __memset_chk_ia32; \ + .p2align 4; \ + __memset_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_ia32 +# endif + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/libc/sysdeps/i386/i686/multiarch/memset_chk.S b/libc/sysdeps/i386/i686/multiarch/memset_chk.S new file mode 100644 index 000000000..d659c7e56 --- /dev/null +++ b/libc/sysdeps/i386/i686/multiarch/memset_chk.S @@ -0,0 +1,116 @@ +/* Multiple versions of __memset_chk + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +# ifdef SHARED + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function +__i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __memset_chk_ia32@GOTOFF(%ebx), %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memset_chk_sse2@GOTOFF(%ebx), %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __memset_chk_sse2_rep@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(__memset_chk) + +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else + .text +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features + jne 1f + call __init_cpu_features +1: leal __memset_chk_ia32, %eax + testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features + jz 2f + leal __memset_chk_sse2, %eax + testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features + jz 2f + leal __memset_chk_sse2_rep, %eax +2: ret +END(__memset_chk) + + .type __memset_chk_sse2, @function + .p2align 4; +__memset_chk_sse2: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2 + cfi_endproc + .size __memset_chk_sse2, .-__memset_chk_sse2 + + .type __memset_chk_sse2_rep, @function + .p2align 4; +__memset_chk_sse2_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2_rep + cfi_endproc + .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep + + .type __memset_chk_ia32, @function + .p2align 4; +__memset_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_ia32 + cfi_endproc + .size __memset_chk_ia32, .-__memset_chk_ia32 +# endif +#endif diff --git a/libc/sysdeps/i386/sysdep.h b/libc/sysdeps/i386/sysdep.h index e03a8e926..efdc82dde 100644 --- a/libc/sysdeps/i386/sysdep.h +++ b/libc/sysdeps/i386/sysdep.h @@ -67,6 +67,9 @@ ASM_SIZE_DIRECTIVE(name) \ STABS_FUN_END(name) +#define ENTRY_CHK(name) ENTRY (name) +#define END_CHK(name) END (name) + #ifdef HAVE_CPP_ASM_DEBUGINFO /* Disable that goop, because we just pass -g through to the assembler and it generates proper line number information directly. */ diff --git a/libc/sysdeps/ieee754/ldbl-128/s_ceill.c b/libc/sysdeps/ieee754/ldbl-128/s_ceill.c index 76bda9fc0..71f2fee91 100644 --- a/libc/sysdeps/ieee754/ldbl-128/s_ceill.c +++ b/libc/sysdeps/ieee754/ldbl-128/s_ceill.c @@ -30,9 +30,9 @@ static char rcsid[] = "$NetBSD: $"; #include "math_private.h" #ifdef __STDC__ -static const long double huge = 1.0e4930; +static const long double huge = 1.0e4930L; #else -static long double huge = 1.0e4930; +static long double huge = 1.0e4930L; #endif #ifdef __STDC__ diff --git a/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c b/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c index a82489bb2..dec6404af 100644 --- a/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c +++ b/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c @@ -86,7 +86,7 @@ static const long double /* ln (2^16384 * (1 - 2^-113)) */ maxlog = 1.1356523406294143949491931077970764891253E4L, /* ln 2^-114 */ - minarg = -7.9018778583833765273564461846232128760607E1L, big = 2e4932L; + minarg = -7.9018778583833765273564461846232128760607E1L, big = 1e4932L; long double diff --git a/libc/sysdeps/ieee754/ldbl-128/s_floorl.c b/libc/sysdeps/ieee754/ldbl-128/s_floorl.c index ff5b98da9..2a60a79fc 100644 --- a/libc/sysdeps/ieee754/ldbl-128/s_floorl.c +++ b/libc/sysdeps/ieee754/ldbl-128/s_floorl.c @@ -30,9 +30,9 @@ static char rcsid[] = "$NetBSD: $"; #include "math_private.h" #ifdef __STDC__ -static const long double huge = 1.0e4930; +static const long double huge = 1.0e4930L; #else -static long double huge = 1.0e4930; +static long double huge = 1.0e4930L; #endif #ifdef __STDC__ diff --git a/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c b/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c index 4480dc9f2..6e50575ac 100644 --- a/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c +++ b/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c @@ -120,7 +120,6 @@ static const long double C2 = 1.428606820309417232121458176568075500134E-6L; static const long double sqrth = 0.7071067811865475244008443621048490392848L; /* ln (2^16384 * (1 - 2^-113)) */ static const long double maxlog = 1.1356523406294143949491931077970764891253E4L; -static const long double big = 2e4932L; static const long double zero = 0.0L; long double diff --git a/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c b/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c index 1a22e0102..1f37d80e0 100644 --- a/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c +++ b/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c @@ -44,10 +44,12 @@ static char rcsid[] = "$NetBSD: $"; return x+y; if((long double) x==y) return y; /* x=y, return y */ if(ix==0) { /* x == 0 */ - float x2; + float u; SET_FLOAT_WORD(x,(u_int32_t)((hy>>32)&0x80000000)|1);/* return +-minsub*/ - x2 = x*x; - if(x2==x) return x2; else return x; /* raise underflow flag */ + u = math_opt_barrier (x); + u = u * u; + math_force_eval (u); /* raise underflow flag */ + return x; } if(hx>=0) { /* x > 0 */ if(hy<0||(ix>>23)>(iy>>48)-0x3f80 @@ -68,12 +70,9 @@ static char rcsid[] = "$NetBSD: $"; } hy = hx&0x7f800000; if(hy>=0x7f800000) return x+x; /* overflow */ - if(hy<0x00800000) { /* underflow */ - float x2 = x*x; - if(x2!=x) { /* raise underflow flag */ - SET_FLOAT_WORD(x2,hx); - return x2; - } + if(hy<0x00800000) { + float u = x*x; /* underflow */ + math_force_eval (u); /* raise underflow flag */ } SET_FLOAT_WORD(x,hx); return x; diff --git a/libc/sysdeps/mach/hurd/bits/libc-lock.h b/libc/sysdeps/mach/hurd/bits/libc-lock.h index 0fa90bcc3..90e46e02f 100644 --- a/libc/sysdeps/mach/hurd/bits/libc-lock.h +++ b/libc/sysdeps/mach/hurd/bits/libc-lock.h @@ -31,6 +31,7 @@ typedef struct void *owner; int count; } __libc_lock_recursive_t; +typedef __libc_lock_recursive_t __rtld_lock_recursive_t; #define __libc_lock_owner_self() ((void *) __hurd_threadvar_location (0)) @@ -121,6 +122,8 @@ typedef struct __libc_lock_recursive_opaque__ __libc_lock_recursive_t; #define __rtld_lock_init_recursive(NAME) \ __libc_lock_init_recursive (NAME) +#define __rtld_lock_initialize(NAME) \ + (void) ((NAME) = (__rtld_lock_recursive_t) _RTLD_LOCK_RECURSIVE_INITIALIZER) #define __rtld_lock_trylock_recursive(NAME) \ __libc_lock_trylock_recursive (NAME) #define __rtld_lock_lock_recursive(NAME) \ diff --git a/libc/sysdeps/mach/hurd/bits/stat.h b/libc/sysdeps/mach/hurd/bits/stat.h index c3f96660c..b64a658fb 100644 --- a/libc/sysdeps/mach/hurd/bits/stat.h +++ b/libc/sysdeps/mach/hurd/bits/stat.h @@ -1,4 +1,5 @@ -/* Copyright (C) 1992,93,94,96,97,99,2000,2005 Free Software Foundation, Inc. +/* Copyright (C) 1992-1994,1996,1997,1999,2000,2005,2010 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +17,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + #include <bits/types.h> /* NOTE: The size of this structure (32 ints) is known in @@ -192,5 +196,7 @@ struct stat64 /* Default file creation mask (umask). */ #ifdef __USE_BSD -#define CMASK 0022 +# define CMASK 0022 #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S b/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S new file mode 100644 index 000000000..cc1da99fd --- /dev/null +++ b/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S @@ -0,0 +1,245 @@ +/* Optimized memcpy implementation for CELL BE PowerPC. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +#define PREFETCH_AHEAD 6 /* no cache lines SRC prefetching ahead */ +#define ZERO_AHEAD 4 /* no cache lines DST zeroing ahead */ + +/* memcpy routine optimized for CELL-BE-PPC v2.0 + * + * The CELL PPC core has 1 integer unit and 1 load/store unit + * CELL: + * 1st level data cache = 32K + * 2nd level data cache = 512K + * 3rd level data cache = 0K + * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks, + * latency to memory is >400 clocks + * To improve copy performance we need to prefetch source data + * far ahead to hide this latency + * For best performance instructionforms ending in "." like "andi." + * should be avoided as the are implemented in microcode on CELL. + * The below code is loop unrolled for the CELL cache line of 128 bytes + */ + +.align 7 + +EALIGN (BP_SYM (memcpy), 5, 0) + CALL_MCOUNT + + dcbt 0,r4 /* Prefetch ONE SRC cacheline */ + cmplwi cr1,r5,16 /* is size < 16 ? */ + mr r6,r3 + blt+ cr1,.Lshortcopy + +.Lbigcopy: + neg r8,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + clrlwi r8,r8,32-4 /* aling to 16byte boundary */ + sub r7,r4,r3 + cmplwi cr0,r8,0 + beq+ .Ldst_aligned + +.Ldst_unaligned: + mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ + subf r5,r8,r5 + + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) + addi r6,r6,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: bf cr7*4+0,8f + lfdx fp9,r7,r6 /* copy 8 byte */ + stfd fp9,0(r6) + addi r6,r6,8 +8: + add r4,r7,r6 + +.Ldst_aligned: + + cmpwi cr5,r5,128-1 + + neg r7,r6 + addi r6,r6,-8 /* prepare for stfdu */ + addi r4,r4,-8 /* prepare for lfdu */ + + clrlwi r7,r7,32-7 /* align to cacheline boundary */ + ble+ cr5,.Llessthancacheline + + cmplwi cr6,r7,0 + subf r5,r7,r5 + srwi r7,r7,4 /* divide size by 16 */ + srwi r10,r5,7 /* number of cache lines to copy */ + + cmplwi r10,0 + li r11,0 /* number cachelines to copy with prefetch */ + beq .Lnocacheprefetch + + cmplwi r10,PREFETCH_AHEAD + li r12,128+8 /* prefetch distance */ + ble .Llessthanmaxprefetch + + subi r11,r10,PREFETCH_AHEAD + li r10,PREFETCH_AHEAD + +.Llessthanmaxprefetch: + mtctr r10 + +.LprefetchSRC: + dcbt r12,r4 + addi r12,r12,128 + bdnz .LprefetchSRC + +.Lnocacheprefetch: + mtctr r7 + cmplwi cr1,r5,128 + clrlwi r5,r5,32-7 + beq cr6,.Lcachelinealigned + +.Laligntocacheline: + lfd fp9,0x08(r4) + lfdu fp10,0x10(r4) + stfd fp9,0x08(r6) + stfdu fp10,0x10(r6) + bdnz .Laligntocacheline + + +.Lcachelinealigned: /* copy while cache lines */ + + blt- cr1,.Llessthancacheline /* size <128 */ + +.Louterloop: + cmpwi r11,0 + mtctr r11 + beq- .Lendloop + + li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ + +.align 4 + /* Copy whole cachelines, optimized by prefetching SRC cacheline */ +.Lloop: /* Copy aligned body */ + dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ + lfd fp9, 0x08(r4) + dcbz r11,r6 + lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */ + lfd fp11, 0x18(r4) /* to hide 1st level cache lantency. */ + lfd fp12, 0x20(r4) + stfd fp9, 0x08(r6) + stfd fp10, 0x10(r6) + stfd fp11, 0x18(r6) + stfd fp12, 0x20(r6) + lfd fp9, 0x28(r4) + lfd fp10, 0x30(r4) + lfd fp11, 0x38(r4) + lfd fp12, 0x40(r4) + stfd fp9, 0x28(r6) + stfd fp10, 0x30(r6) + stfd fp11, 0x38(r6) + stfd fp12, 0x40(r6) + lfd fp9, 0x48(r4) + lfd fp10, 0x50(r4) + lfd fp11, 0x58(r4) + lfd fp12, 0x60(r4) + stfd fp9, 0x48(r6) + stfd fp10, 0x50(r6) + stfd fp11, 0x58(r6) + stfd fp12, 0x60(r6) + lfd fp9, 0x68(r4) + lfd fp10, 0x70(r4) + lfd fp11, 0x78(r4) + lfdu fp12, 0x80(r4) + stfd fp9, 0x68(r6) + stfd fp10, 0x70(r6) + stfd fp11, 0x78(r6) + stfdu fp12, 0x80(r6) + + bdnz .Lloop + +.Lendloop: + cmpwi r10,0 + slwi r10,r10,2 /* adjust from 128 to 32 byte stride */ + beq- .Lendloop2 + mtctr r10 + +.Lloop2: /* Copy aligned body */ + lfd fp9, 0x08(r4) + lfd fp10, 0x10(r4) + lfd fp11, 0x18(r4) + lfdu fp12, 0x20(r4) + stfd fp9, 0x08(r6) + stfd fp10, 0x10(r6) + stfd fp11, 0x18(r6) + stfdu fp12, 0x20(r6) + + bdnz .Lloop2 +.Lendloop2: + +.Llessthancacheline: /* less than cache to do ? */ + cmplwi cr0,r5,16 + srwi r7,r5,4 /* divide size by 16 */ + blt- .Ldo_lt16 + mtctr r7 + +.Lcopy_remaining: + lfd fp9,0x08(r4) + lfdu fp10,0x10(r4) + stfd fp9,0x08(r6) + stfdu fp10,0x10(r6) + bdnz .Lcopy_remaining + +.Ldo_lt16: /* less than 16 ? */ + cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */ + beqlr+ /* no rest to copy */ + addi r4,r4,8 + addi r6,r6,8 + +.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */ + mtcrf 0x01,r5 + sub r7,r4,r6 + bf- cr7*4+0,8f + lfdx fp9,r7,r6 /* copy 8 byte */ + stfd fp9,0(r6) + addi r6,r6,8 +8: + bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: + bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) +1: blr + +END (BP_SYM (memcpy)) +libc_hidden_builtin_def (memcpy) diff --git a/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S b/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S new file mode 100644 index 000000000..c6ee730e4 --- /dev/null +++ b/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S @@ -0,0 +1,245 @@ +/* Optimized memcpy implementation for CELL BE PowerPC. + Copyright (C) 2010 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <bp-sym.h> +#include <bp-asm.h> + +#define PREFETCH_AHEAD 6 /* no cache lines SRC prefetching ahead */ +#define ZERO_AHEAD 4 /* no cache lines DST zeroing ahead */ + +/* memcpy routine optimized for CELL-BE-PPC v2.0 + * + * The CELL PPC core has 1 integer unit and 1 load/store unit + * CELL: + * 1st level data cache = 32K + * 2nd level data cache = 512K + * 3rd level data cache = 0K + * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks, + * latency to memory is >400 clocks + * To improve copy performance we need to prefetch source data + * far ahead to hide this latency + * For best performance instructionforms ending in "." like "andi." + * should be avoided as the are implemented in microcode on CELL. + * The below code is loop unrolled for the CELL cache line of 128 bytes + */ + +.align 7 + +EALIGN (BP_SYM (memcpy), 5, 0) + CALL_MCOUNT 3 + + dcbt 0,r4 /* Prefetch ONE SRC cacheline */ + cmpldi cr1,r5,16 /* is size < 16 ? */ + mr r6,r3 + blt+ cr1,.Lshortcopy + +.Lbigcopy: + neg r8,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + clrldi r8,r8,64-4 /* aling to 16byte boundary */ + sub r7,r4,r3 + cmpldi cr0,r8,0 + beq+ .Ldst_aligned + +.Ldst_unaligned: + mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ + subf r5,r8,r5 + + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) + addi r6,r6,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: bf cr7*4+0,8f + ldx r0,r7,r6 /* copy 8 byte */ + std r0,0(r6) + addi r6,r6,8 +8: + add r4,r7,r6 + +.Ldst_aligned: + + cmpdi cr5,r5,128-1 + + neg r7,r6 + addi r6,r6,-8 /* prepare for stdu */ + addi r4,r4,-8 /* prepare for ldu */ + + clrldi r7,r7,64-7 /* align to cacheline boundary */ + ble+ cr5,.Llessthancacheline + + cmpldi cr6,r7,0 + subf r5,r7,r5 + srdi r7,r7,4 /* divide size by 16 */ + srdi r10,r5,7 /* number of cache lines to copy */ + + cmpldi r10,0 + li r11,0 /* number cachelines to copy with prefetch */ + beq .Lnocacheprefetch + + cmpldi r10,PREFETCH_AHEAD + li r12,128+8 /* prefetch distance */ + ble .Llessthanmaxprefetch + + subi r11,r10,PREFETCH_AHEAD + li r10,PREFETCH_AHEAD + +.Llessthanmaxprefetch: + mtctr r10 + +.LprefetchSRC: + dcbt r12,r4 + addi r12,r12,128 + bdnz .LprefetchSRC + +.Lnocacheprefetch: + mtctr r7 + cmpldi cr1,r5,128 + clrldi r5,r5,64-7 + beq cr6,.Lcachelinealigned + +.Laligntocacheline: + ld r9,0x08(r4) + ldu r7,0x10(r4) + std r9,0x08(r6) + stdu r7,0x10(r6) + bdnz .Laligntocacheline + + +.Lcachelinealigned: /* copy while cache lines */ + + blt- cr1,.Llessthancacheline /* size <128 */ + +.Louterloop: + cmpdi r11,0 + mtctr r11 + beq- .Lendloop + + li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ + +.align 4 + /* Copy whole cachelines, optimized by prefetching SRC cacheline */ +.Lloop: /* Copy aligned body */ + dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ + ld r9, 0x08(r4) + dcbz r11,r6 + ld r7, 0x10(r4) /* 4 register stride copy is optimal */ + ld r8, 0x18(r4) /* to hide 1st level cache lantency. */ + ld r0, 0x20(r4) + std r9, 0x08(r6) + std r7, 0x10(r6) + std r8, 0x18(r6) + std r0, 0x20(r6) + ld r9, 0x28(r4) + ld r7, 0x30(r4) + ld r8, 0x38(r4) + ld r0, 0x40(r4) + std r9, 0x28(r6) + std r7, 0x30(r6) + std r8, 0x38(r6) + std r0, 0x40(r6) + ld r9, 0x48(r4) + ld r7, 0x50(r4) + ld r8, 0x58(r4) + ld r0, 0x60(r4) + std r9, 0x48(r6) + std r7, 0x50(r6) + std r8, 0x58(r6) + std r0, 0x60(r6) + ld r9, 0x68(r4) + ld r7, 0x70(r4) + ld r8, 0x78(r4) + ldu r0, 0x80(r4) + std r9, 0x68(r6) + std r7, 0x70(r6) + std r8, 0x78(r6) + stdu r0, 0x80(r6) + + bdnz .Lloop + +.Lendloop: + cmpdi r10,0 + sldi r10,r10,2 /* adjust from 128 to 32 byte stride */ + beq- .Lendloop2 + mtctr r10 + +.Lloop2: /* Copy aligned body */ + ld r9, 0x08(r4) + ld r7, 0x10(r4) + ld r8, 0x18(r4) + ldu r0, 0x20(r4) + std r9, 0x08(r6) + std r7, 0x10(r6) + std r8, 0x18(r6) + stdu r0, 0x20(r6) + + bdnz .Lloop2 +.Lendloop2: + +.Llessthancacheline: /* less than cache to do ? */ + cmpldi cr0,r5,16 + srdi r7,r5,4 /* divide size by 16 */ + blt- .Ldo_lt16 + mtctr r7 + +.Lcopy_remaining: + ld r8,0x08(r4) + ldu r7,0x10(r4) + std r8,0x08(r6) + stdu r7,0x10(r6) + bdnz .Lcopy_remaining + +.Ldo_lt16: /* less than 16 ? */ + cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */ + beqlr+ /* no rest to copy */ + addi r4,r4,8 + addi r6,r6,8 + +.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */ + mtcrf 0x01,r5 + sub r7,r4,r6 + bf- cr7*4+0,8f + ldx r0,r7,r6 /* copy 8 byte */ + std r0,0(r6) + addi r6,r6,8 +8: + bf cr7*4+1,4f + lwzx r0,r7,r6 /* copy 4 byte */ + stw r0,0(r6) + addi r6,r6,4 +4: + bf cr7*4+2,2f + lhzx r0,r7,r6 /* copy 2 byte */ + sth r0,0(r6) + addi r6,r6,2 +2: + bf cr7*4+3,1f + lbzx r0,r7,r6 /* copy 1 byte */ + stb r0,0(r6) +1: blr + +END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS) +libc_hidden_builtin_def (memcpy) diff --git a/libc/sysdeps/s390/s390-32/dl-machine.h b/libc/sysdeps/s390/s390-32/dl-machine.h index 251a5f692..415b38801 100644 --- a/libc/sysdeps/s390/s390-32/dl-machine.h +++ b/libc/sysdeps/s390/s390-32/dl-machine.h @@ -27,6 +27,7 @@ #include <sys/param.h> #include <string.h> #include <link.h> +#include <sysdeps/s390/dl-procinfo.h> /* This is an older, now obsolete value. */ #define EM_S390_OLD 0xA390 @@ -35,6 +36,12 @@ static inline int elf_machine_matches_host (const Elf32_Ehdr *ehdr) { + /* Check if the kernel provides the high gpr facility if needed by + the binary. */ + if ((ehdr->e_flags & EF_S390_HIGH_GPRS) + && !(GLRO (dl_hwcap) & HWCAP_S390_HIGH_GPRS)) + return 0; + return (ehdr->e_machine == EM_S390 || ehdr->e_machine == EM_S390_OLD) && ehdr->e_ident[EI_CLASS] == ELFCLASS32; } diff --git a/libc/sysdeps/s390/s390-32/elf/start.S b/libc/sysdeps/s390/s390-32/elf/start.S index f7290106c..066f7f0aa 100644 --- a/libc/sysdeps/s390/s390-32/elf/start.S +++ b/libc/sysdeps/s390/s390-32/elf/start.S @@ -59,6 +59,88 @@ .globl _start .type _start,@function _start: + /* Check if the kernel provides highgprs facility if needed by + the binary. */ + + lr %r6,%r15 + la %r6,4(%r6) /* Skip the argument counter. */ + +.L11: l %r5,0(%r6) /* Skip the argument vector. */ + la %r6,4(%r6) + ltr %r5,%r5 + jne .L11 + +.L12: l %r5,0(%r6) /* Skip the environment vector. */ + la %r6,4(%r6) + ltr %r5,%r5 + jne .L12 + + /* Obtain the needed values from the auxiliary vector. */ + + lhi %r7,16 /* AT_HWCAP */ + lhi %r8,3 /* AT_PHDR */ + lhi %r9,5 /* AT_PHNUM */ + lhi %r2,4 /* AT_PHENT */ +.L13: l %r5,0(%r6) + clr %r5,%r7 + jne .L15 + l %r10,4(%r6) /* r10 = AT_HWCAP value. */ +.L15: clr %r5,%r8 + jne .L16 + l %r11,4(%r6) /* r11 = AT_PHDR value. */ +.L16: clr %r5,%r9 + jne .L17 + l %r12,4(%r6) /* r12 = AT_PHNUM value. */ +.L17: clr %r5,%r2 + jne .L18 + l %r0,4(%r6) /* r0 = AT_PHENT value. */ +.L18: ltr %r5,%r5 + la %r6,8(%r6) + jnz .L13 + + /* Locate the ELF header by looking for the first PT_LOAD + segment with a p_offset of zero. */ + + lr %r4,%r11 /* Backup AT_PHDR. */ + lhi %r7,1 /* PT_LOAD id */ + lhi %r8,0 +.L19: cl %r7,0(%r4) /* p_type == PT_LOAD? */ + jne .L20 + cl %r8,4(%r4) /* p_offset == 0? */ + jne .L20 + l %r9,8(%r4) /* r9 = p_vaddr <- ELF header address */ + j .L24 +.L20: alr %r4,%r0 /* r4 += AT_PHENT value */ + brct %r12,.L19 + + j .+2 /* Trap, there must be such a phdr. */ + +.L24: lr %r4,%r11 /* Backup AT_PHDR. */ + lhi %r2,6 /* PT_PHDR id */ +.L23: cl %r2,0(%r4) + jne .L22 + l %r3,8(%r4) /* r3 = PT_PHDR p_vaddr */ + j .L25 +.L22: alr %r4,%r0 /* r4 += AT_PHENT value */ + brct %r12,.L23 + + ltr %r9,%r9 /* Load address == 0? */ + jz .L14 /* No checking for PIE without PT_PHDR. */ + j .L21 + +.L25: clr %r3,%r11 /* PT_PHDR p_vaddr == AT_PHDR? */ + je .L21 + lr %r9,%r11 + slr %r9,%r3 /* elf_header_addr = AT_PHDR - PT_PHDR.p_vaddr */ + +.L21: l %r5,36(%r9) /* Load the e_flags field. */ + tml %r5,1 + jz .L14 /* Binary does not require highgprs facility. */ + + tml %r10,512 /* Check the AT_HWCAP value. */ + jz 2 /* Trap if no highgprs facility available. */ +.L14: + /* Setup pointer to literal pool of _start */ basr %r13,0 .L0: ahi %r13,.Llit-.L0 diff --git a/libc/sysdeps/unix/bsd/bits/stat.h b/libc/sysdeps/unix/bsd/bits/stat.h index 84a58ffbc..6eeab3b0e 100644 --- a/libc/sysdeps/unix/bsd/bits/stat.h +++ b/libc/sysdeps/unix/bsd/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1991, 92, 96, 97, 99, 2000 Free Software Foundation, Inc. +/* Copyright (C) 1991,1992,1996-2000,2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + #include <bits/types.h> /* Structure describing file characteristics. */ @@ -84,3 +87,5 @@ struct stat #define __S_IREAD 0400 /* Read by owner. */ #define __S_IWRITE 0200 /* Write by owner. */ #define __S_IEXEC 0100 /* Execute by owner. */ + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/bits/stat.h b/libc/sysdeps/unix/sysv/bits/stat.h index f9a9e614d..2fb619a2e 100644 --- a/libc/sysdeps/unix/sysv/bits/stat.h +++ b/libc/sysdeps/unix/sysv/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1992, 1996, 1997, 2000 Free Software Foundation, Inc. +/* Copyright (C) 1992, 1996, 1997, 2000, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + struct stat { short int st_dev; @@ -63,3 +66,5 @@ struct stat #define __S_IREAD 0400 /* Read by owner. */ #define __S_IWRITE 0200 /* Write by owner. */ #define __S_IEXEC 0100 /* Execute by owner. */ + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/bits/sigaction.h index 48cc5312f..62be06920 100644 --- a/libc/sysdeps/unix/sysv/linux/bits/sigaction.h +++ b/libc/sysdeps/unix/sysv/linux/bits/sigaction.h @@ -1,5 +1,5 @@ /* The proper definitions for Linux's sigaction. - Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc. + Copyright (C) 1993-1999, 2000, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -57,6 +57,8 @@ struct sigaction three arguments instead of one. */ #if defined __USE_UNIX98 || defined __USE_MISC # define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */ +#endif +#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8 # define SA_RESTART 0x10000000 /* Restart syscall on signal return. */ # define SA_NODEFER 0x40000000 /* Don't automatically block the signal when its handler is being executed. */ diff --git a/libc/sysdeps/unix/sysv/linux/bits/stat.h b/libc/sysdeps/unix/sysv/linux/bits/stat.h index be5272333..8ac3cd472 100644 --- a/libc/sysdeps/unix/sysv/linux/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1992, 1995-2001, 2002, 2009 Free Software Foundation, Inc. +/* Copyright (C) 1992, 1995-2002, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + /* Versions of the `struct stat' data structure. */ #define _STAT_VER_LINUX_OLD 1 #define _STAT_VER_KERNEL 1 @@ -166,3 +169,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h index 8eaf7c368..669388954 100644 --- a/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h @@ -1,5 +1,5 @@ /* O_*, F_*, FD_* bit values for Linux. - Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009 + Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -45,13 +45,15 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 040000 /* Direct disk access. */ +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0200000 /* Must be a directory. */ # define O_NOFOLLOW 0400000 /* Do not follow links. */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 040000 /* Direct disk access. */ +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif /* For now Linux has synchronisity options for data and read operations. We define the symbols here but let them do the same as O_SYNC since @@ -84,7 +86,7 @@ #define F_SETLK64 13 /* Set record locking info (non-blocking). */ #define F_SETLKW64 14 /* Set record locking info (blocking). */ -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -100,6 +102,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h index c65a11e1c..33635fd9e 100644 --- a/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h @@ -1,5 +1,6 @@ /* O_*, F_*, FD_* bit values for Linux/IA64. - Copyright (C) 1999,2000,2004,2006,2007,2009 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2004, 2006, 2007, 2009, 2010 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -44,13 +45,15 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 040000 +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0200000 /* must be a directory */ # define O_NOFOLLOW 0400000 /* don't follow links */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 040000 +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif #ifdef __USE_LARGEFILE64 /* Not necessary, files are always with 64bit off_t. */ @@ -80,7 +83,7 @@ #define F_SETLK64 6 /* Set record locking info (non-blocking). */ #define F_SETLKW64 7 /* Set record locking info (blocking). */ -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -96,6 +99,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h index 11599d520..b557eaa3e 100644 --- a/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h +++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h @@ -1,5 +1,5 @@ /* Definitions for Linux/ia64 sigaction. - Copyright (C) 1996, 1997, 2000, 2003 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 2000, 2003, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -53,6 +53,8 @@ struct sigaction #define SA_SIGINFO 0x00000004 #if defined __USE_UNIX98 || defined __USE_MISC # define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */ +#endif +#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8 # define SA_RESTART 0x10000000 /* Restart syscall on signal return. */ # define SA_NODEFER 0x40000000 /* Don't automatically block the signal when its handler is being executed. */ diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h index 75a331828..86acd27ae 100644 --- a/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1999, 2000, 2001, 2002, 2009 Free Software Foundation, Inc. +/* Copyright (C) 1999-2002, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + /* Versions of the `struct stat' data structure. */ #define _STAT_VER_KERNEL 0 #define _STAT_VER_LINUX 1 @@ -143,3 +146,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits//stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h index 40fe1e50d..fea347bfc 100644 --- a/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h @@ -1,5 +1,5 @@ /* O_*, F_*, FD_* bit values for Linux/PowerPC. - Copyright (C) 1995, 1996, 1997, 1998, 2000, 2003, 2004, 2006, 2007, 2009 + Copyright (C) 1995-1998, 2000, 2003, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -45,13 +45,15 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 0400000 /* Direct disk access. */ +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 040000 /* Must be a directory. */ # define O_NOFOLLOW 0100000 /* Do not follow links. */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 0400000 /* Direct disk access. */ +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif #ifdef __USE_LARGEFILE64 # define O_LARGEFILE 0200000 @@ -84,7 +86,7 @@ #define F_SETLK64 13 /* Set record locking info (non-blocking). */ #define F_SETLKW64 14 /* Set record locking info (blocking). */ -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -100,6 +102,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h b/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h index 81879ca1f..6e4a55f87 100644 --- a/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1992, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2009 +/* Copyright (C) 1992, 1995-2002, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,10 +17,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + #include <bits/wordsize.h> /* Versions of the `struct stat' data structure. */ @@ -270,3 +273,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies new file mode 100644 index 000000000..7c381f043 --- /dev/null +++ b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies @@ -0,0 +1,3 @@ +# Make sure this comes before the powerpc/powerpc32/fpu that's +# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies. +powerpc/powerpc32/cell/fpu diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies new file mode 100644 index 000000000..b6720ecda --- /dev/null +++ b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies @@ -0,0 +1 @@ +powerpc/powerpc64/cell/fpu diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h index c9e6a4504..aeb1e0fe9 100644 --- a/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h @@ -1,5 +1,6 @@ /* O_*, F_*, FD_* bit values for Linux. - Copyright (C) 2000,2001,2002,2004,2006,2007,2009 Free Software Foundation, Inc. + Copyright (C) 2000-2002,2004,2006,2007,2009,2010 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -45,13 +46,15 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 040000 /* Direct disk access. */ +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0200000 /* Must be a directory. */ # define O_NOFOLLOW 0400000 /* Do not follow links. */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 040000 /* Direct disk access. */ +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif #ifdef __USE_LARGEFILE64 # if __WORDSIZE == 64 @@ -99,7 +102,7 @@ # define F_SETLKW64 14 /* Set record locking info (blocking). */ #endif -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -115,6 +118,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h index 308cb5bd7..8767d6785 100644 --- a/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h +++ b/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h @@ -1,5 +1,5 @@ /* Definitions for 31 & 64 bit S/390 sigaction. - Copyright (C) 2001, 2002 Free Software Foundation, Inc. + Copyright (C) 2001, 2002, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -90,6 +90,8 @@ struct sigaction three arguments instead of one. */ #if defined __USE_UNIX98 || defined __USE_MISC # define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */ +#endif +#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8 # define SA_RESTART 0x10000000 /* Restart syscall on signal return. */ # define SA_NODEFER 0x40000000 /* Don't automatically block the signal when its handler is being executed. */ diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h b/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h index 64be9a10f..9d62ad26f 100644 --- a/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2000, 2001, 2002, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2000, 2001, 2002, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + #include <bits/wordsize.h> #if __WORDSIZE == 64 @@ -259,3 +262,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c b/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c index b28e58749..f2c151857 100644 --- a/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c +++ b/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c @@ -46,7 +46,7 @@ { \ if ((oss.ss_flags & SS_ONSTACK) == 0 \ || ((uintptr_t) (oss.ss_sp + oss.ss_size) - new_sp \ - >= oss.ss_size)) \ + < oss.ss_size)) \ __fortify_fail ("longjmp causes uninitialized stack frame");\ } \ } \ diff --git a/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c b/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c index dcf58fb50..261be250d 100644 --- a/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c +++ b/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c @@ -46,7 +46,7 @@ { \ if ((oss.ss_flags & SS_ONSTACK) == 0 \ || ((uintptr_t) (oss.ss_sp + oss.ss_size) - new_sp \ - >= oss.ss_size)) \ + < oss.ss_size)) \ __fortify_fail ("longjmp causes uninitialized stack frame");\ } \ } \ diff --git a/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h index 8eaf7c368..2a4123c61 100644 --- a/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h @@ -1,5 +1,5 @@ /* O_*, F_*, FD_* bit values for Linux. - Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009 + Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -45,14 +45,17 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 040000 /* Direct disk access. */ +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0200000 /* Must be a directory. */ # define O_NOFOLLOW 0400000 /* Do not follow links. */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 040000 /* Direct disk access. */ +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif + /* For now Linux has synchronisity options for data and read operations. We define the symbols here but let them do the same as O_SYNC since this is a superset. */ @@ -84,7 +87,7 @@ #define F_SETLK64 13 /* Set record locking info (non-blocking). */ #define F_SETLKW64 14 /* Set record locking info (blocking). */ -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -100,6 +103,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h index 03c5ba59b..1dc45b7d8 100644 --- a/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h @@ -1,5 +1,5 @@ /* O_*, F_*, FD_* bit values for Linux/SPARC. - Copyright (C) 1995, 1996, 1997, 1998, 2000, 2003, 2004, 2006, 2007, 2009 + Copyright (C) 1995-1998, 2000, 2003, 2004, 2006, 2007, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -44,12 +44,14 @@ #define O_NDELAY (0x0004 | O_NONBLOCK) #define O_NOCTTY 0x8000 /* not fcntl */ -#ifdef __USE_GNU +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0x10000 /* must be a directory */ # define O_NOFOLLOW 0x20000 /* don't follow links */ +# define O_CLOEXEC 0x400000 /* Set close_on_exit. */ +#endif +#ifdef __USE_GNU # define O_DIRECT 0x100000 /* direct disk access hint */ # define O_NOATIME 0x200000 /* Do not set atime. */ -# define O_CLOEXEC 0x400000 /* Set close_on_exit. */ #endif #ifdef __USE_LARGEFILE64 @@ -68,21 +70,13 @@ # define O_RSYNC O_SYNC /* Synchronize read operations. */ #endif -/* For now Linux has synchronisity options for data and read operations. - We define the symbols here but let them do the same as O_SYNC since - this is a superset. */ -#if defined __USE_POSIX199309 || defined __USE_UNIX98 -# define O_DSYNC O_SYNC /* Synchronize data. */ -# define O_RSYNC O_SYNC /* Synchronize read operations. */ -#endif - /* Values for the second argument to `fcntl'. */ #define F_DUPFD 0 /* Duplicate file descriptor. */ #define F_GETFD 1 /* Get file descriptor flags. */ #define F_SETFD 2 /* Set file descriptor flags. */ #define F_GETFL 3 /* Get file status flags. */ #define F_SETFL 4 /* Set file status flags. */ -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_GETOWN 5 /* Get owner (process receiving SIGIO). */ # define F_SETOWN 6 /* Set owner (process receiving SIGIO). */ #endif @@ -107,6 +101,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h index ee4196764..e474dbe26 100644 --- a/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h +++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h @@ -1,5 +1,5 @@ /* The proper definitions for Linux/SPARC sigaction. - Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. + Copyright (C) 1996-2000, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -58,6 +58,8 @@ struct sigaction three arguments instead of one. */ #if defined __USE_UNIX98 || defined __USE_MISC # define SA_ONSTACK 0x00000001 /* Use signal stack by using `sa_restorer'. */ +#endif +#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8 # define SA_RESTART 0x00000002 /* Restart syscall on signal return. */ # define SA_INTERRUPT 0x00000010 /* Historical no-op. */ # define SA_NOMASK 0x00000020 /* Don't automatically block the signal when diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h index 175fdb857..eaab95a1c 100644 --- a/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1992, 1995-2002, 2006, 2009 Free Software Foundation, Inc. +/* Copyright (C) 1992,1995-2002,2006,2009,2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + /* Versions of the `struct stat' data structure. */ #define _STAT_VER_LINUX_OLD 1 #define _STAT_VER_KERNEL 1 @@ -167,3 +170,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h index 43835081c..aa04e0e49 100644 --- a/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h +++ b/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h @@ -1,5 +1,6 @@ /* O_*, F_*, FD_* bit values for Linux/x86-64. - Copyright (C) 2001,2002,2004,2006,2007,2009 Free Software Foundation, Inc. + Copyright (C) 2001,2002,2004,2006,2007,2009,2010 + Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -45,13 +46,15 @@ #define O_FSYNC O_SYNC #define O_ASYNC 020000 -#ifdef __USE_GNU -# define O_DIRECT 040000 /* Direct disk access. */ +#ifdef __USE_XOPEN2K8 # define O_DIRECTORY 0200000 /* Must be a directory. */ # define O_NOFOLLOW 0400000 /* Do not follow links. */ -# define O_NOATIME 01000000 /* Do not set atime. */ # define O_CLOEXEC 02000000 /* Set close_on_exec. */ #endif +#ifdef __USE_GNU +# define O_DIRECT 040000 /* Direct disk access. */ +# define O_NOATIME 01000000 /* Do not set atime. */ +#endif /* For now Linux has synchronisity options for data and read operations. We define the symbols here but let them do the same as O_SYNC since @@ -98,7 +101,7 @@ # define F_SETLKW64 14 /* Set record locking info (blocking). */ #endif -#if defined __USE_BSD || defined __USE_UNIX98 +#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8 # define F_SETOWN 8 /* Get owner (process receiving SIGIO). */ # define F_GETOWN 9 /* Set owner (process receiving SIGIO). */ #endif @@ -114,6 +117,8 @@ # define F_SETLEASE 1024 /* Set a lease. */ # define F_GETLEASE 1025 /* Enquire what lease is active. */ # define F_NOTIFY 1026 /* Request notfications on a directory. */ +#endif +#ifdef __USE_XOPEN2K8 # define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with close-on-exit set. */ #endif diff --git a/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h b/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h index 9d12315e1..c7e4e1f0d 100644 --- a/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h +++ b/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1999,2000,2001,2002,2003,2009 Free Software Foundation, Inc. +/* Copyright (C) 1999-2003,2009,2010 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,10 +16,13 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ -#ifndef _SYS_STAT_H +#if !defined _SYS_STAT_H && !defined _FCNTL_H # error "Never include <bits/stat.h> directly; use <sys/stat.h> instead." #endif +#ifndef _BITS_STAT_H +#define _BITS_STAT_H 1 + /* Versions of the `struct stat' data structure. */ #define _STAT_VER_KERNEL 0 @@ -206,3 +209,5 @@ struct stat64 # define UTIME_NOW ((1l << 30) - 1l) # define UTIME_OMIT ((1l << 30) - 2l) #endif + +#endif /* bits/stat.h */ diff --git a/libc/sysdeps/x86_64/cacheinfo.c b/libc/sysdeps/x86_64/cacheinfo.c index 5b66c62eb..54220379e 100644 --- a/libc/sysdeps/x86_64/cacheinfo.c +++ b/libc/sysdeps/x86_64/cacheinfo.c @@ -74,6 +74,7 @@ static const struct intel_02_cache_info { 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 }, { 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, { 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 }, + { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 }, { 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, { 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 }, { 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 }, @@ -113,6 +114,7 @@ static const struct intel_02_cache_info { 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, { 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 }, { 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, + { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, { 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 }, { 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 }, { 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 }, @@ -452,9 +454,10 @@ __cache_sysconf (int name) } -/* Half the data cache size for use in memory and string routines, typically +/* Data cache size for use in memory and string routines, typically L1 size. */ long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2; +long int __x86_64_data_cache_size attribute_hidden = 32 * 1024; /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; @@ -657,7 +660,10 @@ init_cacheinfo (void) } if (data > 0) - __x86_64_data_cache_size_half = data / 2; + { + __x86_64_data_cache_size_half = data / 2; + __x86_64_data_cache_size = data; + } if (shared > 0) { diff --git a/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym b/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym index e2021cdf8..eb1538abc 100644 --- a/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym +++ b/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym @@ -13,5 +13,8 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx) CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx) FAMILY_OFFSET offsetof (struct cpu_features, family) MODEL_OFFSET offsetof (struct cpu_features, model) +FEATURE_OFFSET offsetof (struct cpu_features, feature) +FEATURE_SIZE sizeof (unsigned int) COMMON_CPUID_INDEX_1 +FEATURE_INDEX_1 diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.c b/libc/sysdeps/x86_64/multiarch/init-arch.c index 7823aceb9..50b2a38fb 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.c +++ b/libc/sysdeps/x86_64/multiarch/init-arch.c @@ -64,7 +64,23 @@ __init_cpu_features (void) __cpu_features.model += extended_model; } else if (__cpu_features.family == 0x06) - __cpu_features.model += extended_model; + { + __cpu_features.model += extended_model; + switch (__cpu_features.model) + { + case 0x1a: + case 0x1e: + case 0x1f: + case 0x25: + case 0x2e: + case 0x2f: + /* Rep string instructions are fast on Intel Core i3, i5 + and i7. */ + __cpu_features.feature[index_Fast_Rep_String] + |= bit_Fast_Rep_String; + break; + } + } } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.h b/libc/sysdeps/x86_64/multiarch/init-arch.h index 0f8f77a8a..69492cb3b 100644 --- a/libc/sysdeps/x86_64/multiarch/init-arch.h +++ b/libc/sysdeps/x86_64/multiarch/init-arch.h @@ -16,6 +16,8 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#define bit_Fast_Rep_String (1 << 0) + #ifdef __ASSEMBLER__ #include <ifunc-defines.h> @@ -28,6 +30,8 @@ #define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET #define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET +#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE + #else /* __ASSEMBLER__ */ #include <sys/param.h> @@ -39,6 +43,13 @@ enum COMMON_CPUID_INDEX_MAX }; +enum + { + FEATURE_INDEX_1 = 0, + /* Keep the following line at the end. */ + FEATURE_INDEX_MAX + }; + extern struct cpu_features { enum @@ -58,6 +69,7 @@ extern struct cpu_features } cpuid[COMMON_CPUID_INDEX_MAX]; unsigned int family; unsigned int model; + unsigned int feature[FEATURE_INDEX_MAX]; } __cpu_features attribute_hidden; @@ -86,4 +98,6 @@ extern const struct cpu_features *__get_cpu_features (void) #define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) #define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) +#define index_Fast_Rep_String FEATURE_INDEX_1 + #endif /* __ASSEMBLER__ */ diff --git a/libc/sysdeps/x86_64/multiarch/strlen.S b/libc/sysdeps/x86_64/multiarch/strlen.S index 509f9c960..f9641131f 100644 --- a/libc/sysdeps/x86_64/multiarch/strlen.S +++ b/libc/sysdeps/x86_64/multiarch/strlen.S @@ -46,28 +46,58 @@ END(strlen) __strlen_sse42: cfi_startproc CALL_MCOUNT - pxor %xmm2, %xmm2 - movq %rdi, %rcx + pxor %xmm1, %xmm1 + movl %edi, %ecx movq %rdi, %r8 andq $~15, %rdi - movdqa %xmm2, %xmm1 - pcmpeqb (%rdi), %xmm2 - orl $0xffffffff, %esi - subq %rdi, %rcx - shll %cl, %esi - pmovmskb %xmm2, %edx - andl %esi, %edx - jnz 1f - -2: pcmpistri $0x08, 16(%rdi), %xmm1 - leaq 16(%rdi), %rdi - jnz 2b + xor %edi, %ecx + pcmpeqb (%rdi), %xmm1 + pmovmskb %xmm1, %edx + shrl %cl, %edx + shll %cl, %edx + andl %edx, %edx + jnz L(less16bytes) + pxor %xmm1, %xmm1 + .p2align 4 +L(more64bytes_loop): + pcmpistri $0x08, 16(%rdi), %xmm1 + jz L(more32bytes) + + pcmpistri $0x08, 32(%rdi), %xmm1 + jz L(more48bytes) + + pcmpistri $0x08, 48(%rdi), %xmm1 + jz L(more64bytes) + + add $64, %rdi + pcmpistri $0x08, (%rdi), %xmm1 + jnz L(more64bytes_loop) leaq (%rdi,%rcx), %rax subq %r8, %rax ret -1: subq %r8, %rdi + .p2align 4 +L(more32bytes): + leaq 16(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more48bytes): + leaq 32(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(more64bytes): + leaq 48(%rdi,%rcx, 1), %rax + subq %r8, %rax + ret + + .p2align 4 +L(less16bytes): + subq %r8, %rdi bsfl %edx, %eax addq %rdi, %rax ret diff --git a/libc/sysdeps/x86_64/strcmp.S b/libc/sysdeps/x86_64/strcmp.S index 650ec173b..ac3fe1467 100644 --- a/libc/sysdeps/x86_64/strcmp.S +++ b/libc/sysdeps/x86_64/strcmp.S @@ -1,5 +1,5 @@ /* Highly optimized version for x86-64. - Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009 + Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009, 2010 Free Software Foundation, Inc. This file is part of the GNU C Library. Based on i686 version contributed by Ulrich Drepper @@ -33,6 +33,13 @@ #endif #ifdef USE_AS_STRNCMP +/* The simplified code below is not set up to handle strncmp() so far. + Should this become necessary it has to be implemented. For now + just report the problem. */ +# ifdef NOT_IN_lib +# error "strncmp not implemented so far" +# endif + /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz if the new counter > the old one or is 0. */ # define UPDATE_STRNCMP_COUNTER \ @@ -54,7 +61,7 @@ #ifndef USE_SSSE3 .text #else - .section .text.ssse3,"ax",@progbits + .section .text.ssse3,"ax",@progbits #endif ENTRY (BP_SYM (STRCMP)) @@ -80,13 +87,13 @@ END (BP_SYM (STRCMP)) /* * This implementation uses SSE to compare up to 16 bytes at a time. */ -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP test %rdx, %rdx je LABEL(strcmp_exitz) cmp $1, %rdx je LABEL(Byte0) mov %rdx, %r11 -#endif +# endif mov %esi, %ecx mov %edi, %eax /* Use 64bit AND here to avoid long NOP padding. */ @@ -107,10 +114,10 @@ END (BP_SYM (STRCMP)) pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ jnz LABEL(less16bytes) /* If not, find different value or null char */ -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) /* finish comparision */ -#endif +# endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ @@ -184,10 +191,10 @@ LABEL(loop_ashr_0): sub $0xffff, %edx jnz LABEL(exit) /* mismatch or null char seen */ -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa (%rsi, %rcx), %xmm1 movdqa (%rdi, %rcx), %xmm2 @@ -198,10 +205,10 @@ LABEL(loop_ashr_0): pmovmskb %xmm1, %edx sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx jmp LABEL(loop_ashr_0) @@ -249,13 +256,13 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -264,10 +271,10 @@ LABEL(gobble_ashr_1): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -278,13 +285,13 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -293,10 +300,10 @@ LABEL(gobble_ashr_1): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 jmp LABEL(loop_ashr_1) @@ -312,10 +319,10 @@ LABEL(nibble_ashr_1): test $0xfffe, %edx jnz LABEL(ashr_1_exittail) /* find null char*/ -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $14, %r11 jbe LABEL(ashr_1_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 /* substract 4K from %r10 */ @@ -334,7 +341,7 @@ LABEL(ashr_1_exittail): /* * The following cases will be handled by ashr_2 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 */ .p2align 4 @@ -376,13 +383,13 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -391,10 +398,10 @@ LABEL(gobble_ashr_2): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -406,13 +413,13 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -421,10 +428,10 @@ LABEL(gobble_ashr_2): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -437,10 +444,10 @@ LABEL(nibble_ashr_2): test $0xfffc, %edx jnz LABEL(ashr_2_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $13, %r11 jbe LABEL(ashr_2_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -498,13 +505,13 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -513,10 +520,10 @@ LABEL(gobble_ashr_3): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -528,13 +535,13 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -543,10 +550,10 @@ LABEL(gobble_ashr_3): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -559,10 +566,10 @@ LABEL(nibble_ashr_3): test $0xfff8, %edx jnz LABEL(ashr_3_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $12, %r11 jbe LABEL(ashr_3_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -620,13 +627,13 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -635,10 +642,10 @@ LABEL(gobble_ashr_4): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -650,13 +657,13 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -665,10 +672,10 @@ LABEL(gobble_ashr_4): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -681,10 +688,10 @@ LABEL(nibble_ashr_4): test $0xfff0, %edx jnz LABEL(ashr_4_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $11, %r11 jbe LABEL(ashr_4_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -700,7 +707,7 @@ LABEL(ashr_4_exittail): /* * The following cases will be handled by ashr_5 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 */ .p2align 4 LABEL(ashr_5): @@ -742,13 +749,13 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -757,10 +764,10 @@ LABEL(gobble_ashr_5): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -772,13 +779,13 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -787,10 +794,10 @@ LABEL(gobble_ashr_5): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -803,10 +810,10 @@ LABEL(nibble_ashr_5): test $0xffe0, %edx jnz LABEL(ashr_5_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $10, %r11 jbe LABEL(ashr_5_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -822,7 +829,7 @@ LABEL(ashr_5_exittail): /* * The following cases will be handled by ashr_6 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 */ .p2align 4 LABEL(ashr_6): @@ -864,13 +871,13 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -879,10 +886,10 @@ LABEL(gobble_ashr_6): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -894,13 +901,13 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -909,10 +916,10 @@ LABEL(gobble_ashr_6): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -925,10 +932,10 @@ LABEL(nibble_ashr_6): test $0xffc0, %edx jnz LABEL(ashr_6_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $9, %r11 jbe LABEL(ashr_6_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -944,7 +951,7 @@ LABEL(ashr_6_exittail): /* * The following cases will be handled by ashr_7 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 */ .p2align 4 LABEL(ashr_7): @@ -986,13 +993,13 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1001,10 +1008,10 @@ LABEL(gobble_ashr_7): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1016,13 +1023,13 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1031,10 +1038,10 @@ LABEL(gobble_ashr_7): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1047,10 +1054,10 @@ LABEL(nibble_ashr_7): test $0xff80, %edx jnz LABEL(ashr_7_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $8, %r11 jbe LABEL(ashr_7_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1066,7 +1073,7 @@ LABEL(ashr_7_exittail): /* * The following cases will be handled by ashr_8 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 */ .p2align 4 LABEL(ashr_8): @@ -1108,13 +1115,13 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1123,10 +1130,10 @@ LABEL(gobble_ashr_8): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1138,13 +1145,13 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1153,10 +1160,10 @@ LABEL(gobble_ashr_8): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1169,10 +1176,10 @@ LABEL(nibble_ashr_8): test $0xff00, %edx jnz LABEL(ashr_8_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $7, %r11 jbe LABEL(ashr_8_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1188,7 +1195,7 @@ LABEL(ashr_8_exittail): /* * The following cases will be handled by ashr_9 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 */ .p2align 4 LABEL(ashr_9): @@ -1230,13 +1237,13 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1245,10 +1252,10 @@ LABEL(gobble_ashr_9): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1260,13 +1267,13 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1275,10 +1282,10 @@ LABEL(gobble_ashr_9): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 /* store for next cycle */ @@ -1291,10 +1298,10 @@ LABEL(nibble_ashr_9): test $0xfe00, %edx jnz LABEL(ashr_9_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $6, %r11 jbe LABEL(ashr_9_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1310,7 +1317,7 @@ LABEL(ashr_9_exittail): /* * The following cases will be handled by ashr_10 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 */ .p2align 4 LABEL(ashr_10): @@ -1352,13 +1359,13 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1367,10 +1374,10 @@ LABEL(gobble_ashr_10): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1382,13 +1389,13 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1397,10 +1404,10 @@ LABEL(gobble_ashr_10): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1413,10 +1420,10 @@ LABEL(nibble_ashr_10): test $0xfc00, %edx jnz LABEL(ashr_10_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $5, %r11 jbe LABEL(ashr_10_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1432,7 +1439,7 @@ LABEL(ashr_10_exittail): /* * The following cases will be handled by ashr_11 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 */ .p2align 4 LABEL(ashr_11): @@ -1474,13 +1481,13 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1489,10 +1496,10 @@ LABEL(gobble_ashr_11): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1504,13 +1511,13 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1519,10 +1526,10 @@ LABEL(gobble_ashr_11): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1535,10 +1542,10 @@ LABEL(nibble_ashr_11): test $0xf800, %edx jnz LABEL(ashr_11_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $4, %r11 jbe LABEL(ashr_11_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1554,7 +1561,7 @@ LABEL(ashr_11_exittail): /* * The following cases will be handled by ashr_12 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 */ .p2align 4 LABEL(ashr_12): @@ -1596,13 +1603,13 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1611,10 +1618,10 @@ LABEL(gobble_ashr_12): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1626,13 +1633,13 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1641,10 +1648,10 @@ LABEL(gobble_ashr_12): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1657,10 +1664,10 @@ LABEL(nibble_ashr_12): test $0xf000, %edx jnz LABEL(ashr_12_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $3, %r11 jbe LABEL(ashr_12_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1676,7 +1683,7 @@ LABEL(ashr_12_exittail): /* * The following cases will be handled by ashr_13 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 */ .p2align 4 LABEL(ashr_13): @@ -1718,13 +1725,13 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1733,10 +1740,10 @@ LABEL(gobble_ashr_13): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1748,13 +1755,13 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1763,10 +1770,10 @@ LABEL(gobble_ashr_13): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1779,10 +1786,10 @@ LABEL(nibble_ashr_13): test $0xe000, %edx jnz LABEL(ashr_13_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $2, %r11 jbe LABEL(ashr_13_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1798,7 +1805,7 @@ LABEL(ashr_13_exittail): /* * The following cases will be handled by ashr_14 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 */ .p2align 4 LABEL(ashr_14): @@ -1840,13 +1847,13 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1855,10 +1862,10 @@ LABEL(gobble_ashr_14): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1870,13 +1877,13 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1885,10 +1892,10 @@ LABEL(gobble_ashr_14): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1901,10 +1908,10 @@ LABEL(nibble_ashr_14): test $0xc000, %edx jnz LABEL(ashr_14_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP cmp $1, %r11 jbe LABEL(ashr_14_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -1920,7 +1927,7 @@ LABEL(ashr_14_exittail): /* * The following cases will be handled by ashr_15 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 */ .p2align 4 LABEL(ashr_15): @@ -1964,13 +1971,13 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -1979,10 +1986,10 @@ LABEL(gobble_ashr_15): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -1994,13 +2001,13 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 +# ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else +# else palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif +# endif pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm2, %xmm1 @@ -2009,10 +2016,10 @@ LABEL(gobble_ashr_15): sub $0xffff, %edx jnz LABEL(exit) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub $16, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif add $16, %rcx movdqa %xmm4, %xmm3 @@ -2025,10 +2032,10 @@ LABEL(nibble_ashr_15): test $0x8000, %edx jnz LABEL(ashr_15_exittail) -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP test %r11, %r11 je LABEL(ashr_15_exittail) -#endif +# endif pxor %xmm0, %xmm0 sub $0x1000, %r10 @@ -2062,10 +2069,10 @@ LABEL(ret): LABEL(less16bytes): bsf %rdx, %rdx /* find and store bit index in %rdx */ -#ifdef USE_AS_STRNCMP +# ifdef USE_AS_STRNCMP sub %rdx, %r11 jbe LABEL(strcmp_exitz) -#endif +# endif movzbl (%rsi, %rdx), %ecx movzbl (%rdi, %rdx), %eax |