summaryrefslogtreecommitdiff
path: root/libc/sysdeps
diff options
context:
space:
mode:
authorjoseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>2010-01-26 11:27:38 +0000
committerjoseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>2010-01-26 11:27:38 +0000
commit75a8dd396d88f79e8e2660fefe5bd93cbff71f55 (patch)
treee7ff701c0888ad2e6d527fd0b3302bcc3663d3dd /libc/sysdeps
parent42bc5058cebfefa9329005e1b3bc0525f7f7b67b (diff)
downloadeglibc2-75a8dd396d88f79e8e2660fefe5bd93cbff71f55.tar.gz
Merge changes between r9569 and r9736 from /fsf/trunk.
git-svn-id: svn://svn.eglibc.org/trunk@9737 7b3dc134-2b1b-0410-93df-9e9f96275f8d
Diffstat (limited to 'libc/sysdeps')
-rw-r--r--libc/sysdeps/generic/ldsodefs.h5
-rw-r--r--libc/sysdeps/generic/netinet/ip.h43
-rw-r--r--libc/sysdeps/i386/i686/bcopy.S3
-rw-r--r--libc/sysdeps/i386/i686/cacheinfo.c1
-rw-r--r--libc/sysdeps/i386/i686/memcpy.S4
-rw-r--r--libc/sysdeps/i386/i686/memmove.S27
-rw-r--r--libc/sysdeps/i386/i686/mempcpy.S4
-rw-r--r--libc/sysdeps/i386/i686/memset.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/Makefile4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bcopy.S89
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S3
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bzero-sse2.S3
-rw-r--r--libc/sysdeps/i386/i686/multiarch/bzero.S97
-rw-r--r--libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym3
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S1785
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S1737
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memcpy.S90
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memcpy_chk.S64
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memmove.S117
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memmove_chk.S112
-rw-r--r--libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S4
-rw-r--r--libc/sysdeps/i386/i686/multiarch/mempcpy.S93
-rw-r--r--libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S64
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S821
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memset-sse2.S867
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memset.S112
-rw-r--r--libc/sysdeps/i386/i686/multiarch/memset_chk.S116
-rw-r--r--libc/sysdeps/i386/sysdep.h3
-rw-r--r--libc/sysdeps/ieee754/ldbl-128/s_ceill.c4
-rw-r--r--libc/sysdeps/ieee754/ldbl-128/s_expm1l.c2
-rw-r--r--libc/sysdeps/ieee754/ldbl-128/s_floorl.c4
-rw-r--r--libc/sysdeps/ieee754/ldbl-128/s_log1pl.c1
-rw-r--r--libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c17
-rw-r--r--libc/sysdeps/mach/hurd/bits/libc-lock.h3
-rw-r--r--libc/sysdeps/mach/hurd/bits/stat.h12
-rw-r--r--libc/sysdeps/powerpc/powerpc32/cell/memcpy.S245
-rw-r--r--libc/sysdeps/powerpc/powerpc64/cell/memcpy.S245
-rw-r--r--libc/sysdeps/s390/s390-32/dl-machine.h7
-rw-r--r--libc/sysdeps/s390/s390-32/elf/start.S82
-rw-r--r--libc/sysdeps/unix/bsd/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/bits/sigaction.h4
-rw-r--r--libc/sysdeps/unix/sysv/linux/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h14
-rw-r--r--libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h15
-rw-r--r--libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h4
-rw-r--r--libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h14
-rw-r--r--libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies3
-rw-r--r--libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies1
-rw-r--r--libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h15
-rw-r--r--libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h4
-rw-r--r--libc/sysdeps/unix/sysv/linux/s390/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c2
-rw-r--r--libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c2
-rw-r--r--libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h15
-rw-r--r--libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h20
-rw-r--r--libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h4
-rw-r--r--libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h9
-rw-r--r--libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h15
-rw-r--r--libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h9
-rw-r--r--libc/sysdeps/x86_64/cacheinfo.c10
-rw-r--r--libc/sysdeps/x86_64/multiarch/ifunc-defines.sym3
-rw-r--r--libc/sysdeps/x86_64/multiarch/init-arch.c18
-rw-r--r--libc/sysdeps/x86_64/multiarch/init-arch.h14
-rw-r--r--libc/sysdeps/x86_64/multiarch/strlen.S60
-rw-r--r--libc/sysdeps/x86_64/strcmp.S415
73 files changed, 7311 insertions, 330 deletions
diff --git a/libc/sysdeps/generic/ldsodefs.h b/libc/sysdeps/generic/ldsodefs.h
index e18e60f73..230c39a63 100644
--- a/libc/sysdeps/generic/ldsodefs.h
+++ b/libc/sysdeps/generic/ldsodefs.h
@@ -1,5 +1,5 @@
/* Run-time dynamic linker data structures for loaded ELF shared objects.
- Copyright (C) 1995-2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+ Copyright (C) 1995-2009, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -1015,7 +1015,8 @@ extern void *_dl_sysdep_read_whole_file (const char *file, size_t *sizep,
extern ElfW(Addr) _dl_sysdep_start (void **start_argptr,
void (*dl_main) (const ElfW(Phdr) *phdr,
ElfW(Word) phnum,
- ElfW(Addr) *user_entry))
+ ElfW(Addr) *user_entry,
+ ElfW(auxv_t) *auxv))
attribute_hidden;
extern void _dl_sysdep_start_cleanup (void)
diff --git a/libc/sysdeps/generic/netinet/ip.h b/libc/sysdeps/generic/netinet/ip.h
index 38bd7556d..a837b9814 100644
--- a/libc/sysdeps/generic/netinet/ip.h
+++ b/libc/sysdeps/generic/netinet/ip.h
@@ -189,7 +189,26 @@ struct ip_timestamp
#define IPTOS_DSCP_EF 0xb8
/*
- * Definitions for IP type of service (ip_tos)
+ * In RFC 2474, Section 4.2.2.1, the Class Selector Codepoints subsume
+ * the old ToS Precedence values.
+ */
+
+#define IPTOS_CLASS_MASK 0xe0
+#define IPTOS_CLASS(class) ((class) & IPTOS_CLASS_MASK) /* Mask the macro's own argument. */
+#define IPTOS_CLASS_CS0 0x00
+#define IPTOS_CLASS_CS1 0x20
+#define IPTOS_CLASS_CS2 0x40
+#define IPTOS_CLASS_CS3 0x60
+#define IPTOS_CLASS_CS4 0x80
+#define IPTOS_CLASS_CS5 0xa0
+#define IPTOS_CLASS_CS6 0xc0
+#define IPTOS_CLASS_CS7 0xe0
+
+#define IPTOS_CLASS_DEFAULT IPTOS_CLASS_CS0
+
+/*
+ * Definitions for IP type of service (ip_tos) [deprecated; use DSCP
+ * and CS definitions above instead.]
*/
#define IPTOS_TOS_MASK 0x1E
#define IPTOS_TOS(tos) ((tos) & IPTOS_TOS_MASK)
@@ -200,18 +219,18 @@ struct ip_timestamp
#define IPTOS_MINCOST IPTOS_LOWCOST
/*
- * Definitions for IP precedence (also in ip_tos) (hopefully unused)
+ * Definitions for IP precedence (also in ip_tos) [also deprecated.]
*/
-#define IPTOS_PREC_MASK 0xe0
-#define IPTOS_PREC(tos) ((tos) & IPTOS_PREC_MASK)
-#define IPTOS_PREC_NETCONTROL 0xe0
-#define IPTOS_PREC_INTERNETCONTROL 0xc0
-#define IPTOS_PREC_CRITIC_ECP 0xa0
-#define IPTOS_PREC_FLASHOVERRIDE 0x80
-#define IPTOS_PREC_FLASH 0x60
-#define IPTOS_PREC_IMMEDIATE 0x40
-#define IPTOS_PREC_PRIORITY 0x20
-#define IPTOS_PREC_ROUTINE 0x00
+#define IPTOS_PREC_MASK IPTOS_CLASS_MASK
+#define IPTOS_PREC(tos) IPTOS_CLASS(tos)
+#define IPTOS_PREC_NETCONTROL IPTOS_CLASS_CS7
+#define IPTOS_PREC_INTERNETCONTROL IPTOS_CLASS_CS6
+#define IPTOS_PREC_CRITIC_ECP IPTOS_CLASS_CS5
+#define IPTOS_PREC_FLASHOVERRIDE IPTOS_CLASS_CS4
+#define IPTOS_PREC_FLASH IPTOS_CLASS_CS3
+#define IPTOS_PREC_IMMEDIATE IPTOS_CLASS_CS2
+#define IPTOS_PREC_PRIORITY IPTOS_CLASS_CS1
+#define IPTOS_PREC_ROUTINE IPTOS_CLASS_CS0
/*
* Definitions for options.
diff --git a/libc/sysdeps/i386/i686/bcopy.S b/libc/sysdeps/i386/i686/bcopy.S
new file mode 100644
index 000000000..15ef9419a
--- /dev/null
+++ b/libc/sysdeps/i386/i686/bcopy.S
@@ -0,0 +1,3 @@
+#define USE_AS_BCOPY
+#define memmove bcopy
+#include <sysdeps/i386/i686/memmove.S>
diff --git a/libc/sysdeps/i386/i686/cacheinfo.c b/libc/sysdeps/i386/i686/cacheinfo.c
index 82e4cd223..f8b7f521c 100644
--- a/libc/sysdeps/i386/i686/cacheinfo.c
+++ b/libc/sysdeps/i386/i686/cacheinfo.c
@@ -1,3 +1,4 @@
+#define __x86_64_data_cache_size __x86_data_cache_size
#define __x86_64_data_cache_size_half __x86_data_cache_size_half
#define __x86_64_shared_cache_size __x86_shared_cache_size
#define __x86_64_shared_cache_size_half __x86_shared_cache_size_half
diff --git a/libc/sysdeps/i386/i686/memcpy.S b/libc/sysdeps/i386/i686/memcpy.S
index 0b2da1ea2..86ee082be 100644
--- a/libc/sysdeps/i386/i686/memcpy.S
+++ b/libc/sysdeps/i386/i686/memcpy.S
@@ -32,11 +32,11 @@
.text
#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
#endif
ENTRY (BP_SYM (memcpy))
ENTER
diff --git a/libc/sysdeps/i386/i686/memmove.S b/libc/sysdeps/i386/i686/memmove.S
index b93b5c729..981f14f4e 100644
--- a/libc/sysdeps/i386/i686/memmove.S
+++ b/libc/sysdeps/i386/i686/memmove.S
@@ -26,18 +26,27 @@
#define PARMS LINKAGE+4 /* one spilled register */
#define RTN PARMS
-#define DEST RTN+RTN_SIZE
-#define SRC DEST+PTR_SIZE
-#define LEN SRC+PTR_SIZE
.text
-#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memmove_chk)
+
+#ifdef USE_AS_BCOPY
+# define SRC RTN+RTN_SIZE
+# define DEST SRC+PTR_SIZE
+# define LEN DEST+PTR_SIZE
+#else
+# define DEST RTN+RTN_SIZE
+# define SRC DEST+PTR_SIZE
+# define LEN SRC+PTR_SIZE
+
+# if defined PIC && !defined NOT_IN_libc
+ENTRY_CHK (__memmove_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk)
+END_CHK (__memmove_chk)
+# endif
#endif
+
ENTRY (BP_SYM (memmove))
ENTER
@@ -69,8 +78,10 @@ ENTRY (BP_SYM (memmove))
movsl
movl %edx, %esi
cfi_restore (esi)
+#ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
RETURN_BOUNDED_POINTER (DEST(%esp))
+#endif
popl %edi
cfi_adjust_cfa_offset (-4)
@@ -101,8 +112,10 @@ ENTRY (BP_SYM (memmove))
movsl
movl %edx, %esi
cfi_restore (esi)
+#ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
RETURN_BOUNDED_POINTER (DEST(%esp))
+#endif
cld
popl %edi
@@ -112,4 +125,6 @@ ENTRY (BP_SYM (memmove))
LEAVE
RET_PTR
END (BP_SYM (memmove))
+#ifndef USE_AS_BCOPY
libc_hidden_builtin_def (memmove)
+#endif
diff --git a/libc/sysdeps/i386/i686/mempcpy.S b/libc/sysdeps/i386/i686/mempcpy.S
index 6437e4a5d..c10686fb3 100644
--- a/libc/sysdeps/i386/i686/mempcpy.S
+++ b/libc/sysdeps/i386/i686/mempcpy.S
@@ -32,11 +32,11 @@
.text
#if defined PIC && !defined NOT_IN_libc
-ENTRY (__mempcpy_chk)
+ENTRY_CHK (__mempcpy_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk)
+END_CHK (__mempcpy_chk)
#endif
ENTRY (BP_SYM (__mempcpy))
ENTER
diff --git a/libc/sysdeps/i386/i686/memset.S b/libc/sysdeps/i386/i686/memset.S
index dfa1aa701..b343af7b6 100644
--- a/libc/sysdeps/i386/i686/memset.S
+++ b/libc/sysdeps/i386/i686/memset.S
@@ -40,11 +40,11 @@
.text
#if defined PIC && !defined NOT_IN_libc && !BZERO_P
-ENTRY (__memset_chk)
+ENTRY_CHK (__memset_chk)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk)
+END_CHK (__memset_chk)
#endif
ENTRY (BP_SYM (memset))
ENTER
diff --git a/libc/sysdeps/i386/i686/multiarch/Makefile b/libc/sysdeps/i386/i686/multiarch/Makefile
index e1553b284..fbad9ae73 100644
--- a/libc/sysdeps/i386/i686/multiarch/Makefile
+++ b/libc/sysdeps/i386/i686/multiarch/Makefile
@@ -4,6 +4,10 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
+sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \
+ memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \
+ memset-sse2-rep bzero-sse2-rep
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
new file mode 100644
index 000000000..cbc8b420e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
new file mode 100644
index 000000000..36aac44b9
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bcopy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_ssse3
+#include "memcpy-ssse3.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/bcopy.S b/libc/sysdeps/i386/i686/multiarch/bcopy.S
new file mode 100644
index 000000000..8671bf684
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bcopy.S
@@ -0,0 +1,89 @@
+/* Multiple versions of bcopy
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(bcopy)
+ .type bcopy, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __bcopy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __bcopy_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __bcopy_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(bcopy)
+# else
+ .text
+ENTRY(bcopy)
+ .type bcopy, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features
+1: leal __bcopy_ia32, %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+ jz 2f
+ leal __bcopy_ssse3, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f
+ leal __bcopy_ssse3_rep, %eax
+2: ret
+END(bcopy)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bcopy_ia32, @function; \
+ .p2align 4; \
+ __bcopy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32
+
+#endif
+
+#include "../bcopy.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
new file mode 100644
index 000000000..507b288bb
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2_rep __bzero_sse2_rep
+#include "memset-sse2-rep.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S b/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S
new file mode 100644
index 000000000..8d04512e4
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bzero-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_sse2 __bzero_sse2
+#include "memset-sse2.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/bzero.S b/libc/sysdeps/i386/i686/multiarch/bzero.S
new file mode 100644
index 000000000..8c740a42d
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/bzero.S
@@ -0,0 +1,97 @@
+/* Multiple versions of bzero
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__bzero)
+ .type __bzero, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __bzero_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __bzero_sse2@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __bzero_sse2_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__bzero)
+# else
+ .text
+ENTRY(__bzero)
+ .type __bzero, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features
+1: leal __bzero_ia32, %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f
+ leal __bzero_sse2, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f
+ leal __bzero_sse2_rep, %eax
+2: ret
+END(__bzero)
+# endif
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __bzero_ia32, @function; \
+ .p2align 4; \
+ __bzero_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __bzero_ia32, .-__bzero_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___bzero; __GI___bzero = __bzero_ia32
+# endif
+#endif
+
+#include "../bzero.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index e2021cdf8..eb1538abc 100644
--- a/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/libc/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -13,5 +13,8 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx)
CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx)
FAMILY_OFFSET offsetof (struct cpu_features, family)
MODEL_OFFSET offsetof (struct cpu_features, model)
+FEATURE_OFFSET offsetof (struct cpu_features, feature)
+FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+FEATURE_INDEX_1
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
new file mode 100644
index 000000000..b26037d27
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -0,0 +1,1785 @@
+/* memcpy with SSSE3 and REP string.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_rep
+# define MEMCPY_CHK __memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register that contains
+ the index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+ addl $(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+#else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register that contains the index into
+ the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+ cmp %eax, %edx
+ jb L(copy_forward)
+ je L(fwd_write_0bytes)
+ cmp $32, %ecx
+ jge L(memmove_bwd)
+ jmp L(bk_write_less32bytes_2)
+L(memmove_bwd):
+ add %ecx, %eax
+ cmp %eax, %edx
+ movl SRC(%esp), %eax
+ jb L(copy_backward)
+
+L(copy_forward):
+#endif
+ cmp $48, %ecx
+ jge L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jl L(bk_write)
+#endif
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+ ALIGN (4)
+/* ECX >= 48 here; EDX is rounded up to the next 16-byte boundary below. */
+L(48bytesormore):
+ movdqu (%eax), %xmm0
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ PUSH (%esi)
+ add $16, %edx
+ movl %edi, %esi
+ sub %edx, %edi
+ add %edi, %ecx
+ sub %edi, %eax
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+ mov %eax, %edi
+ jge L(large_page)
+ and $0xf, %edi
+ jz L(shl_0)
+
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+ ALIGN (4)
+L(shl_0):
+ movdqu %xmm0, (%esi)
+ xor %edi, %edi
+ cmp $127, %ecx
+ ja L(shl_0_gobble)
+ lea -32(%ecx), %ecx
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+L(shl_0_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ add %edi, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+L(shl_0_gobble):
+
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi
+# else
+ mov __x86_data_cache_size_half, %edi
+# endif
+#endif
+ mov %edi, %esi
+ shr $3, %esi
+ sub %esi, %edi
+ cmp %edi, %ecx
+ jge L(shl_0_gobble_mem_start)
+ lea -128(%ecx), %ecx
+ ALIGN (4)
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jge L(shl_0_gobble_cache_loop)
+L(shl_0_gobble_cache_loop_tail):
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jl L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jl L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_0_gobble_mem_start):
+ cmp %al, %dl
+ je L(copy_page_by_rep)
+ lea -128(%ecx), %ecx
+L(shl_0_gobble_mem_loop):
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ prefetchnta 0x1c0(%edx)
+ prefetchnta 0x280(%edx)
+
+ movdqa (%eax), %xmm0
+ movaps 0x10(%eax), %xmm1
+ movaps 0x20(%eax), %xmm2
+ movaps 0x30(%eax), %xmm3
+ movaps 0x40(%eax), %xmm4
+ movaps 0x50(%eax), %xmm5
+ movaps 0x60(%eax), %xmm6
+ movaps 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm1, 0x10(%edx)
+ movaps %xmm2, 0x20(%edx)
+ movaps %xmm3, 0x30(%edx)
+ movaps %xmm4, 0x40(%edx)
+ movaps %xmm5, 0x50(%edx)
+ movaps %xmm6, 0x60(%edx)
+ movaps %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jge L(shl_0_gobble_mem_loop)
+ cmp $-0x40, %ecx
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jl L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jl L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx
+ add %ecx, %eax
+ POP (%esi)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_1):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -1(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_1_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_1_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_1_loop)
+
+L(shl_1_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -2(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi)
+ POP (%esi)
+L(shl_2_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_2_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_2_loop)
+
+L(shl_2_end):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_3): /* Forward copy for a source 3 bytes past 16-byte alignment (entry 3 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -3(%eax), %eax /* Step the source back 3 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 13 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_3_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_3_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_3_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_3_loop)
+
+L(shl_3_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 3(%edi, %eax), %eax /* EAX likewise, plus the 3 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_4): /* Forward copy for a source 4 bytes past 16-byte alignment (entry 4 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -4(%eax), %eax /* Step the source back 4 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 12 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_4_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_4_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_4_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_4_loop)
+
+L(shl_4_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 4(%edi, %eax), %eax /* EAX likewise, plus the 4 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_5): /* Forward copy for a source 5 bytes past 16-byte alignment (entry 5 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -5(%eax), %eax /* Step the source back 5 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 11 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_5_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_5_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_5_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_5_loop)
+
+L(shl_5_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 5(%edi, %eax), %eax /* EAX likewise, plus the 5 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+
+ ALIGN (4)
+L(shl_6): /* Forward copy for a source 6 bytes past 16-byte alignment (entry 6 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -6(%eax), %eax /* Step the source back 6 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 10 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_6_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_6_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_6_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_6_loop)
+
+L(shl_6_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 6(%edi, %eax), %eax /* EAX likewise, plus the 6 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_7): /* Forward copy for a source 7 bytes past 16-byte alignment (entry 7 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -7(%eax), %eax /* Step the source back 7 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 9 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_7_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_7_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_7_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_7_loop)
+
+L(shl_7_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 7(%edi, %eax), %eax /* EAX likewise, plus the 7 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_8): /* Forward copy for a source 8 bytes past 16-byte alignment (entry 8 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -8(%eax), %eax /* Step the source back 8 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 8 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_8_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_8_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_8_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_8_loop)
+
+L(shl_8_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 8(%edi, %eax), %eax /* EAX likewise, plus the 8 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_9): /* Forward copy for a source 9 bytes past 16-byte alignment (entry 9 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -9(%eax), %eax /* Step the source back 9 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 7 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_9_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_9_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_9_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_9_loop)
+
+L(shl_9_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 9(%edi, %eax), %eax /* EAX likewise, plus the 9 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_10): /* Forward copy for a source 10 bytes past 16-byte alignment (entry 10 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -10(%eax), %eax /* Step the source back 10 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 6 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_10_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_10_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_10_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_10_loop)
+
+L(shl_10_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 10(%edi, %eax), %eax /* EAX likewise, plus the 10 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_11): /* Forward copy for a source 11 bytes past 16-byte alignment (entry 11 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -11(%eax), %eax /* Step the source back 11 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 5 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_11_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_11_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_11_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_11_loop)
+
+L(shl_11_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 11(%edi, %eax), %eax /* EAX likewise, plus the 11 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_12): /* Forward copy for a source 12 bytes past 16-byte alignment (entry 12 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -12(%eax), %eax /* Step the source back 12 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 4 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_12_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_12_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_12_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_12_loop)
+
+L(shl_12_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 12(%edi, %eax), %eax /* EAX likewise, plus the 12 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_13): /* Forward copy for a source 13 bytes past 16-byte alignment (entry 13 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -13(%eax), %eax /* Step the source back 13 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 3 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_13_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_13_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_13_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_13_loop)
+
+L(shl_13_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 13(%edi, %eax), %eax /* EAX likewise, plus the 13 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(shl_14): /* Forward copy for a source 14 bytes past 16-byte alignment (entry 14 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -14(%eax), %eax /* Step the source back 14 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its upper 2 bytes feed the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_14_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_14_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_14_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_14_loop)
+
+L(shl_14_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 14(%edi, %eax), %eax /* EAX likewise, plus the 14 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+
+ ALIGN (4)
+L(shl_15): /* Forward copy for a source 15 bytes past 16-byte alignment (entry 15 of L(shl_table)). */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ lea -15(%eax), %eax /* Step the source back 15 bytes so the movaps/movdqa loads are aligned. */
+ movaps (%eax), %xmm1 /* First aligned chunk; its top byte feeds the first palignr. */
+ xor %edi, %edi /* EDI = byte offset added to both EAX and EDX in the loop. */
+ lea -32(%ecx), %ecx /* Bias the count by -32; L(shl_15_end) adds it back. */
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ POP (%esi)
+L(shl_15_loop): /* Two unrolled halves of 32 bytes each; xmm1 and xmm4 alternate as the carried chunk. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jl below; the SSE moves and lea do not alter them. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4 /* Save the unshifted chunk as carry for the second half. */
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_15_end)
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx /* Sets the flags for jae below. */
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Save the unshifted chunk as carry for the next first half. */
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_15_loop)
+
+L(shl_15_end):
+ lea 32(%ecx), %ecx /* Remove the -32 bias; ECX is the tail length. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX += bytes copied by the loop + tail length. */
+ lea 15(%edi, %eax), %eax /* EAX likewise, plus the 15 bytes subtracted on entry. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+
+ ALIGN (4)
+L(fwd_write_44bytes): /* Forward tail copy for counts divisible by 4: each label moves one dword and falls through. */
+ movl -44(%eax), %ecx /* Offsets are negative: EAX/EDX point just past the source/destination tail. */
+ movl %ecx, -44(%edx)
+L(fwd_write_40bytes):
+ movl -40(%eax), %ecx
+ movl %ecx, -40(%edx)
+L(fwd_write_36bytes):
+ movl -36(%eax), %ecx
+ movl %ecx, -36(%edx)
+L(fwd_write_32bytes):
+ movl -32(%eax), %ecx
+ movl %ecx, -32(%edx)
+L(fwd_write_28bytes):
+ movl -28(%eax), %ecx
+ movl %ecx, -28(%edx)
+L(fwd_write_24bytes):
+ movl -24(%eax), %ecx
+ movl %ecx, -24(%edx)
+L(fwd_write_20bytes):
+ movl -20(%eax), %ecx
+ movl %ecx, -20(%edx)
+L(fwd_write_16bytes):
+ movl -16(%eax), %ecx
+ movl %ecx, -16(%edx)
+L(fwd_write_12bytes):
+ movl -12(%eax), %ecx
+ movl %ecx, -12(%edx)
+L(fwd_write_8bytes):
+ movl -8(%eax), %ecx
+ movl %ecx, -8(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+L(fwd_write_0bytes): /* Nothing left to copy; only the return value remains. */
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return EDX, the address just past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Otherwise return the DEST argument reloaded from the stack. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_5bytes): /* Copy a 5-byte tail with two overlapping dword moves (bytes -5..-2 and -4..-1). */
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax /* Both loads are done before any store; EAX is reused for data from here on. */
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return EDX, the address just past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Otherwise return the DEST argument reloaded from the stack. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_45bytes): /* Forward tail copy for counts of the form 4k+1: dwords first, then one final byte. */
+ movl -45(%eax), %ecx /* Offsets are negative: EAX/EDX point just past the source/destination tail. */
+ movl %ecx, -45(%edx)
+L(fwd_write_41bytes):
+ movl -41(%eax), %ecx
+ movl %ecx, -41(%edx)
+L(fwd_write_37bytes):
+ movl -37(%eax), %ecx
+ movl %ecx, -37(%edx)
+L(fwd_write_33bytes):
+ movl -33(%eax), %ecx
+ movl %ecx, -33(%edx)
+L(fwd_write_29bytes):
+ movl -29(%eax), %ecx
+ movl %ecx, -29(%edx)
+L(fwd_write_25bytes):
+ movl -25(%eax), %ecx
+ movl %ecx, -25(%edx)
+L(fwd_write_21bytes):
+ movl -21(%eax), %ecx
+ movl %ecx, -21(%edx)
+L(fwd_write_17bytes):
+ movl -17(%eax), %ecx
+ movl %ecx, -17(%edx)
+L(fwd_write_13bytes):
+ movl -13(%eax), %ecx
+ movl %ecx, -13(%edx)
+L(fwd_write_9bytes):
+ movl -9(%eax), %ecx
+ movl %ecx, -9(%edx)
+ movl -5(%eax), %ecx /* L(fwd_write_5bytes) is a separate routine, so bytes -5..-2 are copied inline here. */
+ movl %ecx, -5(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return EDX, the address just past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Otherwise return the DEST argument reloaded from the stack. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_46bytes): /* Forward tail copy for counts of the form 4k+2: dwords first, then one final word. */
+ movl -46(%eax), %ecx /* Offsets are negative: EAX/EDX point just past the source/destination tail. */
+ movl %ecx, -46(%edx)
+L(fwd_write_42bytes):
+ movl -42(%eax), %ecx
+ movl %ecx, -42(%edx)
+L(fwd_write_38bytes):
+ movl -38(%eax), %ecx
+ movl %ecx, -38(%edx)
+L(fwd_write_34bytes):
+ movl -34(%eax), %ecx
+ movl %ecx, -34(%edx)
+L(fwd_write_30bytes):
+ movl -30(%eax), %ecx
+ movl %ecx, -30(%edx)
+L(fwd_write_26bytes):
+ movl -26(%eax), %ecx
+ movl %ecx, -26(%edx)
+L(fwd_write_22bytes):
+ movl -22(%eax), %ecx
+ movl %ecx, -22(%edx)
+L(fwd_write_18bytes):
+ movl -18(%eax), %ecx
+ movl %ecx, -18(%edx)
+L(fwd_write_14bytes):
+ movl -14(%eax), %ecx
+ movl %ecx, -14(%edx)
+L(fwd_write_10bytes):
+ movl -10(%eax), %ecx
+ movl %ecx, -10(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return EDX, the address just past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Otherwise return the DEST argument reloaded from the stack. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_47bytes): /* Forward tail copy for counts of the form 4k+3: dwords first, then a word and a byte. */
+ movl -47(%eax), %ecx /* Offsets are negative: EAX/EDX point just past the source/destination tail. */
+ movl %ecx, -47(%edx)
+L(fwd_write_43bytes):
+ movl -43(%eax), %ecx
+ movl %ecx, -43(%edx)
+L(fwd_write_39bytes):
+ movl -39(%eax), %ecx
+ movl %ecx, -39(%edx)
+L(fwd_write_35bytes):
+ movl -35(%eax), %ecx
+ movl %ecx, -35(%edx)
+L(fwd_write_31bytes):
+ movl -31(%eax), %ecx
+ movl %ecx, -31(%edx)
+L(fwd_write_27bytes):
+ movl -27(%eax), %ecx
+ movl %ecx, -27(%edx)
+L(fwd_write_23bytes):
+ movl -23(%eax), %ecx
+ movl %ecx, -23(%edx)
+L(fwd_write_19bytes):
+ movl -19(%eax), %ecx
+ movl %ecx, -19(%edx)
+L(fwd_write_15bytes):
+ movl -15(%eax), %ecx
+ movl %ecx, -15(%edx)
+L(fwd_write_11bytes):
+ movl -11(%eax), %ecx
+ movl %ecx, -11(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx
+ movzbl -1(%eax), %eax /* Both loads are done before any store; EAX is reused for data from here on. */
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* mempcpy build: return EDX, the address just past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Otherwise return the DEST argument reloaded from the stack. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(large_page): /* Copy path that writes the destination with non-temporal (movntdq) stores. */
+ movdqu (%eax), %xmm1
+ lea 16(%eax), %eax
+ movdqu %xmm0, (%esi) /* NOTE(review): xmm0 and ESI are set up before this label - confirm what they hold. */
+ movntdq %xmm1, (%edx)
+ lea 16(%edx), %edx
+ cmp %al, %dl /* Compare only the low bytes of the source and destination pointers. */
+ je L(copy_page_by_rep) /* Equal low bytes: finish with the rep movsl routine instead. */
+L(large_page_loop_init):
+ POP (%esi)
+ lea -0x90(%ecx), %ecx /* Bias the count by -0x90. NOTE(review): presumably 0x10 already copied plus a 0x80 loop bias - confirm. */
+ POP (%edi)
+L(large_page_loop): /* 128 bytes per iteration: eight unaligned loads, then eight non-temporal stores. */
+ prefetchnta 0x1c0(%eax)
+ prefetchnta 0x280(%eax)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ lfence
+ sub $0x80, %ecx /* Sets the flags for jae below; movntdq and lea do not alter them. */
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop)
+ cmp $-0x40, %ecx /* Still biased by -0x80, so this asks whether fewer than 0x40 bytes remain. */
+ lea 0x80(%ecx), %ecx /* Remove the 0x80 bias; lea keeps the cmp flags for the jl below. */
+ jl L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jl L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx /* Advance both pointers past the tail; the fwd_write routines index backwards from them. */
+ add %ecx, %eax
+ sfence /* Store fence after the non-temporal stores, before the ordinary tail stores. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48bytes_fwd). */
+
+ ALIGN (4)
+L(copy_page_by_rep): /* Copy the remaining ECX bytes from EAX to EDX with rep movsl plus a 0-3 byte tail. */
+ mov %eax, %esi /* ESI = source pointer for movsl. */
+ mov %edx, %edi /* EDI = destination pointer for movsl. */
+ mov %ecx, %edx
+ shr $2, %ecx /* ECX = number of whole dwords. */
+ and $3, %edx /* EDX = leftover bytes (0-3); ZF is set when there are none. */
+ rep movsl /* Leaves the flags alone, so the jz below still tests the and above. */
+ jz L(copy_page_by_rep_exit)
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1) /* Exactly one byte left. */
+ movzwl (%esi), %eax
+ movw %ax, (%edi)
+ add $2, %esi
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit) /* Two bytes were left; otherwise fall through for the third. */
+L(copy_page_by_rep_left_1):
+ movzbl (%esi), %eax
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%esi)
+ POP (%edi)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument reloaded from the stack. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* mempcpy build: return DEST + LEN instead. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_44bytes): /* Backward tail copy for counts divisible by 4: highest dword first, falling through. */
+ movl 40(%eax), %ecx /* Offsets are non-negative: EAX/EDX point at the start of the source/destination tail. */
+ movl %ecx, 40(%edx)
+L(bk_write_40bytes):
+ movl 36(%eax), %ecx
+ movl %ecx, 36(%edx)
+L(bk_write_36bytes):
+ movl 32(%eax), %ecx
+ movl %ecx, 32(%edx)
+L(bk_write_32bytes):
+ movl 28(%eax), %ecx
+ movl %ecx, 28(%edx)
+L(bk_write_28bytes):
+ movl 24(%eax), %ecx
+ movl %ecx, 24(%edx)
+L(bk_write_24bytes):
+ movl 20(%eax), %ecx
+ movl %ecx, 20(%edx)
+L(bk_write_20bytes):
+ movl 16(%eax), %ecx
+ movl %ecx, 16(%edx)
+L(bk_write_16bytes):
+ movl 12(%eax), %ecx
+ movl %ecx, 12(%edx)
+L(bk_write_12bytes):
+ movl 8(%eax), %ecx
+ movl %ecx, 8(%edx)
+L(bk_write_8bytes):
+ movl 4(%eax), %ecx
+ movl %ecx, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes): /* Nothing left to copy; only the return value remains. */
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument reloaded from the stack. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* mempcpy build: return DEST + LEN instead. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_45bytes): /* Backward tail copy for counts of the form 4k+1: dwords from the top, then the first byte. */
+ movl 41(%eax), %ecx /* Offsets are non-negative: EAX/EDX point at the start of the source/destination tail. */
+ movl %ecx, 41(%edx)
+L(bk_write_41bytes):
+ movl 37(%eax), %ecx
+ movl %ecx, 37(%edx)
+L(bk_write_37bytes):
+ movl 33(%eax), %ecx
+ movl %ecx, 33(%edx)
+L(bk_write_33bytes):
+ movl 29(%eax), %ecx
+ movl %ecx, 29(%edx)
+L(bk_write_29bytes):
+ movl 25(%eax), %ecx
+ movl %ecx, 25(%edx)
+L(bk_write_25bytes):
+ movl 21(%eax), %ecx
+ movl %ecx, 21(%edx)
+L(bk_write_21bytes):
+ movl 17(%eax), %ecx
+ movl %ecx, 17(%edx)
+L(bk_write_17bytes):
+ movl 13(%eax), %ecx
+ movl %ecx, 13(%edx)
+L(bk_write_13bytes):
+ movl 9(%eax), %ecx
+ movl %ecx, 9(%edx)
+L(bk_write_9bytes):
+ movl 5(%eax), %ecx
+ movl %ecx, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument reloaded from the stack. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* mempcpy build: return DEST + LEN instead. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_46bytes): /* Backward tail copy for counts of the form 4k+2: dwords from the top, then the first word. */
+ movl 42(%eax), %ecx /* Offsets are non-negative: EAX/EDX point at the start of the source/destination tail. */
+ movl %ecx, 42(%edx)
+L(bk_write_42bytes):
+ movl 38(%eax), %ecx
+ movl %ecx, 38(%edx)
+L(bk_write_38bytes):
+ movl 34(%eax), %ecx
+ movl %ecx, 34(%edx)
+L(bk_write_34bytes):
+ movl 30(%eax), %ecx
+ movl %ecx, 30(%edx)
+L(bk_write_30bytes):
+ movl 26(%eax), %ecx
+ movl %ecx, 26(%edx)
+L(bk_write_26bytes):
+ movl 22(%eax), %ecx
+ movl %ecx, 22(%edx)
+L(bk_write_22bytes):
+ movl 18(%eax), %ecx
+ movl %ecx, 18(%edx)
+L(bk_write_18bytes):
+ movl 14(%eax), %ecx
+ movl %ecx, 14(%edx)
+L(bk_write_14bytes):
+ movl 10(%eax), %ecx
+ movl %ecx, 10(%edx)
+L(bk_write_10bytes):
+ movl 6(%eax), %ecx
+ movl %ecx, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument reloaded from the stack. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* mempcpy build: return DEST + LEN instead. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_47bytes): /* Backward tail copy for counts of the form 4k+3: dwords from the top, then a word and a byte. */
+ movl 43(%eax), %ecx /* Offsets are non-negative: EAX/EDX point at the start of the source/destination tail. */
+ movl %ecx, 43(%edx)
+L(bk_write_43bytes):
+ movl 39(%eax), %ecx
+ movl %ecx, 39(%edx)
+L(bk_write_39bytes):
+ movl 35(%eax), %ecx
+ movl %ecx, 35(%edx)
+L(bk_write_35bytes):
+ movl 31(%eax), %ecx
+ movl %ecx, 31(%edx)
+L(bk_write_31bytes):
+ movl 27(%eax), %ecx
+ movl %ecx, 27(%edx)
+L(bk_write_27bytes):
+ movl 23(%eax), %ecx
+ movl %ecx, 23(%edx)
+L(bk_write_23bytes):
+ movl 19(%eax), %ecx
+ movl %ecx, 19(%edx)
+L(bk_write_19bytes):
+ movl 15(%eax), %ecx
+ movl %ecx, 15(%edx)
+L(bk_write_15bytes):
+ movl 11(%eax), %ecx
+ movl %ecx, 11(%edx)
+L(bk_write_11bytes):
+ movl 7(%eax), %ecx
+ movl %ecx, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax /* EAX is reused for data; the source pointer is not needed after this load. */
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument reloaded from the stack. */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* mempcpy build: return DEST + LEN instead. */
+# endif
+#endif
+ RETURN_END
+
+
+ .pushsection .rodata.ssse3,"a",@progbits
+ ALIGN (2)
+L(table_48bytes_fwd): /* Forward tail dispatch table: entry N targets L(fwd_write_Nbytes) for N = 0..47. */
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+ ALIGN (2)
+L(shl_table): /* Entry N targets L(shl_N), the copy loop whose palignr shift count is N (N = 0..15). */
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ ALIGN (2)
+L(table_48_bytes_bwd): /* Backward tail dispatch table: entry N targets L(bk_write_Nbytes) for N = 0..47. */
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+#ifdef USE_AS_MEMMOVE
+ ALIGN (4)
+L(copy_backward): /* memmove-only path: copy ECX bytes from EAX to EDX starting at the high end. */
+ PUSH (%esi)
+ movl %eax, %esi
+ lea (%ecx,%edx,1),%edx /* EDX = one past the last destination byte. */
+ lea (%ecx,%esi,1),%esi /* ESI = one past the last source byte. */
+ testl $0x3, %edx
+ jnz L(bk_align) /* Destination end is not 4-byte aligned yet. */
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jge L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jl L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+ /* Copy 32 bytes at a time. */
+ sub $32, %ecx
+ movl -4(%esi), %eax
+ movl %eax, -4(%edx)
+ movl -8(%esi), %eax
+ movl %eax, -8(%edx)
+ movl -12(%esi), %eax
+ movl %eax, -12(%edx)
+ movl -16(%esi), %eax
+ movl %eax, -16(%edx)
+ movl -20(%esi), %eax
+ movl %eax, -20(%edx)
+ movl -24(%esi), %eax
+ movl %eax, -24(%edx)
+ movl -28(%esi), %eax
+ movl %eax, -28(%edx)
+ movl -32(%esi), %eax
+ movl %eax, -32(%edx)
+ sub $32, %edx
+ sub $32, %esi
+
+L(bk_write_less32bytes):
+ movl %esi, %eax
+ sub %ecx, %edx /* Move both pointers down to the start of the remaining ECX bytes, */
+ sub %ecx, %eax /* because the bk_write routines use non-negative offsets. */
+ POP (%esi)
+L(bk_write_less32bytes_2):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* Presumably dispatches on ECX into L(table_48_bytes_bwd). */
+
+ ALIGN (4)
+L(bk_align):
+ cmp $8, %ecx
+ jle L(bk_write_less32bytes) /* 8 bytes or fewer: skip alignment and go straight to the tail table. */
+ testl $1, %edx
+ /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+ then (EDX & 2) must be != 0. */
+ jz L(bk_got2)
+ sub $1, %esi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%esi), %eax
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %esi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%esi), %eax
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ ALIGN (4)
+L(bk_write_more64bytes):
+ /* Check alignment of last byte. */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned 4 bytes, but not 16 bytes. */
+L(bk_ssse3_align): /* Copy up to three dwords until EDX is 16-byte aligned. */
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jl L(bk_write_more32bytes) /* The alignment copies may have dropped the count below 64. */
+
+L(bk_ssse3_cpy): /* 64 bytes per iteration: unaligned loads from ESI, aligned stores to EDX. */
+ sub $64, %esi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%esi), %xmm3
+ movdqa %xmm3, 0x30(%edx)
+ movdqu 0x20(%esi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%esi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%esi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jge L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless)
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
new file mode 100644
index 000000000..749c82d37
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -0,0 +1,1737 @@
+/* memcpy with SSSE3
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3 /* Entry-point name used unless MEMCPY is already defined. */
+# define MEMCPY_CHK __memcpy_chk_ssse3 /* Name of the checked entry point. */
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS /* bcopy argument order: source first, then destination. */
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS /* Otherwise: destination first, then source. */
+# define SRC DEST+4
+# define LEN SRC+4 /* Offsets are relative to ESP; PARMS is the offset of the first argument. */
+#endif
+
+#define CFI_PUSH(REG) /* CFI for a 4-byte push: CFA offset +4, REG saved at offset 0. */ \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) /* CFI for a 4-byte pop: CFA offset -4, REG restored. */ \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG) /* pushl plus the matching CFI. */
+#define POP(REG) popl REG; CFI_POP (REG) /* popl plus the matching CFI. */
+
+#ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx) /* The CFI_PUSH re-describes EBX for the code after the ret. */
+# define JMPTBL(I, B) I - B /* Table entries are offsets from the table base B. */
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+ addl $(TABLE - .), %ebx /* Yields TABLE only if EBX already holds the address of this instruction. */
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx /* EBX = return address, i.e. the address of the instruction after the call. */
+ ret
+#else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I /* Table entries are absolute addresses. */
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register containing the index into the
+ jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) /* Expands to nothing in this configuration. */
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK) /* Checked entry; there is no ret here, so a passing check runs on into the MEMCPY code below. */
+ movl 12(%esp), %eax /* Third stack argument (the length; EBX is not pushed yet). */
+ cmpl %eax, 16(%esp) /* Fourth stack argument, presumably the destination object size. */
+ jb HIDDEN_JUMPTARGET (__chk_fail) /* Fourth argument below the length (unsigned): fail. */
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY) /* Entry: load the arguments and pick a copy strategy from the byte count. */
+ ENTRANCE
+ movl LEN(%esp), %ecx /* ECX = byte count (unsigned). */
+ movl SRC(%esp), %eax /* EAX = source. */
+ movl DEST(%esp), %edx /* EDX = destination. */
+
+#ifdef USE_AS_MEMMOVE
+ cmp %eax, %edx
+ jb L(copy_forward) /* DEST below SRC: forward copy. */
+ je L(fwd_write_0bytes) /* DEST == SRC: nothing to copy. */
+ cmp $32, %ecx
+ jae L(memmove_bwd) /* Unsigned: a count of 2GB or more is not negative. */
+ jmp L(bk_write_less32bytes_2)
+L(memmove_bwd):
+ add %ecx, %eax /* EAX = SRC + LEN. */
+ cmp %eax, %edx
+ movl SRC(%esp), %eax /* Reload SRC; movl does not touch the flags. */
+ jb L(copy_backward) /* DEST below SRC + LEN: the regions overlap. */
+
+L(copy_forward):
+#endif
+ cmp $48, %ecx
+ jae L(48bytesormore) /* Unsigned, so huge counts never index the 48-entry tail table. */
+
+L(fwd_write_less32bytes): /* Despite the name this is reached for every count below 48. */
+#ifndef USE_AS_MEMMOVE
+ cmp %dl, %al
+ jl L(bk_write) /* Low byte of SRC below low byte of DEST (signed): use the backward table. */
+#endif
+ add %ecx, %edx /* The forward table expects EDX/EAX one past the end. */
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+ ALIGN (4)
+/* ECX >= 48. Save the first 16 bytes and round EDX up to a 16-byte boundary. */
+L(48bytesormore):
+ movdqu (%eax), %xmm0 /* XMM0 = first 16 source bytes, stored later through ESI. */
+ PUSH (%edi)
+ movl %edx, %edi
+ and $-16, %edx
+ PUSH (%esi)
+ add $16, %edx /* EDX = next 16-byte boundary above DEST. */
+ movl %edi, %esi /* ESI = original DEST. */
+ sub %edx, %edi /* EDI = DEST - EDX, in [-16, -1]. */
+ add %edi, %ecx /* ECX = bytes left from EDX onward. */
+ sub %edi, %eax /* EAX = source address matching EDX. */
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+#endif
+
+ mov %eax, %edi /* mov keeps the flags of the cmp above. */
+ jae L(large_page) /* Unsigned: counts of 2GB or more also take the non-temporal path. */
+ and $0xf, %edi /* EDI = source offset within its 16-byte block. */
+ jz L(shl_0)
+
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
+
+ ALIGN (4)
+L(shl_0): /* EAX and EDX are both 16-byte aligned: aligned loads and stores, 32 bytes per step. */
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ xor %edi, %edi /* EDI = running offset. */
+ POP (%esi)
+ cmp $127, %ecx
+ ja L(shl_0_gobble) /* More than 127 bytes: use the 128-byte loops. */
+ lea -32(%ecx), %ecx
+L(shl_0_loop):
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end) /* Flags of the sub above; movdqa and lea leave them alone. */
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+ jl L(shl_0_end)
+
+ movdqa (%eax, %edi), %xmm0
+ movdqa 16(%eax, %edi), %xmm1
+ sub $32, %ecx
+ movdqa %xmm0, (%edx, %edi)
+ movdqa %xmm1, 16(%edx, %edi)
+ lea 32(%edi), %edi
+L(shl_0_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx /* EDX and EAX now point one past the end. */
+ add %edi, %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+L(shl_0_gobble): /* Aligned copy of more than 127 bytes, 128 bytes per pass; picks a loop by data cache size. */
+
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+#endif
+
+ POP (%edi)
+ lea -128(%ecx), %ecx /* popl and lea keep the flags of the cmp above. */
+ jge L(shl_0_gobble_mem_loop) /* Count at or above the limit: use the prefetching loop. */
+L(shl_0_gobble_cache_loop):
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jge L(shl_0_gobble_cache_loop) /* Flags of the sub $128 above. */
+L(shl_0_gobble_cache_loop_tail):
+ cmp $-0x40, %ecx /* ECX is biased by -128 here; the lea below removes the bias without changing flags. */
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_cache_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_cache_less_64bytes):
+ cmp $0x20, %ecx
+ jl L(shl_0_cache_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_cache_less_32bytes):
+ cmp $0x10, %ecx
+ jl L(shl_0_cache_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_cache_less_16bytes):
+ add %ecx, %edx /* Point one past the end for the tail table. */
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_0_gobble_mem_loop): /* Same 128-byte aligned loop as the cache variant, with prefetches ahead of EAX and EDX. */
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x280(%eax)
+ prefetcht0 0x1c0(%edx)
+
+ movdqa (%eax), %xmm0
+ movdqa 0x10(%eax), %xmm1
+ movdqa 0x20(%eax), %xmm2
+ movdqa 0x30(%eax), %xmm3
+ movdqa 0x40(%eax), %xmm4
+ movdqa 0x50(%eax), %xmm5
+ movdqa 0x60(%eax), %xmm6
+ movdqa 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+ sub $0x80, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ movdqa %xmm2, 0x20(%edx)
+ movdqa %xmm3, 0x30(%edx)
+ movdqa %xmm4, 0x40(%edx)
+ movdqa %xmm5, 0x50(%edx)
+ movdqa %xmm6, 0x60(%edx)
+ movdqa %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+
+ jge L(shl_0_gobble_mem_loop) /* Flags of the sub $0x80 above. */
+ cmp $-0x40, %ecx /* ECX is biased by -128 here; the lea below removes the bias without changing flags. */
+ lea 0x80(%ecx), %ecx
+ jl L(shl_0_mem_less_64bytes)
+
+ movdqa (%eax), %xmm0
+ sub $0x40, %ecx
+ movdqa 0x10(%eax), %xmm1
+
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+
+ movdqa 0x20(%eax), %xmm0
+ movdqa 0x30(%eax), %xmm1
+ add $0x40, %eax
+
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm1, 0x30(%edx)
+ add $0x40, %edx
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %ecx
+ jl L(shl_0_mem_less_32bytes)
+ movdqa (%eax), %xmm0
+ sub $0x20, %ecx
+ movdqa 0x10(%eax), %xmm1
+ add $0x20, %eax
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 0x10(%edx)
+ add $0x20, %edx
+L(shl_0_mem_less_32bytes):
+ cmp $0x10, %ecx
+ jl L(shl_0_mem_less_16bytes)
+ sub $0x10, %ecx
+ movdqa (%eax), %xmm0
+ add $0x10, %eax
+ movdqa %xmm0, (%edx)
+ add $0x10, %edx
+L(shl_0_mem_less_16bytes):
+ add %ecx, %edx /* Point one past the end for the tail table. */
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_1): /* Forward copy when EAX - 1 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $1. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -1(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_1_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_1_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_1_loop)
+
+L(shl_1_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 1(%edi, %eax), %eax /* Undo the -1 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_2): /* Forward copy when EAX - 2 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $2. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -2(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_2_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_2_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_2_loop)
+
+L(shl_2_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 2(%edi, %eax), %eax /* Undo the -2 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_3): /* Forward copy when EAX - 3 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $3. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -3(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_3_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_3_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_3_loop)
+
+L(shl_3_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 3(%edi, %eax), %eax /* Undo the -3 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_4): /* Forward copy when EAX - 4 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $4. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -4(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_4_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_4_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_4_loop)
+
+L(shl_4_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 4(%edi, %eax), %eax /* Undo the -4 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_5): /* Forward copy when EAX - 5 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $5. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -5(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_5_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_5_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_5_loop)
+
+L(shl_5_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 5(%edi, %eax), %eax /* Undo the -5 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_6): /* Forward copy when EAX - 6 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $6. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -6(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_6_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_6_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_6_loop)
+
+L(shl_6_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 6(%edi, %eax), %eax /* Undo the -6 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_7): /* Forward copy when EAX - 7 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $7. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -7(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_7_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_7_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_7_loop)
+
+L(shl_7_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 7(%edi, %eax), %eax /* Undo the -7 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_8): /* Forward copy when EAX - 8 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $8. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -8(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_8_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_8_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_8_loop)
+
+L(shl_8_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 8(%edi, %eax), %eax /* Undo the -8 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_9): /* Forward copy when EAX - 9 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $9. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -9(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_9_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_9_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_9_loop)
+
+L(shl_9_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax /* Undo the -9 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_10): /* Forward copy when EAX - 10 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $10. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -10(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_10_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_10_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_10_loop)
+
+L(shl_10_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 10(%edi, %eax), %eax /* Undo the -10 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_11): /* Forward copy when EAX - 11 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $11. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -11(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_11_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_11_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_11_loop)
+
+L(shl_11_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 11(%edi, %eax), %eax /* Undo the -11 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_12): /* Forward copy when EAX - 12 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $12. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -12(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_12_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_12_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_12_loop)
+
+L(shl_12_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 12(%edi, %eax), %eax /* Undo the -12 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_13): /* Forward copy when EAX - 13 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $13. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -13(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_13_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_13_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_13_loop)
+
+L(shl_13_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 13(%edi, %eax), %eax /* Undo the -13 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+ ALIGN (4)
+L(shl_14): /* Forward copy when EAX - 14 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $14. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -14(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_14_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_14_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_14_loop)
+
+L(shl_14_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 14(%edi, %eax), %eax /* Undo the -14 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(shl_15): /* Forward copy when EAX - 15 is 16-byte aligned (the movaps needs it); chunks are merged with palignr $15. */
+ BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) /* SHARED only: EBX += table - here, for the tail branch. */
+ lea -15(%eax), %eax
+ movaps (%eax), %xmm1
+ xor %edi, %edi
+ lea -32(%ecx), %ecx
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ POP (%esi)
+L(shl_15_loop):
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jl L(shl_15_end) /* Flags of the sub above; the SSE moves and lea leave them alone. */
+
+ movdqa 16(%eax, %edi), %xmm2
+ sub $32, %ecx
+ movdqa 32(%eax, %edi), %xmm3
+ movdqa %xmm3, %xmm1 /* Keep the newest aligned chunk for the next pass. */
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm4, %xmm2
+ lea 32(%edi), %edi
+ movdqa %xmm2, -32(%edx, %edi)
+ movdqa %xmm3, -16(%edx, %edi)
+
+ jae L(shl_15_loop)
+
+L(shl_15_end):
+ lea 32(%ecx), %ecx /* ECX = leftover byte count. */
+ add %ecx, %edi
+ add %edi, %edx
+ lea 15(%edi, %eax), %eax /* Undo the -15 bias and advance to the source end. */
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(fwd_write_44bytes): /* Forward tail, counts 44, 40, ..., 0: EAX/EDX point one past the end; each rung copies a dword and falls through. */
+ movl -44(%eax), %ecx
+ movl %ecx, -44(%edx)
+L(fwd_write_40bytes):
+ movl -40(%eax), %ecx
+ movl %ecx, -40(%edx)
+L(fwd_write_36bytes):
+ movl -36(%eax), %ecx
+ movl %ecx, -36(%edx)
+L(fwd_write_32bytes):
+ movl -32(%eax), %ecx
+ movl %ecx, -32(%edx)
+L(fwd_write_28bytes):
+ movl -28(%eax), %ecx
+ movl %ecx, -28(%edx)
+L(fwd_write_24bytes):
+ movl -24(%eax), %ecx
+ movl %ecx, -24(%edx)
+L(fwd_write_20bytes):
+ movl -20(%eax), %ecx
+ movl %ecx, -20(%edx)
+L(fwd_write_16bytes):
+ movl -16(%eax), %ecx
+ movl %ecx, -16(%edx)
+L(fwd_write_12bytes):
+ movl -12(%eax), %ecx
+ movl %ecx, -12(%edx)
+L(fwd_write_8bytes):
+ movl -8(%eax), %ecx
+ movl %ecx, -8(%edx)
+L(fwd_write_4bytes):
+ movl -4(%eax), %ecx
+ movl %ecx, -4(%edx)
+L(fwd_write_0bytes):
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* Return value: EDX as it stands here. */
+# else
+ movl DEST(%esp), %eax /* Return value: the DEST argument. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_5bytes): /* Five bytes as two overlapping dwords; both are loaded before either store. */
+ movl -5(%eax), %ecx
+ movl -4(%eax), %eax
+ movl %ecx, -5(%edx)
+ movl %eax, -4(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* Return value: EDX, one past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Return value: the DEST argument. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_45bytes): /* Forward tail, counts 45, 41, ..., 9, 1: EAX/EDX point one past the end; rungs fall through. */
+ movl -45(%eax), %ecx
+ movl %ecx, -45(%edx)
+L(fwd_write_41bytes):
+ movl -41(%eax), %ecx
+ movl %ecx, -41(%edx)
+L(fwd_write_37bytes):
+ movl -37(%eax), %ecx
+ movl %ecx, -37(%edx)
+L(fwd_write_33bytes):
+ movl -33(%eax), %ecx
+ movl %ecx, -33(%edx)
+L(fwd_write_29bytes):
+ movl -29(%eax), %ecx
+ movl %ecx, -29(%edx)
+L(fwd_write_25bytes):
+ movl -25(%eax), %ecx
+ movl %ecx, -25(%edx)
+L(fwd_write_21bytes):
+ movl -21(%eax), %ecx
+ movl %ecx, -21(%edx)
+L(fwd_write_17bytes):
+ movl -17(%eax), %ecx
+ movl %ecx, -17(%edx)
+L(fwd_write_13bytes):
+ movl -13(%eax), %ecx
+ movl %ecx, -13(%edx)
+L(fwd_write_9bytes):
+ movl -9(%eax), %ecx
+ movl %ecx, -9(%edx)
+ movl -5(%eax), %ecx /* This rung copies two dwords because there is no 5-byte label in this ladder. */
+ movl %ecx, -5(%edx)
+L(fwd_write_1bytes):
+ movzbl -1(%eax), %ecx
+ movb %cl, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* Return value: EDX, one past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Return value: the DEST argument. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_46bytes): /* Forward tail, counts 46, 42, ..., 6, 2: EAX/EDX point one past the end; rungs fall through. */
+ movl -46(%eax), %ecx
+ movl %ecx, -46(%edx)
+L(fwd_write_42bytes):
+ movl -42(%eax), %ecx
+ movl %ecx, -42(%edx)
+L(fwd_write_38bytes):
+ movl -38(%eax), %ecx
+ movl %ecx, -38(%edx)
+L(fwd_write_34bytes):
+ movl -34(%eax), %ecx
+ movl %ecx, -34(%edx)
+L(fwd_write_30bytes):
+ movl -30(%eax), %ecx
+ movl %ecx, -30(%edx)
+L(fwd_write_26bytes):
+ movl -26(%eax), %ecx
+ movl %ecx, -26(%edx)
+L(fwd_write_22bytes):
+ movl -22(%eax), %ecx
+ movl %ecx, -22(%edx)
+L(fwd_write_18bytes):
+ movl -18(%eax), %ecx
+ movl %ecx, -18(%edx)
+L(fwd_write_14bytes):
+ movl -14(%eax), %ecx
+ movl %ecx, -14(%edx)
+L(fwd_write_10bytes):
+ movl -10(%eax), %ecx
+ movl %ecx, -10(%edx)
+L(fwd_write_6bytes):
+ movl -6(%eax), %ecx
+ movl %ecx, -6(%edx)
+L(fwd_write_2bytes):
+ movzwl -2(%eax), %ecx /* Last two bytes as one 16-bit move. */
+ movw %cx, -2(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* Return value: EDX, one past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Return value: the DEST argument. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(fwd_write_47bytes): /* Forward tail, counts 47, 43, ..., 7, 3: EAX/EDX point one past the end; rungs fall through. */
+ movl -47(%eax), %ecx
+ movl %ecx, -47(%edx)
+L(fwd_write_43bytes):
+ movl -43(%eax), %ecx
+ movl %ecx, -43(%edx)
+L(fwd_write_39bytes):
+ movl -39(%eax), %ecx
+ movl %ecx, -39(%edx)
+L(fwd_write_35bytes):
+ movl -35(%eax), %ecx
+ movl %ecx, -35(%edx)
+L(fwd_write_31bytes):
+ movl -31(%eax), %ecx
+ movl %ecx, -31(%edx)
+L(fwd_write_27bytes):
+ movl -27(%eax), %ecx
+ movl %ecx, -27(%edx)
+L(fwd_write_23bytes):
+ movl -23(%eax), %ecx
+ movl %ecx, -23(%edx)
+L(fwd_write_19bytes):
+ movl -19(%eax), %ecx
+ movl %ecx, -19(%edx)
+L(fwd_write_15bytes):
+ movl -15(%eax), %ecx
+ movl %ecx, -15(%edx)
+L(fwd_write_11bytes):
+ movl -11(%eax), %ecx
+ movl %ecx, -11(%edx)
+L(fwd_write_7bytes):
+ movl -7(%eax), %ecx
+ movl %ecx, -7(%edx)
+L(fwd_write_3bytes):
+ movzwl -3(%eax), %ecx /* Load the word and the byte before storing either; EAX is overwritten here. */
+ movzbl -1(%eax), %eax
+ movw %cx, -3(%edx)
+ movb %al, -1(%edx)
+#ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
+ movl %edx, %eax /* Return value: EDX, one past the last byte written. */
+# else
+ movl DEST(%esp), %eax /* Return value: the DEST argument. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(large_page): /* Large copy: unaligned loads from EAX, non-temporal stores to EDX, 128 bytes per pass. */
+ movdqu (%eax), %xmm1
+ lea 16(%eax), %eax
+ movdqu %xmm0, (%esi) /* Store the saved first 16 bytes at ESI. */
+ movntdq %xmm1, (%edx)
+ lea 16(%edx), %edx
+ POP (%esi)
+ lea -0x90(%ecx), %ecx /* Account for the 16 bytes just copied and bias ECX by -128 for the loop. */
+ POP (%edi)
+L(large_page_loop):
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ movdqu 0x40(%eax), %xmm4
+ movdqu 0x50(%eax), %xmm5
+ movdqu 0x60(%eax), %xmm6
+ movdqu 0x70(%eax), %xmm7
+ lea 0x80(%eax), %eax
+
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ movntdq %xmm4, 0x40(%edx)
+ movntdq %xmm5, 0x50(%edx)
+ movntdq %xmm6, 0x60(%edx)
+ movntdq %xmm7, 0x70(%edx)
+ lea 0x80(%edx), %edx
+ jae L(large_page_loop) /* Unsigned test on the flags of the sub above. */
+ cmp $-0x40, %ecx /* ECX is biased by -128 here; the lea below removes the bias without changing flags. */
+ lea 0x80(%ecx), %ecx
+ jl L(large_page_less_64bytes)
+
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ movdqu 0x20(%eax), %xmm2
+ movdqu 0x30(%eax), %xmm3
+ lea 0x40(%eax), %eax
+
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ movntdq %xmm2, 0x20(%edx)
+ movntdq %xmm3, 0x30(%edx)
+ lea 0x40(%edx), %edx
+ sub $0x40, %ecx
+L(large_page_less_64bytes):
+ cmp $32, %ecx
+ jl L(large_page_less_32bytes)
+ movdqu (%eax), %xmm0
+ movdqu 0x10(%eax), %xmm1
+ lea 0x20(%eax), %eax
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 0x10(%edx)
+ lea 0x20(%edx), %edx
+ sub $0x20, %ecx
+L(large_page_less_32bytes):
+ add %ecx, %edx /* Point one past the end for the tail table. */
+ add %ecx, %eax
+ sfence /* Fence after the movntdq stores, before the ordinary tail stores. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+
+ ALIGN (4)
+L(bk_write_44bytes): /* Backward tail, counts 44, 40, ..., 0: EAX/EDX point at the start; dwords go from the highest offset down. */
+ movl 40(%eax), %ecx
+ movl %ecx, 40(%edx)
+L(bk_write_40bytes):
+ movl 36(%eax), %ecx
+ movl %ecx, 36(%edx)
+L(bk_write_36bytes):
+ movl 32(%eax), %ecx
+ movl %ecx, 32(%edx)
+L(bk_write_32bytes):
+ movl 28(%eax), %ecx
+ movl %ecx, 28(%edx)
+L(bk_write_28bytes):
+ movl 24(%eax), %ecx
+ movl %ecx, 24(%edx)
+L(bk_write_24bytes):
+ movl 20(%eax), %ecx
+ movl %ecx, 20(%edx)
+L(bk_write_20bytes):
+ movl 16(%eax), %ecx
+ movl %ecx, 16(%edx)
+L(bk_write_16bytes):
+ movl 12(%eax), %ecx
+ movl %ecx, 12(%edx)
+L(bk_write_12bytes):
+ movl 8(%eax), %ecx
+ movl %ecx, 8(%edx)
+L(bk_write_8bytes):
+ movl 4(%eax), %ecx
+ movl %ecx, 4(%edx)
+L(bk_write_4bytes):
+ movl (%eax), %ecx
+ movl %ecx, (%edx)
+L(bk_write_0bytes):
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument... */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* ...plus LEN when built as mempcpy. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_45bytes): /* Backward tail, counts 45, 41, ..., 5, 1: EAX/EDX point at the start; highest offset first. */
+ movl 41(%eax), %ecx
+ movl %ecx, 41(%edx)
+L(bk_write_41bytes):
+ movl 37(%eax), %ecx
+ movl %ecx, 37(%edx)
+L(bk_write_37bytes):
+ movl 33(%eax), %ecx
+ movl %ecx, 33(%edx)
+L(bk_write_33bytes):
+ movl 29(%eax), %ecx
+ movl %ecx, 29(%edx)
+L(bk_write_29bytes):
+ movl 25(%eax), %ecx
+ movl %ecx, 25(%edx)
+L(bk_write_25bytes):
+ movl 21(%eax), %ecx
+ movl %ecx, 21(%edx)
+L(bk_write_21bytes):
+ movl 17(%eax), %ecx
+ movl %ecx, 17(%edx)
+L(bk_write_17bytes):
+ movl 13(%eax), %ecx
+ movl %ecx, 13(%edx)
+L(bk_write_13bytes):
+ movl 9(%eax), %ecx
+ movl %ecx, 9(%edx)
+L(bk_write_9bytes):
+ movl 5(%eax), %ecx
+ movl %ecx, 5(%edx)
+L(bk_write_5bytes):
+ movl 1(%eax), %ecx
+ movl %ecx, 1(%edx)
+L(bk_write_1bytes):
+ movzbl (%eax), %ecx /* First byte, copied last. */
+ movb %cl, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument... */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* ...plus LEN when built as mempcpy. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_46bytes): /* Backward tail, counts 46, 42, ..., 6, 2: EAX/EDX point at the start; highest offset first. */
+ movl 42(%eax), %ecx
+ movl %ecx, 42(%edx)
+L(bk_write_42bytes):
+ movl 38(%eax), %ecx
+ movl %ecx, 38(%edx)
+L(bk_write_38bytes):
+ movl 34(%eax), %ecx
+ movl %ecx, 34(%edx)
+L(bk_write_34bytes):
+ movl 30(%eax), %ecx
+ movl %ecx, 30(%edx)
+L(bk_write_30bytes):
+ movl 26(%eax), %ecx
+ movl %ecx, 26(%edx)
+L(bk_write_26bytes):
+ movl 22(%eax), %ecx
+ movl %ecx, 22(%edx)
+L(bk_write_22bytes):
+ movl 18(%eax), %ecx
+ movl %ecx, 18(%edx)
+L(bk_write_18bytes):
+ movl 14(%eax), %ecx
+ movl %ecx, 14(%edx)
+L(bk_write_14bytes):
+ movl 10(%eax), %ecx
+ movl %ecx, 10(%edx)
+L(bk_write_10bytes):
+ movl 6(%eax), %ecx
+ movl %ecx, 6(%edx)
+L(bk_write_6bytes):
+ movl 2(%eax), %ecx
+ movl %ecx, 2(%edx)
+L(bk_write_2bytes):
+ movzwl (%eax), %ecx /* First two bytes, copied last. */
+ movw %cx, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument... */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* ...plus LEN when built as mempcpy. */
+# endif
+#endif
+ RETURN
+
+ ALIGN (4)
+L(bk_write_47bytes): /* Backward tail, counts 47, 43, ..., 7, 3: EAX/EDX point at the start; highest offset first. */
+ movl 43(%eax), %ecx
+ movl %ecx, 43(%edx)
+L(bk_write_43bytes):
+ movl 39(%eax), %ecx
+ movl %ecx, 39(%edx)
+L(bk_write_39bytes):
+ movl 35(%eax), %ecx
+ movl %ecx, 35(%edx)
+L(bk_write_35bytes):
+ movl 31(%eax), %ecx
+ movl %ecx, 31(%edx)
+L(bk_write_31bytes):
+ movl 27(%eax), %ecx
+ movl %ecx, 27(%edx)
+L(bk_write_27bytes):
+ movl 23(%eax), %ecx
+ movl %ecx, 23(%edx)
+L(bk_write_23bytes):
+ movl 19(%eax), %ecx
+ movl %ecx, 19(%edx)
+L(bk_write_19bytes):
+ movl 15(%eax), %ecx
+ movl %ecx, 15(%edx)
+L(bk_write_15bytes):
+ movl 11(%eax), %ecx
+ movl %ecx, 11(%edx)
+L(bk_write_11bytes):
+ movl 7(%eax), %ecx
+ movl %ecx, 7(%edx)
+L(bk_write_7bytes):
+ movl 3(%eax), %ecx
+ movl %ecx, 3(%edx)
+L(bk_write_3bytes):
+ movzwl 1(%eax), %ecx /* Bytes 1-2 first, then byte 0; EAX is overwritten by the byte load. */
+ movw %cx, 1(%edx)
+ movzbl (%eax), %eax
+ movb %al, (%edx)
+#ifndef USE_AS_BCOPY
+ movl DEST(%esp), %eax /* Return value: the DEST argument... */
+# ifdef USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax /* ...plus LEN when built as mempcpy. */
+# endif
+#endif
+ RETURN_END
+
+
+ .pushsection .rodata.ssse3,"a",@progbits /* The jump tables are emitted
+ into their own allocatable section; the matching .popsection
+ follows the last table.  */
+ ALIGN (2)
+L(table_48bytes_fwd): /* 48 entries of 4 bytes (.int): entry N refers to
+ L(fwd_write_Nbytes), N = 0..47.  Each entry is built with
+ JMPTBL (target, table); JMPTBL and the fwd_write labels are
+ defined earlier in the file.  */
+ .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+ ALIGN (2) /* 16 entries of 4 bytes (.int): entry N refers to L(shl_N),
+ N = 0..15.  The L(shl_N) routines and the code that indexes
+ this table are earlier in the file.  */
+L(shl_table):
+ .int JMPTBL (L(shl_0), L(shl_table))
+ .int JMPTBL (L(shl_1), L(shl_table))
+ .int JMPTBL (L(shl_2), L(shl_table))
+ .int JMPTBL (L(shl_3), L(shl_table))
+ .int JMPTBL (L(shl_4), L(shl_table))
+ .int JMPTBL (L(shl_5), L(shl_table))
+ .int JMPTBL (L(shl_6), L(shl_table))
+ .int JMPTBL (L(shl_7), L(shl_table))
+ .int JMPTBL (L(shl_8), L(shl_table))
+ .int JMPTBL (L(shl_9), L(shl_table))
+ .int JMPTBL (L(shl_10), L(shl_table))
+ .int JMPTBL (L(shl_11), L(shl_table))
+ .int JMPTBL (L(shl_12), L(shl_table))
+ .int JMPTBL (L(shl_13), L(shl_table))
+ .int JMPTBL (L(shl_14), L(shl_table))
+ .int JMPTBL (L(shl_15), L(shl_table))
+
+ ALIGN (2) /* 48 entries of 4 bytes (.int): entry N refers to
+ L(bk_write_Nbytes), N = 0..47.  L(bk_write_less32bytes_2)
+ dispatches through this table with
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4),
+ i.e. indexed by the byte count in ECX with a scale of 4.  */
+L(table_48_bytes_bwd):
+ .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+ .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+ .popsection
+
+#ifdef USE_AS_MEMMOVE
+ ALIGN (4) /* memmove build only (USE_AS_MEMMOVE): copy from the end of the
+ buffers towards the start.  The code below uses EAX as the
+ source pointer, EDX as the destination pointer and ECX as the
+ number of bytes still to copy.
+ NOTE(review): ECX is tested with signed conditions (jge, jl,
+ jle) throughout the backward path; confirm that counts of
+ 2^31 or more cannot reach this code.  */
+L(copy_backward):
+ PUSH (%esi)
+ movl %eax, %esi
+ lea (%ecx,%edx,1),%edx /* EDX = destination + count (one past the end).  */
+ lea (%ecx,%esi,1),%esi /* ESI = source + count (one past the end).  */
+ testl $0x3, %edx
+ jnz L(bk_align) /* Destination end is not 4-byte aligned.  */
+
+L(bk_aligned_4):
+ cmp $64, %ecx
+ jge L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+ cmp $32, %ecx
+ jl L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+ /* Copy one 32-byte block as eight 4-byte words, highest address
+ first, then move both end pointers down by 32.  */
+ sub $32, %ecx
+ movl -4(%esi), %eax
+ movl %eax, -4(%edx)
+ movl -8(%esi), %eax
+ movl %eax, -8(%edx)
+ movl -12(%esi), %eax
+ movl %eax, -12(%edx)
+ movl -16(%esi), %eax
+ movl %eax, -16(%edx)
+ movl -20(%esi), %eax
+ movl %eax, -20(%edx)
+ movl -24(%esi), %eax
+ movl %eax, -24(%edx)
+ movl -28(%esi), %eax
+ movl %eax, -28(%edx)
+ movl -32(%esi), %eax
+ movl %eax, -32(%edx)
+ sub $32, %edx
+ sub $32, %esi
+
+L(bk_write_less32bytes): /* Rewind both pointers by the remaining count so
+ that EAX/EDX address the first byte still to be copied; the
+ L(bk_write_Nbytes) targets use non-negative offsets from
+ them.  ESI is restored before the dispatch.  */
+ movl %esi, %eax
+ sub %ecx, %edx
+ sub %ecx, %eax
+ POP (%esi)
+L(bk_write_less32bytes_2):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) /* Dispatch on the remaining byte count.  */
+
+ ALIGN (4) /* Reached from L(copy_backward) when the destination end (EDX)
+ is not 4-byte aligned.  Copies 1 and/or 2 bytes so that EDX
+ becomes 4-byte aligned, then continues at L(bk_aligned_4).  */
+L(bk_align):
+ cmp $8, %ecx
+ jle L(bk_write_less32bytes) /* 8 bytes or fewer: skip the alignment step.  */
+ testl $1, %edx
+ /* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+ then (EDX & 2) must be != 0. */
+ jz L(bk_got2)
+ sub $1, %esi
+ sub $1, %ecx
+ sub $1, %edx
+ movzbl (%esi), %eax /* Copy one byte; EDX is now even.  */
+ movb %al, (%edx)
+
+ testl $2, %edx
+ jz L(bk_aligned_4)
+
+L(bk_got2):
+ sub $2, %esi
+ sub $2, %ecx
+ sub $2, %edx
+ movzwl (%esi), %eax /* Copy two bytes; EDX is now 4-byte aligned.  */
+ movw %ax, (%edx)
+ jmp L(bk_aligned_4)
+
+ ALIGN (4) /* Backward copy of 64 bytes or more (reached from
+ L(bk_aligned_4)): bring the destination end to a 16-byte
+ boundary, then copy 64 bytes per iteration with unaligned
+ loads from the source and aligned stores to the destination.  */
+L(bk_write_more64bytes):
+ /* Check alignment of last byte. */
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+/* EDX is 4-byte aligned but not 16-byte aligned: copy up to three
+ 4-byte words, re-testing after each, until it is. */
+L(bk_ssse3_align):
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax
+ movl %eax, (%edx)
+
+ testl $15, %edx
+ jz L(bk_ssse3_cpy_pre)
+
+ sub $4, %esi
+ sub $4, %ecx
+ sub $4, %edx
+ movl (%esi), %eax /* Third word: no re-test follows.  */
+ movl %eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+ cmp $64, %ecx
+ jl L(bk_write_more32bytes) /* The alignment copies may have taken ECX below 64.  */
+
+L(bk_ssse3_cpy):
+ sub $64, %esi
+ sub $64, %ecx
+ sub $64, %edx
+ movdqu 0x30(%esi), %xmm3 /* Source may be unaligned: movdqu loads ...  */
+ movdqa %xmm3, 0x30(%edx) /* ... destination is 16-byte aligned: movdqa stores.  */
+ movdqu 0x20(%esi), %xmm2
+ movdqa %xmm2, 0x20(%edx)
+ movdqu 0x10(%esi), %xmm1
+ movdqa %xmm1, 0x10(%edx)
+ movdqu (%esi), %xmm0
+ movdqa %xmm0, (%edx)
+ cmp $64, %ecx
+ jge L(bk_ssse3_cpy)
+ jmp L(bk_write_64bytesless) /* Fewer than 64 bytes left.  */
+
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy.S b/libc/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 000000000..bf1c7cc2d
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,90 @@
+/* Multiple versions of memcpy
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+ DSOs. In static binaries memcpy is needed before the initialization
+ has happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(memcpy) /* IFUNC resolver for memcpy (shared libc only).  Returns in
+ EAX the address of the implementation chosen from
+ __cpu_features: __memcpy_ia32 by default, __memcpy_ssse3 when
+ bit_SSSE3 is set, and __memcpy_ssse3_rep when
+ bit_Fast_Rep_String is set as well.  EBX is saved and
+ restored around its use as the GOT pointer.  */
+ .type memcpy, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memcpy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(memcpy)
+
+# undef ENTRY /* Redefine ENTRY/END and ENTRY_CHK/END_CHK so that the
+ "../memcpy.S" included at the end of this file is emitted
+ under the names __memcpy_ia32 and __memcpy_chk_ia32.  The
+ `name' argument of each macro is ignored.  Only
+ __memcpy_chk_ia32 is made global here.  */
+# define ENTRY(name) \
+ .type __memcpy_ia32, @function; \
+ .p2align 4; \
+ __memcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memcpy_chk_ia32, @function; \
+ .globl __memcpy_chk_ia32; \
+ .p2align 4; \
+ __memcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32
+
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC.  The internal alias __GI_memcpy is therefore bound
+ directly to __memcpy_ia32; the `name' argument is ignored. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32
+#endif
+
+#include "../memcpy.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S b/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S
new file mode 100644
index 000000000..171ac8ade
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -0,0 +1,64 @@
+/* Multiple versions of __memcpy_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. There are no multiarch memcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__memcpy_chk) /* IFUNC resolver for __memcpy_chk (shared libc only).
+ Returns in EAX the address of the implementation chosen from
+ __cpu_features: __memcpy_chk_ia32 by default,
+ __memcpy_chk_ssse3 when bit_SSSE3 is set, and
+ __memcpy_chk_ssse3_rep when bit_Fast_Rep_String is set as
+ well.  None of the three is defined in this file.  */
+ .type __memcpy_chk, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memcpy_chk_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_chk_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__memcpy_chk)
+# else
+# include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
new file mode 100644
index 000000000..d202fc4a1
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE /* memmove flavour of memcpy-ssse3-rep.S: include that
+ file with USE_AS_MEMMOVE defined.  MEMCPY and MEMCPY_CHK
+ presumably name the entry points it emits - confirm
+ against memcpy-ssse3-rep.S.  */
+#define MEMCPY __memmove_ssse3_rep
+#define MEMCPY_CHK __memmove_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S
new file mode 100644
index 000000000..295430b1e
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE /* memmove flavour of memcpy-ssse3.S: include that file
+ with USE_AS_MEMMOVE defined, which enables its
+ L(copy_backward) path.  MEMCPY is the name passed to
+ END (MEMCPY) there; MEMCPY_CHK presumably names the
+ checking entry point.  */
+#define MEMCPY __memmove_ssse3
+#define MEMCPY_CHK __memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memmove.S b/libc/sysdeps/i386/i686/multiarch/memmove.S
new file mode 100644
index 000000000..e0529c012
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memmove.S
@@ -0,0 +1,117 @@
+/* Multiple versions of memmove
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(memmove) /* IFUNC resolver for memmove, shared-library variant.
+ Returns in EAX the address of the implementation chosen from
+ __cpu_features: __memmove_ia32 by default, __memmove_ssse3
+ when bit_SSSE3 is set, and __memmove_ssse3_rep when
+ bit_Fast_Rep_String is set as well.  EBX is saved and
+ restored around its use as the GOT pointer.  */
+ .type memmove, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memmove_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(memmove)
+
+# undef ENTRY /* Shared build: make the "../memmove.S" included at the end of
+ this file define __memmove_ia32 instead of memmove.  The
+ `name' argument is ignored and the symbol is not made
+ global here.  */
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .p2align 4; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# else
+ .text
+ENTRY(memmove) /* IFUNC resolver for memmove, static variant.  Same
+ selection as the shared resolver (__memmove_ia32, then
+ __memmove_ssse3 if bit_SSSE3, then __memmove_ssse3_rep if
+ bit_Fast_Rep_String as well), but __cpu_features and the
+ candidates are addressed directly, without a GOT pointer
+ in EBX.  */
+ .type memmove, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memmove_ia32, %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+ jz 2f
+ leal __memmove_ssse3, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f
+ leal __memmove_ssse3_rep, %eax
+2: ret
+END(memmove)
+
+# undef ENTRY /* Static build: make the "../memmove.S" included at the end of
+ this file define __memmove_ia32 instead of memmove.  Unlike
+ the shared variant the symbol is made global; the static
+ __memmove_chk_ia32 in memmove_chk.S jumps to it, which is
+ presumably why.  The `name' argument is ignored.  */
+# define ENTRY(name) \
+ .type __memmove_ia32, @function; \
+ .globl __memmove_ia32; \
+ .p2align 4; \
+ __memmove_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# endif
+
+# undef END /* Counterparts of the ENTRY override above, for both the shared
+ and the static build: END closes __memmove_ia32, and
+ ENTRY_CHK/END_CHK open and close a global __memmove_chk_ia32.
+ The `name' argument of each macro is ignored.  */
+# define END(name) \
+ cfi_endproc; .size __memmove_ia32, .-__memmove_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memmove_chk_ia32, @function; \
+ .globl __memmove_chk_ia32; \
+ .p2align 4; \
+ __memmove_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC.  The internal alias __GI_memmove is therefore bound
+ directly to __memmove_ia32; the `name' argument is ignored. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memmove; __GI_memmove = __memmove_ia32
+# endif
+#endif
+
+#include "../memmove.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memmove_chk.S b/libc/sysdeps/i386/i686/multiarch/memmove_chk.S
new file mode 100644
index 000000000..e33f2a31b
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -0,0 +1,112 @@
+/* Multiple versions of __memmove_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__memmove_chk) /* IFUNC resolver for __memmove_chk, shared-library
+ variant.  Returns in EAX the address of the implementation
+ chosen from __cpu_features: __memmove_chk_ia32 by default,
+ __memmove_chk_ssse3 when bit_SSSE3 is set, and
+ __memmove_chk_ssse3_rep when bit_Fast_Rep_String is set as
+ well.  In this branch none of the three is defined in this
+ file.  */
+ .type __memmove_chk, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memmove_chk_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_chk_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__memmove_chk)
+# else
+ .text
+ENTRY(__memmove_chk) /* IFUNC resolver for __memmove_chk, static variant.
+ Same selection as the shared resolver, but __cpu_features
+ and the candidates are addressed directly, without a GOT
+ pointer in EBX.  The three candidates are the wrappers
+ defined right after this resolver.  */
+ .type __memmove_chk, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __memmove_chk_ia32, %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+ jz 2f
+ leal __memmove_chk_ssse3, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f
+ leal __memmove_chk_ssse3_rep, %eax
+2: ret
+END(__memmove_chk)
+
+ .type __memmove_chk_ssse3, @function
+ .p2align 4;
+__memmove_chk_ssse3: /* Static-build checking wrapper: jumps to __chk_fail
+ when the word at 16(%esp) is below (unsigned) the word at
+ 12(%esp), otherwise tail-jumps to __memmove_ssse3 with the
+ stack untouched.  Presumably 12(%esp) is the length and
+ 16(%esp) the destination size of
+ __memmove_chk (dest, src, len, destlen) - confirm.  */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3
+ cfi_endproc
+ .size __memmove_chk_ssse3, .-__memmove_chk_ssse3
+
+ .type __memmove_chk_ssse3_rep, @function
+ .p2align 4;
+__memmove_chk_ssse3_rep: /* Static-build checking wrapper: jumps to
+ __chk_fail when the word at 16(%esp) is below (unsigned) the
+ word at 12(%esp), otherwise tail-jumps to
+ __memmove_ssse3_rep with the stack untouched.  Presumably
+ 12(%esp) is the length and 16(%esp) the destination size of
+ __memmove_chk (dest, src, len, destlen) - confirm.  */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ssse3_rep
+ cfi_endproc
+ .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep
+
+ .type __memmove_chk_ia32, @function
+ .p2align 4;
+__memmove_chk_ia32: /* Static-build checking wrapper: jumps to __chk_fail
+ when the word at 16(%esp) is below (unsigned) the word at
+ 12(%esp), otherwise tail-jumps to __memmove_ia32 with the
+ stack untouched.  Presumably 12(%esp) is the length and
+ 16(%esp) the destination size of
+ __memmove_chk (dest, src, len, destlen) - confirm.  */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_ia32
+ cfi_endproc
+ .size __memmove_chk_ia32, .-__memmove_chk_ia32
+# endif
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
new file mode 100644
index 000000000..5357b33e1
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY /* mempcpy flavour of memcpy-ssse3-rep.S: include that
+ file with USE_AS_MEMPCPY defined.  MEMCPY and MEMCPY_CHK
+ presumably name the entry points it emits - confirm
+ against memcpy-ssse3-rep.S.  */
+#define MEMCPY __mempcpy_ssse3_rep
+#define MEMCPY_CHK __mempcpy_chk_ssse3_rep
+#include "memcpy-ssse3-rep.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
new file mode 100644
index 000000000..822d98e95
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY /* mempcpy flavour of memcpy-ssse3.S: include that file
+ with USE_AS_MEMPCPY defined, which makes its return paths
+ add LEN to the DEST value loaded into EAX.  MEMCPY is the
+ name passed to END (MEMCPY) there; MEMCPY_CHK presumably
+ names the checking entry point.  */
+#define MEMCPY __mempcpy_ssse3
+#define MEMCPY_CHK __mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy.S b/libc/sysdeps/i386/i686/multiarch/mempcpy.S
new file mode 100644
index 000000000..df830d2e6
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -0,0 +1,93 @@
+/* Multiple versions of mempcpy
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc and for
+ DSOs. In static binaries mempcpy is needed before the initialization
+ has happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__mempcpy) /* IFUNC resolver for __mempcpy (shared libc only).
+ Returns in EAX the address of the implementation chosen from
+ __cpu_features: __mempcpy_ia32 by default, __mempcpy_ssse3
+ when bit_SSSE3 is set, and __mempcpy_ssse3_rep when
+ bit_Fast_Rep_String is set as well.  EBX is saved and
+ restored around its use as the GOT pointer.  */
+ .type __mempcpy, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __mempcpy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__mempcpy)
+
+# undef ENTRY /* Redefine ENTRY/END and ENTRY_CHK/END_CHK so that the
+ "../mempcpy.S" included at the end of this file is emitted
+ under the names __mempcpy_ia32 and __mempcpy_chk_ia32.  The
+ `name' argument of each macro is ignored.  Only
+ __mempcpy_chk_ia32 is made global here.  */
+# define ENTRY(name) \
+ .type __mempcpy_ia32, @function; \
+ .p2align 4; \
+ __mempcpy_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __mempcpy_chk_ia32, @function; \
+ .globl __mempcpy_chk_ia32; \
+ .p2align 4; \
+ __mempcpy_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC.  The internal aliases __GI_mempcpy and __GI___mempcpy
+ are therefore bound directly to __mempcpy_ia32; the `name' argument
+ of both macros is ignored. */
+# define libc_hidden_def(name) \
+ .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32
+#endif
+
+#include "../mempcpy.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S
new file mode 100644
index 000000000..828fb5e60
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -0,0 +1,64 @@
+/* Multiple versions of __mempcpy_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. There are no multiarch mempcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: load the caller's return address (the
+ word at the top of the stack) into EBX and return.  The
+ resolver below adds $_GLOBAL_OFFSET_TABLE_ to the result.  */
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(__mempcpy_chk) /* IFUNC resolver for __mempcpy_chk (shared libc only).
+ Returns in EAX the address of the implementation chosen from
+ __cpu_features: __mempcpy_chk_ia32 by default,
+ __mempcpy_chk_ssse3 when bit_SSSE3 is set, and
+ __mempcpy_chk_ssse3_rep when bit_Fast_Rep_String is set as
+ well.  None of the three is defined in this file.  */
+ .type __mempcpy_chk, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = base for the @GOTOFF references below.  */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features /* Word at KIND_OFFSET is zero: presumably not initialised yet.  */
+1: leal __mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_chk_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__mempcpy_chk)
+# else
+# include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
new file mode 100644
index 000000000..84afffeb6
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -0,0 +1,821 @@
+/* memset with SSE2 and REP string.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0) /* Unwind annotations for a 4-byte push of REG. */
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG) /* Unwind annotations for a 4-byte pop of REG. */
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG) /* pushl plus the matching unwind annotations. */
+#define POP(REG) popl REG; CFI_POP (REG) /* popl plus the matching unwind annotations. */
+
+#ifdef USE_AS_BZERO /* Stack arguments are DEST, LEN; SETRTNVAL expands to nothing. */
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else /* Stack arguments are DEST, CHR, LEN; SETRTNVAL reloads DEST into EAX. */
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax /* Offset is PARMS-based, so ESP must be at its post-ENTRANCE depth. */
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx) /* CFI_PUSH after the ret re-states "EBX is pushed" for the code that follows. */
+# define PARMS 8 /* Skip the return address and the EBX saved by ENTRANCE. */
+# define JMPTBL(I, B) I - B /* Table entries are offsets relative to the table base B. */
+
+/* Branch to entry number ECX of TABLE, a jump table of offsets relative
+ to TABLE itself. EDX is advanced by ECX first; EBX is clobbered. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: hands the caller's return address back in EBX. */
+ movl (%esp), %ebx
+ ret
+#else /* !SHARED: EBX is not saved at entry and table entries are absolute. */
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4 /* Skip the return address only. */
+# define JMPTBL(I, B) I
+
+/* Branch to entry number ECX of TABLE, a jump table of absolute
+ addresses. EDX is advanced by ECX first. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2_rep) /* Checked entry point; it has no ret, so on success execution continues into the code that follows. */
+ movl 12(%esp), %eax /* Third stack argument (presumably the length -- confirm against the caller). */
+ cmpl %eax, 16(%esp) /* Fourth stack argument (presumably the destination object size) minus the third. */
+ jb HIDDEN_JUMPTARGET (__chk_fail) /* Fourth < third, unsigned: go to __chk_fail. */
+END (__memset_chk_sse2_rep)
+#endif
+ENTRY (__memset_sse2_rep) /* Sets up ECX = count, EAX = 4-byte pattern, EDX = destination, then dispatches on the count. */
+ ENTRANCE
+
+ movl LEN(%esp), %ecx /* ECX = byte count. */
+#ifdef USE_AS_BZERO
+ xor %eax, %eax /* The pattern is all zero. */
+#else
+ movzbl CHR(%esp), %eax /* EAX = low byte of the CHR argument, zero-extended. */
+ movb %al, %ah /* AX = that byte twice. */
+ /* Fill the whole EAX with pattern. */
+ movl %eax, %edx /* EDX is only scratch here; it is loaded with DEST below. */
+ shl $16, %eax
+ or %edx, %eax
+#endif
+ movl DEST(%esp), %edx /* EDX = destination pointer. */
+ cmp $32, %ecx
+ jae L(32bytesormore) /* Unsigned compare: counts of 32 and above leave the small path. */
+
+L(write_less32bytes): /* ECX is 0..31: jump to the handler for exactly that many bytes. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes): /* Entry N (N = 0..31) is the handler L(write_Nbytes); indexed by the byte count in ECX. */
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes): /* Handlers for counts 0..31: EDX = destination + count, EAX = pattern; each label falls through to the next smaller one. */
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes): /* Counts of the form 4k+1: dwords first, then one trailing byte. */
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes): /* Counts of the form 4k+2: dwords first, then one trailing word. */
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes): /* Counts of the form 4k+3: dwords first, then a word and a byte. */
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32 (unsigned); EDX is the destination, alignment not yet known. */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ punpcklbw %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned. */
+L(not_aligned_16):
+ movdqu %xmm0, (%edx) /* Store 16 bytes at the unaligned start; the part past the boundary is rewritten later. */
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx /* EDX = next 16-byte boundary above the start. */
+ sub %edx, %eax /* EAX = start - boundary, i.e. -(bytes already covered). */
+ add %eax, %ecx /* ECX = bytes still to store from the boundary on. */
+ movd %xmm0, %eax /* EAX was used as scratch: reload the 4-byte pattern. */
+
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore) /* Unsigned like the 32-byte test: a signed jge would send counts >= 2GB into the table. */
+
+L(aligned_16_less128bytes): /* ECX is 0..127 and EDX is 16-byte aligned. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore): /* At least 128 bytes, EDX 16-byte aligned: pick the SSE2 loop or the rep stos path. */
+ PUSH (%edi)
+#ifdef DATA_CACHE_SIZE
+ PUSH (%ebx)
+ mov $DATA_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx /* No PUSH here: in SHARED builds ENTRANCE already saved EBX. */
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_data_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_data_cache_size, %ebx
+# endif
+#endif
+ mov %ebx, %edi
+ shr $4, %ebx
+ sub %ebx, %edi /* EDI = cache size - cache size / 16. */
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx) /* Pairs with the PUSH (%ebx) taken in these configurations. */
+#endif
+/*
+ * When the data size approaches the size of the L1 cache,
+ * fast string will prefetch and combine data efficiently.
+ */
+ cmp %edi, %ecx
+ jae L(128bytesormore_nt) /* ECX >= EDI (unsigned): take the rep stos path. */
+ subl $128, %ecx /* Bias ECX by -128 so that each sub below also tests for one more full block. */
+L(128bytesormore_normal): /* Loop body is unrolled twice; each half stores 128 bytes. */
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx /* lea and the stores leave the flags of the sub intact for the jump below. */
+ jl L(128bytesless_normal) /* Fewer than 128 bytes are left. */
+
+
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jge L(128bytesormore_normal) /* At least 128 bytes are left: go round again. */
+
+L(128bytesless_normal):
+ POP (%edi)
+ lea 128(%ecx), %ecx /* Undo the bias: ECX = bytes left, 0..127. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore_nt): /* rep stos path; the caller's EDI was pushed at L(128bytesormore). */
+ mov %edx, %edi /* EDI = store pointer used by stos. */
+ mov %ecx, %edx
+ shr $2, %ecx /* ECX = number of whole dwords. */
+ and $3, %edx /* EDX = 0..3 trailing bytes; this sets ZF for the jz after the rep. */
+ rep stosl /* Stores EAX ECX times; it does not modify the flags. */
+ jz L(copy_page_by_rep_exit) /* No trailing bytes. */
+ cmp $2, %edx
+ jb L(copy_page_by_rep_left_1) /* Exactly one trailing byte. */
+ movw %ax, (%edi)
+ add $2, %edi
+ sub $2, %edx
+ jz L(copy_page_by_rep_exit) /* There were exactly two trailing bytes. */
+L(copy_page_by_rep_left_1):
+ movb %al, (%edi)
+L(copy_page_by_rep_exit):
+ POP (%edi)
+ SETRTNVAL
+ RETURN
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes): /* Entry N (N = 0..127) is the handler L(aligned_16_Nbytes); indexed by the remaining byte count in ECX. */
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes): /* Handlers for counts 0..127: EDX = one past the last byte, xmm0/EAX = pattern; labels fall through in steps of 16. */
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes): /* EDX = aligned start + count at every dispatch site, so EDX - count is 16-byte aligned even for odd counts. */
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx) /* From 8 residual bytes up, the low 8 bytes of xmm0 are stored with one movq. */
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END /* Final return of the function: RETURN_END omits the trailing CFI_PUSH that RETURN adds. */
+
+END (__memset_sse2_rep)
+
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memset-sse2.S b/libc/sysdeps/i386/i686/multiarch/memset-sse2.S
new file mode 100644
index 000000000..b2b979193
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -0,0 +1,867 @@
+/* memset with SSE2
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0) /* Unwind annotations for a 4-byte push of REG. */
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG) /* Unwind annotations for a 4-byte pop of REG. */
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG) /* pushl plus the matching unwind annotations. */
+#define POP(REG) popl REG; CFI_POP (REG) /* popl plus the matching unwind annotations. */
+
+#ifdef USE_AS_BZERO /* Stack arguments are DEST, LEN; SETRTNVAL expands to nothing. */
+# define DEST PARMS
+# define LEN DEST+4
+# define SETRTNVAL
+#else /* Stack arguments are DEST, CHR, LEN; SETRTNVAL reloads DEST into EAX. */
+# define DEST PARMS
+# define CHR DEST+4
+# define LEN CHR+4
+# define SETRTNVAL movl DEST(%esp), %eax /* Offset is PARMS-based, so ESP must be at its post-ENTRANCE depth. */
+#endif
+
+#ifdef SHARED
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx) /* CFI_PUSH after the ret re-states "EBX is pushed" for the code that follows. */
+# define PARMS 8 /* Skip the return address and the EBX saved by ENTRANCE. */
+# define JMPTBL(I, B) I - B /* Table entries are offsets relative to the table base B. */
+
+/* Branch to entry number ECX of TABLE, a jump table of offsets relative
+ to TABLE itself. EDX is advanced by ECX first; EBX is clobbered. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ /* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+ /* Get the address of the jump table. */ \
+ add $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ add (%ebx,%ecx,4), %ebx; \
+ add %ecx, %edx; \
+ /* We loaded the jump table and adjusted EDX. Go. */ \
+ jmp *%ebx
+
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ ALIGN (4)
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx: /* PC thunk: hands the caller's return address back in EBX. */
+ movl (%esp), %ebx
+ ret
+#else /* !SHARED: EBX is not saved at entry and table entries are absolute. */
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define PARMS 4 /* Skip the return address only. */
+# define JMPTBL(I, B) I
+
+/* Branch to entry number ECX of TABLE, a jump table of absolute
+ addresses. EDX is advanced by ECX first. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
+ add %ecx, %edx; \
+ jmp *TABLE(,%ecx,4)
+#endif
+
+ .section .text.sse2,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_sse2) /* Checked entry point; it has no ret, so on success execution continues into the code that follows. */
+ movl 12(%esp), %eax /* Third stack argument (presumably the length -- confirm against the caller). */
+ cmpl %eax, 16(%esp) /* Fourth stack argument (presumably the destination object size) minus the third. */
+ jb HIDDEN_JUMPTARGET (__chk_fail) /* Fourth < third, unsigned: go to __chk_fail. */
+END (__memset_chk_sse2)
+#endif
+ENTRY (__memset_sse2)
+ ENTRANCE
+
+ movl LEN(%esp), %ecx
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl CHR(%esp), %eax
+ movb %al, %ah
+ /* Fill the whole EAX with pattern. */
+ movl %eax, %edx
+ shl $16, %eax
+ or %edx, %eax
+#endif
+ movl DEST(%esp), %edx
+ cmp $32, %ecx
+ jae L(32bytesormore)
+
+L(write_less32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_less_32bytes):
+ .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
+ .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
+ .popsection
+
+ ALIGN (4)
+L(write_28bytes):
+ movl %eax, -28(%edx)
+L(write_24bytes):
+ movl %eax, -24(%edx)
+L(write_20bytes):
+ movl %eax, -20(%edx)
+L(write_16bytes):
+ movl %eax, -16(%edx)
+L(write_12bytes):
+ movl %eax, -12(%edx)
+L(write_8bytes):
+ movl %eax, -8(%edx)
+L(write_4bytes):
+ movl %eax, -4(%edx)
+L(write_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_29bytes):
+ movl %eax, -29(%edx)
+L(write_25bytes):
+ movl %eax, -25(%edx)
+L(write_21bytes):
+ movl %eax, -21(%edx)
+L(write_17bytes):
+ movl %eax, -17(%edx)
+L(write_13bytes):
+ movl %eax, -13(%edx)
+L(write_9bytes):
+ movl %eax, -9(%edx)
+L(write_5bytes):
+ movl %eax, -5(%edx)
+L(write_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_30bytes):
+ movl %eax, -30(%edx)
+L(write_26bytes):
+ movl %eax, -26(%edx)
+L(write_22bytes):
+ movl %eax, -22(%edx)
+L(write_18bytes):
+ movl %eax, -18(%edx)
+L(write_14bytes):
+ movl %eax, -14(%edx)
+L(write_10bytes):
+ movl %eax, -10(%edx)
+L(write_6bytes):
+ movl %eax, -6(%edx)
+L(write_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(write_31bytes):
+ movl %eax, -31(%edx)
+L(write_27bytes):
+ movl %eax, -27(%edx)
+L(write_23bytes):
+ movl %eax, -23(%edx)
+L(write_19bytes):
+ movl %eax, -19(%edx)
+L(write_15bytes):
+ movl %eax, -15(%edx)
+L(write_11bytes):
+ movl %eax, -11(%edx)
+L(write_7bytes):
+ movl %eax, -7(%edx)
+L(write_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+/* ECX >= 32 (unsigned); EDX is the destination, alignment not yet known. */
+L(32bytesormore):
+ /* Fill xmm0 with the pattern. */
+#ifdef USE_AS_BZERO
+ pxor %xmm0, %xmm0
+#else
+ movd %eax, %xmm0
+ punpcklbw %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ testl $0xf, %edx
+ jz L(aligned_16)
+/* ECX >= 32 and EDX is not 16 byte aligned. */
+L(not_aligned_16):
+ movdqu %xmm0, (%edx) /* Store 16 bytes at the unaligned start; the part past the boundary is rewritten later. */
+ movl %edx, %eax
+ and $-16, %edx
+ add $16, %edx /* EDX = next 16-byte boundary above the start. */
+ sub %edx, %eax /* EAX = start - boundary, i.e. -(bytes already covered). */
+ add %eax, %ecx /* ECX = bytes still to store from the boundary on. */
+ movd %xmm0, %eax /* EAX was used as scratch: reload the 4-byte pattern. */
+
+ ALIGN (4)
+L(aligned_16):
+ cmp $128, %ecx
+ jae L(128bytesormore) /* Unsigned like the 32-byte test: a signed jge would send counts >= 2GB into the table. */
+
+L(aligned_16_less128bytes): /* ECX is 0..127 and EDX is 16-byte aligned. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytesormore):
+#ifdef SHARED_CACHE_SIZE
+ PUSH (%ebx)
+ mov $SHARED_CACHE_SIZE, %ebx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
+# else
+ PUSH (%ebx)
+ mov __x86_shared_cache_size, %ebx
+# endif
+#endif
+ cmp %ebx, %ecx
+ jae L(128bytesormore_nt_start)
+
+
+#ifdef DATA_CACHE_SIZE
+ POP (%ebx)
+ cmp $DATA_CACHE_SIZE, %ecx
+#else
+# ifdef SHARED
+ call __i686.get_pc_thunk.bx
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
+# else
+ POP (%ebx)
+ cmp __x86_data_cache_size, %ecx
+# endif
+#endif
+
+ jae L(128bytes_L2_normal)
+ subl $128, %ecx /* Bias ECX by -128: in the loop ECX = (bytes remaining - 128). */
+L(128bytesormore_normal):
+ sub $128, %ecx /* Flags from this SUB feed the JL below; MOVDQA/LEA leave them intact. */
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jl L(128bytesless_normal) /* Fewer than 128 bytes remain after this store group. */
+
+
+ sub $128, %ecx /* Second unrolled copy of the 128-byte store group. */
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ lea 128(%edx), %edx
+ jge L(128bytesormore_normal) /* At least 128 bytes still remain: loop. */
+
+L(128bytesless_normal):
+ lea 128(%ecx), %ecx /* Undo the -128 bias: ECX = true remaining byte count. */
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+ ALIGN (4)
+L(128bytes_L2_normal):
+ prefetcht0 0x380(%edx)
+ prefetcht0 0x3c0(%edx)
+ sub $128, %ecx
+ movdqa %xmm0, (%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
+ add $128, %edx
+ cmp $128, %ecx
+ jge L(128bytes_L2_normal)
+
+L(128bytesless_L2_normal):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+L(128bytesormore_nt_start):
+ sub %ebx, %ecx
+ ALIGN (4)
+L(128bytesormore_shared_cache_loop):
+ prefetcht0 0x3c0(%edx)
+ prefetcht0 0x380(%edx)
+ sub $0x80, %ebx
+ movdqa %xmm0, (%edx)
+ movdqa %xmm0, 0x10(%edx)
+ movdqa %xmm0, 0x20(%edx)
+ movdqa %xmm0, 0x30(%edx)
+ movdqa %xmm0, 0x40(%edx)
+ movdqa %xmm0, 0x50(%edx)
+ movdqa %xmm0, 0x60(%edx)
+ movdqa %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ebx
+ jge L(128bytesormore_shared_cache_loop)
+ cmp $0x80, %ecx
+ jb L(shared_cache_loop_end)
+ ALIGN (4)
+L(128bytesormore_nt):
+ sub $0x80, %ecx
+ movntdq %xmm0, (%edx)
+ movntdq %xmm0, 0x10(%edx)
+ movntdq %xmm0, 0x20(%edx)
+ movntdq %xmm0, 0x30(%edx)
+ movntdq %xmm0, 0x40(%edx)
+ movntdq %xmm0, 0x50(%edx)
+ movntdq %xmm0, 0x60(%edx)
+ movntdq %xmm0, 0x70(%edx)
+ add $0x80, %edx
+ cmp $0x80, %ecx
+ jge L(128bytesormore_nt)
+ sfence
+L(shared_cache_loop_end):
+#if defined DATA_CACHE_SIZE || !defined SHARED
+ POP (%ebx)
+#endif
+ BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
+
+
+ .pushsection .rodata.sse2,"a",@progbits
+ ALIGN (2)
+L(table_16_128bytes):
+ .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
+ .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
+ .popsection
+
+ ALIGN (4)
+L(aligned_16_112bytes):
+ movdqa %xmm0, -112(%edx)
+L(aligned_16_96bytes):
+ movdqa %xmm0, -96(%edx)
+L(aligned_16_80bytes):
+ movdqa %xmm0, -80(%edx)
+L(aligned_16_64bytes):
+ movdqa %xmm0, -64(%edx)
+L(aligned_16_48bytes):
+ movdqa %xmm0, -48(%edx)
+L(aligned_16_32bytes):
+ movdqa %xmm0, -32(%edx)
+L(aligned_16_16bytes):
+ movdqa %xmm0, -16(%edx)
+L(aligned_16_0bytes):
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_113bytes):
+ movdqa %xmm0, -113(%edx)
+L(aligned_16_97bytes):
+ movdqa %xmm0, -97(%edx)
+L(aligned_16_81bytes):
+ movdqa %xmm0, -81(%edx)
+L(aligned_16_65bytes):
+ movdqa %xmm0, -65(%edx)
+L(aligned_16_49bytes):
+ movdqa %xmm0, -49(%edx)
+L(aligned_16_33bytes):
+ movdqa %xmm0, -33(%edx)
+L(aligned_16_17bytes):
+ movdqa %xmm0, -17(%edx)
+L(aligned_16_1bytes):
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_114bytes):
+ movdqa %xmm0, -114(%edx)
+L(aligned_16_98bytes):
+ movdqa %xmm0, -98(%edx)
+L(aligned_16_82bytes):
+ movdqa %xmm0, -82(%edx)
+L(aligned_16_66bytes):
+ movdqa %xmm0, -66(%edx)
+L(aligned_16_50bytes):
+ movdqa %xmm0, -50(%edx)
+L(aligned_16_34bytes):
+ movdqa %xmm0, -34(%edx)
+L(aligned_16_18bytes):
+ movdqa %xmm0, -18(%edx)
+L(aligned_16_2bytes):
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_115bytes):
+ movdqa %xmm0, -115(%edx)
+L(aligned_16_99bytes):
+ movdqa %xmm0, -99(%edx)
+L(aligned_16_83bytes):
+ movdqa %xmm0, -83(%edx)
+L(aligned_16_67bytes):
+ movdqa %xmm0, -67(%edx)
+L(aligned_16_51bytes):
+ movdqa %xmm0, -51(%edx)
+L(aligned_16_35bytes):
+ movdqa %xmm0, -35(%edx)
+L(aligned_16_19bytes):
+ movdqa %xmm0, -19(%edx)
+L(aligned_16_3bytes):
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_116bytes):
+ movdqa %xmm0, -116(%edx)
+L(aligned_16_100bytes):
+ movdqa %xmm0, -100(%edx)
+L(aligned_16_84bytes):
+ movdqa %xmm0, -84(%edx)
+L(aligned_16_68bytes):
+ movdqa %xmm0, -68(%edx)
+L(aligned_16_52bytes):
+ movdqa %xmm0, -52(%edx)
+L(aligned_16_36bytes):
+ movdqa %xmm0, -36(%edx)
+L(aligned_16_20bytes):
+ movdqa %xmm0, -20(%edx)
+L(aligned_16_4bytes):
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_117bytes):
+ movdqa %xmm0, -117(%edx)
+L(aligned_16_101bytes):
+ movdqa %xmm0, -101(%edx)
+L(aligned_16_85bytes):
+ movdqa %xmm0, -85(%edx)
+L(aligned_16_69bytes):
+ movdqa %xmm0, -69(%edx)
+L(aligned_16_53bytes):
+ movdqa %xmm0, -53(%edx)
+L(aligned_16_37bytes):
+ movdqa %xmm0, -37(%edx)
+L(aligned_16_21bytes):
+ movdqa %xmm0, -21(%edx)
+L(aligned_16_5bytes):
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_118bytes):
+ movdqa %xmm0, -118(%edx)
+L(aligned_16_102bytes):
+ movdqa %xmm0, -102(%edx)
+L(aligned_16_86bytes):
+ movdqa %xmm0, -86(%edx)
+L(aligned_16_70bytes):
+ movdqa %xmm0, -70(%edx)
+L(aligned_16_54bytes):
+ movdqa %xmm0, -54(%edx)
+L(aligned_16_38bytes):
+ movdqa %xmm0, -38(%edx)
+L(aligned_16_22bytes):
+ movdqa %xmm0, -22(%edx)
+L(aligned_16_6bytes):
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_119bytes):
+ movdqa %xmm0, -119(%edx)
+L(aligned_16_103bytes):
+ movdqa %xmm0, -103(%edx)
+L(aligned_16_87bytes):
+ movdqa %xmm0, -87(%edx)
+L(aligned_16_71bytes):
+ movdqa %xmm0, -71(%edx)
+L(aligned_16_55bytes):
+ movdqa %xmm0, -55(%edx)
+L(aligned_16_39bytes):
+ movdqa %xmm0, -39(%edx)
+L(aligned_16_23bytes):
+ movdqa %xmm0, -23(%edx)
+L(aligned_16_7bytes):
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_120bytes):
+ movdqa %xmm0, -120(%edx)
+L(aligned_16_104bytes):
+ movdqa %xmm0, -104(%edx)
+L(aligned_16_88bytes):
+ movdqa %xmm0, -88(%edx)
+L(aligned_16_72bytes):
+ movdqa %xmm0, -72(%edx)
+L(aligned_16_56bytes):
+ movdqa %xmm0, -56(%edx)
+L(aligned_16_40bytes):
+ movdqa %xmm0, -40(%edx)
+L(aligned_16_24bytes):
+ movdqa %xmm0, -24(%edx)
+L(aligned_16_8bytes):
+ movq %xmm0, -8(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_121bytes):
+ movdqa %xmm0, -121(%edx)
+L(aligned_16_105bytes):
+ movdqa %xmm0, -105(%edx)
+L(aligned_16_89bytes):
+ movdqa %xmm0, -89(%edx)
+L(aligned_16_73bytes):
+ movdqa %xmm0, -73(%edx)
+L(aligned_16_57bytes):
+ movdqa %xmm0, -57(%edx)
+L(aligned_16_41bytes):
+ movdqa %xmm0, -41(%edx)
+L(aligned_16_25bytes):
+ movdqa %xmm0, -25(%edx)
+L(aligned_16_9bytes):
+ movq %xmm0, -9(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_122bytes):
+ movdqa %xmm0, -122(%edx)
+L(aligned_16_106bytes):
+ movdqa %xmm0, -106(%edx)
+L(aligned_16_90bytes):
+ movdqa %xmm0, -90(%edx)
+L(aligned_16_74bytes):
+ movdqa %xmm0, -74(%edx)
+L(aligned_16_58bytes):
+ movdqa %xmm0, -58(%edx)
+L(aligned_16_42bytes):
+ movdqa %xmm0, -42(%edx)
+L(aligned_16_26bytes):
+ movdqa %xmm0, -26(%edx)
+L(aligned_16_10bytes):
+ movq %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_123bytes):
+ movdqa %xmm0, -123(%edx)
+L(aligned_16_107bytes):
+ movdqa %xmm0, -107(%edx)
+L(aligned_16_91bytes):
+ movdqa %xmm0, -91(%edx)
+L(aligned_16_75bytes):
+ movdqa %xmm0, -75(%edx)
+L(aligned_16_59bytes):
+ movdqa %xmm0, -59(%edx)
+L(aligned_16_43bytes):
+ movdqa %xmm0, -43(%edx)
+L(aligned_16_27bytes):
+ movdqa %xmm0, -27(%edx)
+L(aligned_16_11bytes):
+ movq %xmm0, -11(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_124bytes):
+ movdqa %xmm0, -124(%edx)
+L(aligned_16_108bytes):
+ movdqa %xmm0, -108(%edx)
+L(aligned_16_92bytes):
+ movdqa %xmm0, -92(%edx)
+L(aligned_16_76bytes):
+ movdqa %xmm0, -76(%edx)
+L(aligned_16_60bytes):
+ movdqa %xmm0, -60(%edx)
+L(aligned_16_44bytes):
+ movdqa %xmm0, -44(%edx)
+L(aligned_16_28bytes):
+ movdqa %xmm0, -28(%edx)
+L(aligned_16_12bytes):
+ movq %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_125bytes):
+ movdqa %xmm0, -125(%edx)
+L(aligned_16_109bytes):
+ movdqa %xmm0, -109(%edx)
+L(aligned_16_93bytes):
+ movdqa %xmm0, -93(%edx)
+L(aligned_16_77bytes):
+ movdqa %xmm0, -77(%edx)
+L(aligned_16_61bytes):
+ movdqa %xmm0, -61(%edx)
+L(aligned_16_45bytes):
+ movdqa %xmm0, -45(%edx)
+L(aligned_16_29bytes):
+ movdqa %xmm0, -29(%edx)
+L(aligned_16_13bytes):
+ movq %xmm0, -13(%edx)
+ movl %eax, -5(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_126bytes):
+ movdqa %xmm0, -126(%edx)
+L(aligned_16_110bytes):
+ movdqa %xmm0, -110(%edx)
+L(aligned_16_94bytes):
+ movdqa %xmm0, -94(%edx)
+L(aligned_16_78bytes):
+ movdqa %xmm0, -78(%edx)
+L(aligned_16_62bytes):
+ movdqa %xmm0, -62(%edx)
+L(aligned_16_46bytes):
+ movdqa %xmm0, -46(%edx)
+L(aligned_16_30bytes):
+ movdqa %xmm0, -30(%edx)
+L(aligned_16_14bytes):
+ movq %xmm0, -14(%edx)
+ movl %eax, -6(%edx)
+ movw %ax, -2(%edx)
+ SETRTNVAL
+ RETURN
+
+ ALIGN (4)
+L(aligned_16_127bytes):
+ movdqa %xmm0, -127(%edx)
+L(aligned_16_111bytes):
+ movdqa %xmm0, -111(%edx)
+L(aligned_16_95bytes):
+ movdqa %xmm0, -95(%edx)
+L(aligned_16_79bytes):
+ movdqa %xmm0, -79(%edx)
+L(aligned_16_63bytes):
+ movdqa %xmm0, -63(%edx)
+L(aligned_16_47bytes):
+ movdqa %xmm0, -47(%edx)
+L(aligned_16_31bytes):
+ movdqa %xmm0, -31(%edx)
+L(aligned_16_15bytes):
+ movq %xmm0, -15(%edx)
+ movl %eax, -7(%edx)
+ movw %ax, -3(%edx)
+ movb %al, -1(%edx)
+ SETRTNVAL
+ RETURN_END
+
+END (__memset_sse2)
+
+#endif
diff --git a/libc/sysdeps/i386/i686/multiarch/memset.S b/libc/sysdeps/i386/i686/multiarch/memset.S
new file mode 100644
index 000000000..34dddcef7
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memset.S
@@ -0,0 +1,112 @@
+/* Multiple versions of memset
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx /* EBX = this thunk's return address. */
+ ret
+
+ .text
+ENTRY(memset) /* Resolver: leaves the address of the chosen memset variant in EAX. */
+ .type memset, @gnu_indirect_function
+ pushl %ebx /* Preserve EBX; it is used as the GOT base below. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = GOT base for the @GOTOFF operands. */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f /* Field at KIND_OFFSET is nonzero: skip the init call. */
+ call __init_cpu_features
+1: leal __memset_ia32@GOTOFF(%ebx), %eax /* Default choice: __memset_ia32. */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* bit_SSE2 clear: keep the default. */
+ leal __memset_sse2@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* bit_Fast_Rep_String clear: keep __memset_sse2. */
+ leal __memset_sse2_rep@GOTOFF(%ebx), %eax
+2: popl %ebx /* Restore EBX; the result stays in EAX. */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(memset)
+# else
+ .text
+ENTRY(memset) /* Non-SHARED resolver: same selection, using absolute addresses. */
+ .type memset, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f /* Field at KIND_OFFSET is nonzero: skip the init call. */
+ call __init_cpu_features
+1: leal __memset_ia32, %eax /* Default choice: __memset_ia32. */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f /* bit_SSE2 clear: keep the default. */
+ leal __memset_sse2, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f /* bit_Fast_Rep_String clear: keep __memset_sse2. */
+ leal __memset_sse2_rep, %eax
+2: ret /* Selected implementation address is in EAX. */
+END(memset)
+# endif
+
+# undef ENTRY /* From here on ENTRY(name) ignores name and opens __memset_ia32. */
+# define ENTRY(name) \
+ .type __memset_ia32, @function; \
+ .globl __memset_ia32; \
+ .p2align 4; \
+ __memset_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END /* END(name) likewise always closes __memset_ia32. */
+# define END(name) \
+ cfi_endproc; .size __memset_ia32, .-__memset_ia32
+
+# undef ENTRY_CHK /* ENTRY_CHK(name) always opens __memset_chk_ia32. */
+# define ENTRY_CHK(name) \
+ .type __memset_chk_ia32, @function; \
+ .globl __memset_chk_ia32; \
+ .p2align 4; \
+ __memset_chk_ia32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK /* END_CHK(name) always closes __memset_chk_ia32. */
+# define END_CHK(name) \
+ cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def /* Replacement below binds __GI_memset to __memset_ia32. */
+/* IFUNC doesn't work with the hidden functions in shared library since
+ they will be called without setting up EBX needed for PLT which is
+ used by IFUNC. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_ia32
+# endif
+
+# undef strong_alias /* Redefined to expand to nothing from here on. */
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
diff --git a/libc/sysdeps/i386/i686/multiarch/memset_chk.S b/libc/sysdeps/i386/i686/multiarch/memset_chk.S
new file mode 100644
index 000000000..d659c7e56
--- /dev/null
+++ b/libc/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -0,0 +1,116 @@
+/* Multiple versions of __memset_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx /* EBX = this thunk's return address. */
+ ret
+
+ .text
+ENTRY(__memset_chk) /* Resolver: leaves the address of the chosen __memset_chk variant in EAX. */
+ .type __memset_chk, @gnu_indirect_function
+ pushl %ebx /* Preserve EBX; it is used as the GOT base below. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* EBX = GOT base for the @GOTOFF operands. */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f /* Field at KIND_OFFSET is nonzero: skip the init call. */
+ call __init_cpu_features
+1: leal __memset_chk_ia32@GOTOFF(%ebx), %eax /* Default choice: __memset_chk_ia32. */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* bit_SSE2 clear: keep the default. */
+ leal __memset_chk_sse2@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* bit_Fast_Rep_String clear: keep __memset_chk_sse2. */
+ leal __memset_chk_sse2_rep@GOTOFF(%ebx), %eax
+2: popl %ebx /* Restore EBX; the result stays in EAX. */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(__memset_chk)
+
+strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
+ .section .gnu.warning.__memset_zero_constant_len_parameter /* GNU ld prints the string below when the alias is referenced. */
+ .string "memset used with constant zero length parameter; this could be due to transposed parameters"
+# else
+ .text
+ENTRY(__memset_chk) /* Non-SHARED resolver: same selection, using absolute addresses. */
+ .type __memset_chk, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f /* Field at KIND_OFFSET is nonzero: skip the init call. */
+ call __init_cpu_features
+1: leal __memset_chk_ia32, %eax /* Default choice: __memset_chk_ia32. */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f /* bit_SSE2 clear: keep the default. */
+ leal __memset_chk_sse2, %eax
+ testl $bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+ jz 2f /* bit_Fast_Rep_String clear: keep __memset_chk_sse2. */
+ leal __memset_chk_sse2_rep, %eax
+2: ret /* Selected implementation address is in EAX. */
+END(__memset_chk)
+
+ .type __memset_chk_sse2, @function
+ .p2align 4;
+__memset_chk_sse2: /* Checking wrapper: validate, then tail-jump to __memset_sse2. */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax /* Third stack argument (presumably len of __memset_chk (dst, c, len, dstlen)). */
+ cmpl %eax, 16(%esp) /* Compare the fourth stack argument against it (unsigned). */
+ jb __chk_fail /* Fourth argument below the third: jump to __chk_fail. */
+ jmp __memset_sse2 /* Tail-jump; the stack arguments are left untouched. */
+ cfi_endproc
+ .size __memset_chk_sse2, .-__memset_chk_sse2
+
+ .type __memset_chk_sse2_rep, @function
+ .p2align 4;
+__memset_chk_sse2_rep: /* Same check, then tail-jump to __memset_sse2_rep. */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_sse2_rep
+ cfi_endproc
+ .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep
+
+ .type __memset_chk_ia32, @function
+ .p2align 4;
+__memset_chk_ia32: /* Same check, then tail-jump to __memset_ia32. */
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_ia32
+ cfi_endproc
+ .size __memset_chk_ia32, .-__memset_chk_ia32
+# endif
+#endif
diff --git a/libc/sysdeps/i386/sysdep.h b/libc/sysdeps/i386/sysdep.h
index e03a8e926..efdc82dde 100644
--- a/libc/sysdeps/i386/sysdep.h
+++ b/libc/sysdeps/i386/sysdep.h
@@ -67,6 +67,9 @@
ASM_SIZE_DIRECTIVE(name) \
STABS_FUN_END(name)
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
#ifdef HAVE_CPP_ASM_DEBUGINFO
/* Disable that goop, because we just pass -g through to the assembler
and it generates proper line number information directly. */
diff --git a/libc/sysdeps/ieee754/ldbl-128/s_ceill.c b/libc/sysdeps/ieee754/ldbl-128/s_ceill.c
index 76bda9fc0..71f2fee91 100644
--- a/libc/sysdeps/ieee754/ldbl-128/s_ceill.c
+++ b/libc/sysdeps/ieee754/ldbl-128/s_ceill.c
@@ -30,9 +30,9 @@ static char rcsid[] = "$NetBSD: $";
#include "math_private.h"
#ifdef __STDC__
-static const long double huge = 1.0e4930;
+static const long double huge = 1.0e4930L;
#else
-static long double huge = 1.0e4930;
+static long double huge = 1.0e4930L;
#endif
#ifdef __STDC__
diff --git a/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c b/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c
index a82489bb2..dec6404af 100644
--- a/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c
+++ b/libc/sysdeps/ieee754/ldbl-128/s_expm1l.c
@@ -86,7 +86,7 @@ static const long double
/* ln (2^16384 * (1 - 2^-113)) */
maxlog = 1.1356523406294143949491931077970764891253E4L,
/* ln 2^-114 */
- minarg = -7.9018778583833765273564461846232128760607E1L, big = 2e4932L;
+ minarg = -7.9018778583833765273564461846232128760607E1L, big = 1e4932L;
long double
diff --git a/libc/sysdeps/ieee754/ldbl-128/s_floorl.c b/libc/sysdeps/ieee754/ldbl-128/s_floorl.c
index ff5b98da9..2a60a79fc 100644
--- a/libc/sysdeps/ieee754/ldbl-128/s_floorl.c
+++ b/libc/sysdeps/ieee754/ldbl-128/s_floorl.c
@@ -30,9 +30,9 @@ static char rcsid[] = "$NetBSD: $";
#include "math_private.h"
#ifdef __STDC__
-static const long double huge = 1.0e4930;
+static const long double huge = 1.0e4930L;
#else
-static long double huge = 1.0e4930;
+static long double huge = 1.0e4930L;
#endif
#ifdef __STDC__
diff --git a/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c b/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c
index 4480dc9f2..6e50575ac 100644
--- a/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c
+++ b/libc/sysdeps/ieee754/ldbl-128/s_log1pl.c
@@ -120,7 +120,6 @@ static const long double C2 = 1.428606820309417232121458176568075500134E-6L;
static const long double sqrth = 0.7071067811865475244008443621048490392848L;
/* ln (2^16384 * (1 - 2^-113)) */
static const long double maxlog = 1.1356523406294143949491931077970764891253E4L;
-static const long double big = 2e4932L;
static const long double zero = 0.0L;
long double
diff --git a/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c b/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c
index 1a22e0102..1f37d80e0 100644
--- a/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c
+++ b/libc/sysdeps/ieee754/ldbl-128/s_nexttowardf.c
@@ -44,10 +44,12 @@ static char rcsid[] = "$NetBSD: $";
return x+y;
if((long double) x==y) return y; /* x=y, return y */
if(ix==0) { /* x == 0 */
- float x2;
+ float u;
SET_FLOAT_WORD(x,(u_int32_t)((hy>>32)&0x80000000)|1);/* return +-minsub*/
- x2 = x*x;
- if(x2==x) return x2; else return x; /* raise underflow flag */
+ u = math_opt_barrier (x);
+ u = u * u;
+ math_force_eval (u); /* raise underflow flag */
+ return x;
}
if(hx>=0) { /* x > 0 */
if(hy<0||(ix>>23)>(iy>>48)-0x3f80
@@ -68,12 +70,9 @@ static char rcsid[] = "$NetBSD: $";
}
hy = hx&0x7f800000;
if(hy>=0x7f800000) return x+x; /* overflow */
- if(hy<0x00800000) { /* underflow */
- float x2 = x*x;
- if(x2!=x) { /* raise underflow flag */
- SET_FLOAT_WORD(x2,hx);
- return x2;
- }
+ if(hy<0x00800000) {
+ float u = x*x; /* underflow */
+ math_force_eval (u); /* raise underflow flag */
}
SET_FLOAT_WORD(x,hx);
return x;
diff --git a/libc/sysdeps/mach/hurd/bits/libc-lock.h b/libc/sysdeps/mach/hurd/bits/libc-lock.h
index 0fa90bcc3..90e46e02f 100644
--- a/libc/sysdeps/mach/hurd/bits/libc-lock.h
+++ b/libc/sysdeps/mach/hurd/bits/libc-lock.h
@@ -31,6 +31,7 @@ typedef struct
void *owner;
int count;
} __libc_lock_recursive_t;
+typedef __libc_lock_recursive_t __rtld_lock_recursive_t;
#define __libc_lock_owner_self() ((void *) __hurd_threadvar_location (0))
@@ -121,6 +122,8 @@ typedef struct __libc_lock_recursive_opaque__ __libc_lock_recursive_t;
#define __rtld_lock_init_recursive(NAME) \
__libc_lock_init_recursive (NAME)
+#define __rtld_lock_initialize(NAME) \
+ (void) ((NAME) = (__rtld_lock_recursive_t) _RTLD_LOCK_RECURSIVE_INITIALIZER)
#define __rtld_lock_trylock_recursive(NAME) \
__libc_lock_trylock_recursive (NAME)
#define __rtld_lock_lock_recursive(NAME) \
diff --git a/libc/sysdeps/mach/hurd/bits/stat.h b/libc/sysdeps/mach/hurd/bits/stat.h
index c3f96660c..b64a658fb 100644
--- a/libc/sysdeps/mach/hurd/bits/stat.h
+++ b/libc/sysdeps/mach/hurd/bits/stat.h
@@ -1,4 +1,5 @@
-/* Copyright (C) 1992,93,94,96,97,99,2000,2005 Free Software Foundation, Inc.
+/* Copyright (C) 1992-1994,1996,1997,1999,2000,2005,2010
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +17,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
#include <bits/types.h>
/* NOTE: The size of this structure (32 ints) is known in
@@ -192,5 +196,7 @@ struct stat64
/* Default file creation mask (umask). */
#ifdef __USE_BSD
-#define CMASK 0022
+# define CMASK 0022
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S b/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S
new file mode 100644
index 000000000..cc1da99fd
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6 /* number of SRC cache lines prefetched ahead */
+#define ZERO_AHEAD 4 /* number of DST cache lines zeroed ahead */
+
+/* memcpy routine optimized for CELL-BE-PPC v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
+ * latency to memory is >400 clocks
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency
+ * For best performance instruction forms ending in "." like "andi."
+ * should be avoided as they are implemented in microcode on CELL.
+ * The below code is loop unrolled for the CELL cache line of 128 bytes
+ */
+
+.align 7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT
+
+ dcbt 0,r4 /* Prefetch ONE SRC cacheline */
+ cmplwi cr1,r5,16 /* is size < 16 ? */
+ mr r6,r3 /* r6 = working DST pointer (r3 is not written below) */
+ blt+ cr1,.Lshortcopy
+
+.Lbigcopy:
+ neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
+ clrlwi r8,r8,32-4 /* align to 16byte boundary */
+ sub r7,r4,r3 /* r7 = SRC - DST, used as index for the loads below */
+ cmplwi cr0,r8,0
+ beq+ .Ldst_aligned
+
+.Ldst_unaligned:
+ mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
+ subf r5,r8,r5 /* size -= bytes copied to reach the boundary */
+
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+ addi r6,r6,1
+1: bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2: bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4: bf cr7*4+0,8f
+ lfdx fp9,r7,r6 /* copy 8 byte */
+ stfd fp9,0(r6)
+ addi r6,r6,8
+8:
+ add r4,r7,r6 /* r4 = SRC pointer matching the advanced DST */
+
+.Ldst_aligned:
+
+ cmpwi cr5,r5,128-1 /* cr5: remaining size <= 127 ? (tested below) */
+
+ neg r7,r6 /* low 7 bits = # bytes to next cache-line boundary */
+ addi r6,r6,-8 /* prepare for stfdu */
+ addi r4,r4,-8 /* prepare for lfdu */
+
+ clrlwi r7,r7,32-7 /* align to cacheline boundary */
+ ble+ cr5,.Llessthancacheline
+
+ cmplwi cr6,r7,0 /* cr6.eq: DST already on a cache-line boundary */
+ subf r5,r7,r5 /* size -= bytes needed to reach the boundary */
+ srwi r7,r7,4 /* # of 16-byte chunks up to the boundary */
+ srwi r10,r5,7 /* number of cache lines to copy */
+
+ cmplwi r10,0
+ li r11,0 /* number cachelines to copy with prefetch */
+ beq .Lnocacheprefetch
+
+ cmplwi r10,PREFETCH_AHEAD
+ li r12,128+8 /* prefetch distance */
+ ble .Llessthanmaxprefetch
+
+ subi r11,r10,PREFETCH_AHEAD /* r11 = lines beyond the initial prefetch window */
+ li r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+ mtctr r10
+
+.LprefetchSRC:
+ dcbt r12,r4
+ addi r12,r12,128
+ bdnz .LprefetchSRC
+
+.Lnocacheprefetch:
+ mtctr r7 /* ctr = 16-byte chunks needed to reach the boundary */
+ cmplwi cr1,r5,128 /* cr1: at least one whole cache line left ? */
+ clrlwi r5,r5,32-7 /* r5 = size modulo 128 (tail bytes) */
+ beq cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+ lfd fp9,0x08(r4)
+ lfdu fp10,0x10(r4)
+ stfd fp9,0x08(r6)
+ stfdu fp10,0x10(r6)
+ bdnz .Laligntocacheline
+
+
+.Lcachelinealigned: /* copy while cache lines */
+
+ blt- cr1,.Llessthancacheline /* size <128 */
+
+.Louterloop:
+ cmpwi r11,0
+ mtctr r11
+ beq- .Lendloop
+
+ li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
+
+.align 4
+ /* Copy whole cachelines, optimized by prefetching SRC cacheline */
+.Lloop: /* Copy aligned body */
+ dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
+ lfd fp9, 0x08(r4)
+ dcbz r11,r6
+ lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */
+ lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */
+ lfd fp12, 0x20(r4)
+ stfd fp9, 0x08(r6)
+ stfd fp10, 0x10(r6)
+ stfd fp11, 0x18(r6)
+ stfd fp12, 0x20(r6)
+ lfd fp9, 0x28(r4)
+ lfd fp10, 0x30(r4)
+ lfd fp11, 0x38(r4)
+ lfd fp12, 0x40(r4)
+ stfd fp9, 0x28(r6)
+ stfd fp10, 0x30(r6)
+ stfd fp11, 0x38(r6)
+ stfd fp12, 0x40(r6)
+ lfd fp9, 0x48(r4)
+ lfd fp10, 0x50(r4)
+ lfd fp11, 0x58(r4)
+ lfd fp12, 0x60(r4)
+ stfd fp9, 0x48(r6)
+ stfd fp10, 0x50(r6)
+ stfd fp11, 0x58(r6)
+ stfd fp12, 0x60(r6)
+ lfd fp9, 0x68(r4)
+ lfd fp10, 0x70(r4)
+ lfd fp11, 0x78(r4)
+ lfdu fp12, 0x80(r4)
+ stfd fp9, 0x68(r6)
+ stfd fp10, 0x70(r6)
+ stfd fp11, 0x78(r6)
+ stfdu fp12, 0x80(r6)
+
+ bdnz .Lloop
+
+.Lendloop:
+ cmpwi r10,0
+ slwi r10,r10,2 /* adjust from 128 to 32 byte stride */
+ beq- .Lendloop2
+ mtctr r10
+
+.Lloop2: /* Copy aligned body */
+ lfd fp9, 0x08(r4)
+ lfd fp10, 0x10(r4)
+ lfd fp11, 0x18(r4)
+ lfdu fp12, 0x20(r4)
+ stfd fp9, 0x08(r6)
+ stfd fp10, 0x10(r6)
+ stfd fp11, 0x18(r6)
+ stfdu fp12, 0x20(r6)
+
+ bdnz .Lloop2
+.Lendloop2:
+
+.Llessthancacheline: /* less than a cache line to do ? */
+ cmplwi cr0,r5,16
+ srwi r7,r5,4 /* divide size by 16 */
+ blt- .Ldo_lt16
+ mtctr r7
+
+.Lcopy_remaining:
+ lfd fp9,0x08(r4)
+ lfdu fp10,0x10(r4)
+ stfd fp9,0x08(r6)
+ stfdu fp10,0x10(r6)
+ bdnz .Lcopy_remaining
+
+.Ldo_lt16: /* less than 16 ? */
+ cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */
+ beqlr+ /* no rest to copy */
+ addi r4,r4,8 /* undo the -8 bias applied for lfdu */
+ addi r6,r6,8 /* undo the -8 bias applied for stfdu */
+
+.Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
+ mtcrf 0x01,r5 /* put low 4 bits of size into cr7 */
+ sub r7,r4,r6 /* r7 = SRC - DST, used as index for the loads below */
+ bf- cr7*4+0,8f
+ lfdx fp9,r7,r6 /* copy 8 byte */
+ stfd fp9,0(r6)
+ addi r6,r6,8
+8:
+ bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4:
+ bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2:
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+1: blr
+
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)
diff --git a/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S b/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S
new file mode 100644
index 000000000..c6ee730e4
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc64/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6 /* number of SRC cache lines prefetched ahead */
+#define ZERO_AHEAD 4 /* number of DST cache lines zeroed ahead */
+
+/* memcpy routine optimized for CELL-BE-PPC v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
+ * latency to memory is >400 clocks
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency
+ * For best performance instruction forms ending in "." like "andi."
+ * should be avoided as they are implemented in microcode on CELL.
+ * The below code is loop unrolled for the CELL cache line of 128 bytes
+ */
+
+.align 7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT 3
+
+ dcbt 0,r4 /* Prefetch ONE SRC cacheline */
+ cmpldi cr1,r5,16 /* is size < 16 ? */
+ mr r6,r3 /* r6 = working DST pointer (r3 is not written below) */
+ blt+ cr1,.Lshortcopy
+
+.Lbigcopy:
+ neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
+ clrldi r8,r8,64-4 /* align to 16byte boundary */
+ sub r7,r4,r3 /* r7 = SRC - DST, used as index for the loads below */
+ cmpldi cr0,r8,0
+ beq+ .Ldst_aligned
+
+.Ldst_unaligned:
+ mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
+ subf r5,r8,r5 /* size -= bytes copied to reach the boundary */
+
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+ addi r6,r6,1
+1: bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2: bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4: bf cr7*4+0,8f
+ ldx r0,r7,r6 /* copy 8 byte */
+ std r0,0(r6)
+ addi r6,r6,8
+8:
+ add r4,r7,r6 /* r4 = SRC pointer matching the advanced DST */
+
+.Ldst_aligned:
+
+ cmpdi cr5,r5,128-1 /* cr5: remaining size <= 127 ? (tested below) */
+
+ neg r7,r6 /* low 7 bits = # bytes to next cache-line boundary */
+ addi r6,r6,-8 /* prepare for stdu */
+ addi r4,r4,-8 /* prepare for ldu */
+
+ clrldi r7,r7,64-7 /* align to cacheline boundary */
+ ble+ cr5,.Llessthancacheline
+
+ cmpldi cr6,r7,0 /* cr6.eq: DST already on a cache-line boundary */
+ subf r5,r7,r5 /* size -= bytes needed to reach the boundary */
+ srdi r7,r7,4 /* # of 16-byte chunks up to the boundary */
+ srdi r10,r5,7 /* number of cache lines to copy */
+
+ cmpldi r10,0
+ li r11,0 /* number cachelines to copy with prefetch */
+ beq .Lnocacheprefetch
+
+ cmpldi r10,PREFETCH_AHEAD
+ li r12,128+8 /* prefetch distance */
+ ble .Llessthanmaxprefetch
+
+ subi r11,r10,PREFETCH_AHEAD /* r11 = lines beyond the initial prefetch window */
+ li r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+ mtctr r10
+
+.LprefetchSRC:
+ dcbt r12,r4
+ addi r12,r12,128
+ bdnz .LprefetchSRC
+
+.Lnocacheprefetch:
+ mtctr r7 /* ctr = 16-byte chunks needed to reach the boundary */
+ cmpldi cr1,r5,128 /* cr1: at least one whole cache line left ? */
+ clrldi r5,r5,64-7 /* r5 = size modulo 128 (tail bytes) */
+ beq cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+ ld r9,0x08(r4)
+ ldu r7,0x10(r4)
+ std r9,0x08(r6)
+ stdu r7,0x10(r6)
+ bdnz .Laligntocacheline
+
+
+.Lcachelinealigned: /* copy while cache lines */
+
+ blt- cr1,.Llessthancacheline /* size <128 */
+
+.Louterloop:
+ cmpdi r11,0
+ mtctr r11
+ beq- .Lendloop
+
+ li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
+
+.align 4
+ /* Copy whole cachelines, optimized by prefetching SRC cacheline */
+.Lloop: /* Copy aligned body */
+ dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
+ ld r9, 0x08(r4)
+ dcbz r11,r6
+ ld r7, 0x10(r4) /* 4 register stride copy is optimal */
+ ld r8, 0x18(r4) /* to hide 1st level cache latency. */
+ ld r0, 0x20(r4)
+ std r9, 0x08(r6)
+ std r7, 0x10(r6)
+ std r8, 0x18(r6)
+ std r0, 0x20(r6)
+ ld r9, 0x28(r4)
+ ld r7, 0x30(r4)
+ ld r8, 0x38(r4)
+ ld r0, 0x40(r4)
+ std r9, 0x28(r6)
+ std r7, 0x30(r6)
+ std r8, 0x38(r6)
+ std r0, 0x40(r6)
+ ld r9, 0x48(r4)
+ ld r7, 0x50(r4)
+ ld r8, 0x58(r4)
+ ld r0, 0x60(r4)
+ std r9, 0x48(r6)
+ std r7, 0x50(r6)
+ std r8, 0x58(r6)
+ std r0, 0x60(r6)
+ ld r9, 0x68(r4)
+ ld r7, 0x70(r4)
+ ld r8, 0x78(r4)
+ ldu r0, 0x80(r4)
+ std r9, 0x68(r6)
+ std r7, 0x70(r6)
+ std r8, 0x78(r6)
+ stdu r0, 0x80(r6)
+
+ bdnz .Lloop
+
+.Lendloop:
+ cmpdi r10,0
+ sldi r10,r10,2 /* adjust from 128 to 32 byte stride */
+ beq- .Lendloop2
+ mtctr r10
+
+.Lloop2: /* Copy aligned body */
+ ld r9, 0x08(r4)
+ ld r7, 0x10(r4)
+ ld r8, 0x18(r4)
+ ldu r0, 0x20(r4)
+ std r9, 0x08(r6)
+ std r7, 0x10(r6)
+ std r8, 0x18(r6)
+ stdu r0, 0x20(r6)
+
+ bdnz .Lloop2
+.Lendloop2:
+
+.Llessthancacheline: /* less than a cache line to do ? */
+ cmpldi cr0,r5,16
+ srdi r7,r5,4 /* divide size by 16 */
+ blt- .Ldo_lt16
+ mtctr r7
+
+.Lcopy_remaining:
+ ld r8,0x08(r4)
+ ldu r7,0x10(r4)
+ std r8,0x08(r6)
+ stdu r7,0x10(r6)
+ bdnz .Lcopy_remaining
+
+.Ldo_lt16: /* less than 16 ? */
+ cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
+ beqlr+ /* no rest to copy */
+ addi r4,r4,8 /* undo the -8 bias applied for ldu */
+ addi r6,r6,8 /* undo the -8 bias applied for stdu */
+
+.Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */
+ mtcrf 0x01,r5 /* put low 4 bits of size into cr7 */
+ sub r7,r4,r6 /* r7 = SRC - DST, used as index for the loads below */
+ bf- cr7*4+0,8f
+ ldx r0,r7,r6 /* copy 8 byte */
+ std r0,0(r6)
+ addi r6,r6,8
+8:
+ bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4:
+ bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2:
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+1: blr
+
+END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/libc/sysdeps/s390/s390-32/dl-machine.h b/libc/sysdeps/s390/s390-32/dl-machine.h
index 251a5f692..415b38801 100644
--- a/libc/sysdeps/s390/s390-32/dl-machine.h
+++ b/libc/sysdeps/s390/s390-32/dl-machine.h
@@ -27,6 +27,7 @@
#include <sys/param.h>
#include <string.h>
#include <link.h>
+#include <sysdeps/s390/dl-procinfo.h>
/* This is an older, now obsolete value. */
#define EM_S390_OLD 0xA390
@@ -35,6 +36,12 @@
static inline int
elf_machine_matches_host (const Elf32_Ehdr *ehdr)
{
+ /* Reject the object when its e_flags request EF_S390_HIGH_GPRS but
+ dl_hwcap does not have HWCAP_S390_HIGH_GPRS set. */
+ if ((ehdr->e_flags & EF_S390_HIGH_GPRS)
+ && !(GLRO (dl_hwcap) & HWCAP_S390_HIGH_GPRS))
+ return 0;
+
return (ehdr->e_machine == EM_S390 || ehdr->e_machine == EM_S390_OLD)
&& ehdr->e_ident[EI_CLASS] == ELFCLASS32;
}
diff --git a/libc/sysdeps/s390/s390-32/elf/start.S b/libc/sysdeps/s390/s390-32/elf/start.S
index f7290106c..066f7f0aa 100644
--- a/libc/sysdeps/s390/s390-32/elf/start.S
+++ b/libc/sysdeps/s390/s390-32/elf/start.S
@@ -59,6 +59,88 @@
.globl _start
.type _start,@function
_start:
+ /* Check if the kernel provides highgprs facility if needed by
+ the binary. */
+
+ lr %r6,%r15
+ la %r6,4(%r6) /* Skip the argument counter. */
+
+.L11: l %r5,0(%r6) /* Skip the argument vector. */
+ la %r6,4(%r6)
+ ltr %r5,%r5
+ jne .L11
+
+.L12: l %r5,0(%r6) /* Skip the environment vector. */
+ la %r6,4(%r6)
+ ltr %r5,%r5
+ jne .L12
+
+ /* Obtain the needed values from the auxiliary vector. */
+
+ lhi %r7,16 /* AT_HWCAP */
+ lhi %r8,3 /* AT_PHDR */
+ lhi %r9,5 /* AT_PHNUM */
+ lhi %r2,4 /* AT_PHENT */
+.L13: l %r5,0(%r6)
+ clr %r5,%r7
+ jne .L15
+ l %r10,4(%r6) /* r10 = AT_HWCAP value. */
+.L15: clr %r5,%r8
+ jne .L16
+ l %r11,4(%r6) /* r11 = AT_PHDR value. */
+.L16: clr %r5,%r9
+ jne .L17
+ l %r12,4(%r6) /* r12 = AT_PHNUM value. */
+.L17: clr %r5,%r2
+ jne .L18
+ l %r0,4(%r6) /* r0 = AT_PHENT value. */
+.L18: ltr %r5,%r5
+ la %r6,8(%r6)
+ jnz .L13
+
+ /* Locate the ELF header by looking for the first PT_LOAD
+ segment with a p_offset of zero. */
+
+ lr %r4,%r11 /* Backup AT_PHDR. */
+ lhi %r7,1 /* PT_LOAD id */
+ lhi %r8,0
+.L19: cl %r7,0(%r4) /* p_type == PT_LOAD? */
+ jne .L20
+ cl %r8,4(%r4) /* p_offset == 0? */
+ jne .L20
+ l %r9,8(%r4) /* r9 = p_vaddr <- ELF header address */
+ j .L24
+.L20: alr %r4,%r0 /* r4 += AT_PHENT value */
+ brct %r12,.L19
+
+ j .+2 /* Trap, there must be such a phdr. */
+
+.L24: lr %r4,%r11 /* Backup AT_PHDR. */
+ lhi %r2,6 /* PT_PHDR id */
+.L23: cl %r2,0(%r4)
+ jne .L22
+ l %r3,8(%r4) /* r3 = PT_PHDR p_vaddr */
+ j .L25
+.L22: alr %r4,%r0 /* r4 += AT_PHENT value */
+ brct %r12,.L23
+
+ ltr %r9,%r9 /* Load address == 0? */
+ jz .L14 /* No checking for PIE without PT_PHDR. */
+ j .L21
+
+.L25: clr %r3,%r11 /* PT_PHDR p_vaddr == AT_PHDR? */
+ je .L21
+ lr %r9,%r11
+ slr %r9,%r3 /* elf_header_addr = AT_PHDR - PT_PHDR.p_vaddr */
+
+.L21: l %r5,36(%r9) /* Load the e_flags field. */
+ tml %r5,1
+ jz .L14 /* Binary does not require highgprs facility. */
+
+ tml %r10,512 /* Check the AT_HWCAP value. */
+ jz 2 /* Trap if no highgprs facility available. */
+.L14:
+
/* Setup pointer to literal pool of _start */
basr %r13,0
.L0: ahi %r13,.Llit-.L0
diff --git a/libc/sysdeps/unix/bsd/bits/stat.h b/libc/sysdeps/unix/bsd/bits/stat.h
index 84a58ffbc..6eeab3b0e 100644
--- a/libc/sysdeps/unix/bsd/bits/stat.h
+++ b/libc/sysdeps/unix/bsd/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991, 92, 96, 97, 99, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1991,1992,1996-2000,2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
#include <bits/types.h>
/* Structure describing file characteristics. */
@@ -84,3 +87,5 @@ struct stat
#define __S_IREAD 0400 /* Read by owner. */
#define __S_IWRITE 0200 /* Write by owner. */
#define __S_IEXEC 0100 /* Execute by owner. */
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/bits/stat.h b/libc/sysdeps/unix/sysv/bits/stat.h
index f9a9e614d..2fb619a2e 100644
--- a/libc/sysdeps/unix/sysv/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1992, 1996, 1997, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1992, 1996, 1997, 2000, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
struct stat
{
short int st_dev;
@@ -63,3 +66,5 @@ struct stat
#define __S_IREAD 0400 /* Read by owner. */
#define __S_IWRITE 0200 /* Write by owner. */
#define __S_IEXEC 0100 /* Execute by owner. */
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/bits/sigaction.h
index 48cc5312f..62be06920 100644
--- a/libc/sysdeps/unix/sysv/linux/bits/sigaction.h
+++ b/libc/sysdeps/unix/sysv/linux/bits/sigaction.h
@@ -1,5 +1,5 @@
/* The proper definitions for Linux's sigaction.
- Copyright (C) 1993-1999, 2000 Free Software Foundation, Inc.
+ Copyright (C) 1993-1999, 2000, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -57,6 +57,8 @@ struct sigaction
three arguments instead of one. */
#if defined __USE_UNIX98 || defined __USE_MISC
# define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */
+#endif
+#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8
# define SA_RESTART 0x10000000 /* Restart syscall on signal return. */
# define SA_NODEFER 0x40000000 /* Don't automatically block the signal when
its handler is being executed. */
diff --git a/libc/sysdeps/unix/sysv/linux/bits/stat.h b/libc/sysdeps/unix/sysv/linux/bits/stat.h
index be5272333..8ac3cd472 100644
--- a/libc/sysdeps/unix/sysv/linux/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1992, 1995-2001, 2002, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1992, 1995-2002, 2009, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
/* Versions of the `struct stat' data structure. */
#define _STAT_VER_LINUX_OLD 1
#define _STAT_VER_KERNEL 1
@@ -166,3 +169,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h
index 8eaf7c368..669388954 100644
--- a/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/i386/bits/fcntl.h
@@ -1,5 +1,5 @@
/* O_*, F_*, FD_* bit values for Linux.
- Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009
+ Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -45,13 +45,15 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 040000 /* Direct disk access. */
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0200000 /* Must be a directory. */
# define O_NOFOLLOW 0400000 /* Do not follow links. */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 040000 /* Direct disk access. */
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
/* For now Linux has synchronisity options for data and read operations.
We define the symbols here but let them do the same as O_SYNC since
@@ -84,7 +86,7 @@
#define F_SETLK64 13 /* Set record locking info (non-blocking). */
#define F_SETLKW64 14 /* Set record locking info (blocking). */
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -100,6 +102,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h
index c65a11e1c..33635fd9e 100644
--- a/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/fcntl.h
@@ -1,5 +1,6 @@
/* O_*, F_*, FD_* bit values for Linux/IA64.
- Copyright (C) 1999,2000,2004,2006,2007,2009 Free Software Foundation, Inc.
+ Copyright (C) 1999, 2000, 2004, 2006, 2007, 2009, 2010
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -44,13 +45,15 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 040000
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0200000 /* must be a directory */
# define O_NOFOLLOW 0400000 /* don't follow links */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 040000
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
#ifdef __USE_LARGEFILE64
/* Not necessary, files are always with 64bit off_t. */
@@ -80,7 +83,7 @@
#define F_SETLK64 6 /* Set record locking info (non-blocking). */
#define F_SETLKW64 7 /* Set record locking info (blocking). */
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -96,6 +99,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h
index 11599d520..b557eaa3e 100644
--- a/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h
+++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/sigaction.h
@@ -1,5 +1,5 @@
/* Definitions for Linux/ia64 sigaction.
- Copyright (C) 1996, 1997, 2000, 2003 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 2000, 2003, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -53,6 +53,8 @@ struct sigaction
#define SA_SIGINFO 0x00000004
#if defined __USE_UNIX98 || defined __USE_MISC
# define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */
+#endif
+#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8
# define SA_RESTART 0x10000000 /* Restart syscall on signal return. */
# define SA_NODEFER 0x40000000 /* Don't automatically block the signal
when its handler is being executed. */
diff --git a/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h b/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h
index 75a331828..86acd27ae 100644
--- a/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/ia64/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999, 2000, 2001, 2002, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1999-2002, 2009, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
/* Versions of the `struct stat' data structure. */
#define _STAT_VER_KERNEL 0
#define _STAT_VER_LINUX 1
@@ -143,3 +146,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h
index 40fe1e50d..fea347bfc 100644
--- a/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/powerpc/bits/fcntl.h
@@ -1,5 +1,5 @@
/* O_*, F_*, FD_* bit values for Linux/PowerPC.
- Copyright (C) 1995, 1996, 1997, 1998, 2000, 2003, 2004, 2006, 2007, 2009
+ Copyright (C) 1995-1998, 2000, 2003, 2004, 2006, 2007, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -45,13 +45,15 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 0400000 /* Direct disk access. */
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 040000 /* Must be a directory. */
# define O_NOFOLLOW 0100000 /* Do not follow links. */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 0400000 /* Direct disk access. */
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
#ifdef __USE_LARGEFILE64
# define O_LARGEFILE 0200000
@@ -84,7 +86,7 @@
#define F_SETLK64 13 /* Set record locking info (non-blocking). */
#define F_SETLKW64 14 /* Set record locking info (blocking). */
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -100,6 +102,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h b/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h
index 81879ca1f..6e4a55f87 100644
--- a/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/powerpc/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1992, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2009
+/* Copyright (C) 1992, 1995-2002, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,10 +17,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
#include <bits/wordsize.h>
/* Versions of the `struct stat' data structure. */
@@ -270,3 +273,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
new file mode 100644
index 000000000..7c381f043
--- /dev/null
+++ b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
@@ -0,0 +1,3 @@
+# Make sure this comes before the powerpc/powerpc32/fpu that's
+# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
+powerpc/powerpc32/cell/fpu
diff --git a/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
new file mode 100644
index 000000000..b6720ecda
--- /dev/null
+++ b/libc/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/cell/fpu
diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h
index c9e6a4504..aeb1e0fe9 100644
--- a/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/s390/bits/fcntl.h
@@ -1,5 +1,6 @@
/* O_*, F_*, FD_* bit values for Linux.
- Copyright (C) 2000,2001,2002,2004,2006,2007,2009 Free Software Foundation, Inc.
+ Copyright (C) 2000-2002,2004,2006,2007,2009,2010
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -45,13 +46,15 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 040000 /* Direct disk access. */
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0200000 /* Must be a directory. */
# define O_NOFOLLOW 0400000 /* Do not follow links. */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 040000 /* Direct disk access. */
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
#ifdef __USE_LARGEFILE64
# if __WORDSIZE == 64
@@ -99,7 +102,7 @@
# define F_SETLKW64 14 /* Set record locking info (blocking). */
#endif
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -115,6 +118,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h
index 308cb5bd7..8767d6785 100644
--- a/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h
+++ b/libc/sysdeps/unix/sysv/linux/s390/bits/sigaction.h
@@ -1,5 +1,5 @@
/* Definitions for 31 & 64 bit S/390 sigaction.
- Copyright (C) 2001, 2002 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2002, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -90,6 +90,8 @@ struct sigaction
three arguments instead of one. */
#if defined __USE_UNIX98 || defined __USE_MISC
# define SA_ONSTACK 0x08000000 /* Use signal stack by using `sa_restorer'. */
+#endif
+#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8
# define SA_RESTART 0x10000000 /* Restart syscall on signal return. */
# define SA_NODEFER 0x40000000 /* Don't automatically block the signal when
its handler is being executed. */
diff --git a/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h b/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h
index 64be9a10f..9d62ad26f 100644
--- a/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/s390/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2000, 2001, 2002, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2000, 2001, 2002, 2009, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
#include <bits/wordsize.h>
#if __WORDSIZE == 64
@@ -259,3 +262,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c b/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c
index b28e58749..f2c151857 100644
--- a/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c
+++ b/libc/sysdeps/unix/sysv/linux/s390/s390-32/____longjmp_chk.c
@@ -46,7 +46,7 @@
{ \
if ((oss.ss_flags & SS_ONSTACK) == 0 \
|| ((uintptr_t) (oss.ss_sp + oss.ss_size) - new_sp \
- >= oss.ss_size)) \
+ < oss.ss_size)) \
__fortify_fail ("longjmp causes uninitialized stack frame");\
} \
} \
diff --git a/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c b/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c
index dcf58fb50..261be250d 100644
--- a/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c
+++ b/libc/sysdeps/unix/sysv/linux/s390/s390-64/____longjmp_chk.c
@@ -46,7 +46,7 @@
{ \
if ((oss.ss_flags & SS_ONSTACK) == 0 \
|| ((uintptr_t) (oss.ss_sp + oss.ss_size) - new_sp \
- >= oss.ss_size)) \
+ < oss.ss_size)) \
__fortify_fail ("longjmp causes uninitialized stack frame");\
} \
} \
diff --git a/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h
index 8eaf7c368..2a4123c61 100644
--- a/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/sh/bits/fcntl.h
@@ -1,5 +1,5 @@
/* O_*, F_*, FD_* bit values for Linux.
- Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009
+ Copyright (C) 1995, 1996, 1997, 1998, 2000, 2004, 2006, 2007, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -45,14 +45,17 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 040000 /* Direct disk access. */
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0200000 /* Must be a directory. */
# define O_NOFOLLOW 0400000 /* Do not follow links. */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 040000 /* Direct disk access. */
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
+
/* For now Linux has synchronisity options for data and read operations.
We define the symbols here but let them do the same as O_SYNC since
this is a superset. */
@@ -84,7 +87,7 @@
#define F_SETLK64 13 /* Set record locking info (non-blocking). */
#define F_SETLKW64 14 /* Set record locking info (blocking). */
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -100,6 +103,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h
index 03c5ba59b..1dc45b7d8 100644
--- a/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/fcntl.h
@@ -1,5 +1,5 @@
/* O_*, F_*, FD_* bit values for Linux/SPARC.
- Copyright (C) 1995, 1996, 1997, 1998, 2000, 2003, 2004, 2006, 2007, 2009
+ Copyright (C) 1995-1998, 2000, 2003, 2004, 2006, 2007, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -44,12 +44,14 @@
#define O_NDELAY (0x0004 | O_NONBLOCK)
#define O_NOCTTY 0x8000 /* not fcntl */
-#ifdef __USE_GNU
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0x10000 /* must be a directory */
# define O_NOFOLLOW 0x20000 /* don't follow links */
+# define O_CLOEXEC 0x400000 /* Set close_on_exec. */
+#endif
+#ifdef __USE_GNU
# define O_DIRECT 0x100000 /* direct disk access hint */
# define O_NOATIME 0x200000 /* Do not set atime. */
-# define O_CLOEXEC 0x400000 /* Set close_on_exit. */
#endif
#ifdef __USE_LARGEFILE64
@@ -68,21 +70,13 @@
# define O_RSYNC O_SYNC /* Synchronize read operations. */
#endif
-/* For now Linux has synchronisity options for data and read operations.
- We define the symbols here but let them do the same as O_SYNC since
- this is a superset. */
-#if defined __USE_POSIX199309 || defined __USE_UNIX98
-# define O_DSYNC O_SYNC /* Synchronize data. */
-# define O_RSYNC O_SYNC /* Synchronize read operations. */
-#endif
-
/* Values for the second argument to `fcntl'. */
#define F_DUPFD 0 /* Duplicate file descriptor. */
#define F_GETFD 1 /* Get file descriptor flags. */
#define F_SETFD 2 /* Set file descriptor flags. */
#define F_GETFL 3 /* Get file status flags. */
#define F_SETFL 4 /* Set file status flags. */
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_GETOWN 5 /* Get owner (process receiving SIGIO). */
# define F_SETOWN 6 /* Set owner (process receiving SIGIO). */
#endif
@@ -107,6 +101,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h
index ee4196764..e474dbe26 100644
--- a/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h
+++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/sigaction.h
@@ -1,5 +1,5 @@
/* The proper definitions for Linux/SPARC sigaction.
- Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+ Copyright (C) 1996-2000, 2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -58,6 +58,8 @@ struct sigaction
three arguments instead of one. */
#if defined __USE_UNIX98 || defined __USE_MISC
# define SA_ONSTACK 0x00000001 /* Use signal stack by using `sa_restorer'. */
+#endif
+#if defined __USE_UNIX98 || defined __USE_MISC || defined __USE_XOPEN2K8
# define SA_RESTART 0x00000002 /* Restart syscall on signal return. */
# define SA_INTERRUPT 0x00000010 /* Historical no-op. */
# define SA_NOMASK 0x00000020 /* Don't automatically block the signal when
diff --git a/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h b/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h
index 175fdb857..eaab95a1c 100644
--- a/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/sparc/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1992, 1995-2002, 2006, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1992,1995-2002,2006,2009,2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
/* Versions of the `struct stat' data structure. */
#define _STAT_VER_LINUX_OLD 1
#define _STAT_VER_KERNEL 1
@@ -167,3 +170,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h b/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h
index 43835081c..aa04e0e49 100644
--- a/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h
+++ b/libc/sysdeps/unix/sysv/linux/x86_64/bits/fcntl.h
@@ -1,5 +1,6 @@
/* O_*, F_*, FD_* bit values for Linux/x86-64.
- Copyright (C) 2001,2002,2004,2006,2007,2009 Free Software Foundation, Inc.
+ Copyright (C) 2001,2002,2004,2006,2007,2009,2010
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -45,13 +46,15 @@
#define O_FSYNC O_SYNC
#define O_ASYNC 020000
-#ifdef __USE_GNU
-# define O_DIRECT 040000 /* Direct disk access. */
+#ifdef __USE_XOPEN2K8
# define O_DIRECTORY 0200000 /* Must be a directory. */
# define O_NOFOLLOW 0400000 /* Do not follow links. */
-# define O_NOATIME 01000000 /* Do not set atime. */
# define O_CLOEXEC 02000000 /* Set close_on_exec. */
#endif
+#ifdef __USE_GNU
+# define O_DIRECT 040000 /* Direct disk access. */
+# define O_NOATIME 01000000 /* Do not set atime. */
+#endif
/* For now Linux has synchronisity options for data and read operations.
We define the symbols here but let them do the same as O_SYNC since
@@ -98,7 +101,7 @@
# define F_SETLKW64 14 /* Set record locking info (blocking). */
#endif
-#if defined __USE_BSD || defined __USE_UNIX98
+#if defined __USE_BSD || defined __USE_UNIX98 || defined __USE_XOPEN2K8
# define F_SETOWN 8 /* Get owner (process receiving SIGIO). */
# define F_GETOWN 9 /* Set owner (process receiving SIGIO). */
#endif
@@ -114,6 +117,8 @@
# define F_SETLEASE 1024 /* Set a lease. */
# define F_GETLEASE 1025 /* Enquire what lease is active. */
# define F_NOTIFY 1026 /* Request notfications on a directory. */
+#endif
+#ifdef __USE_XOPEN2K8
# define F_DUPFD_CLOEXEC 1030 /* Duplicate file descriptor with
close-on-exit set. */
#endif
diff --git a/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h b/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h
index 9d12315e1..c7e4e1f0d 100644
--- a/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h
+++ b/libc/sysdeps/unix/sysv/linux/x86_64/bits/stat.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999,2000,2001,2002,2003,2009 Free Software Foundation, Inc.
+/* Copyright (C) 1999-2003,2009,2010 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,10 +16,13 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#ifndef _SYS_STAT_H
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
# error "Never include <bits/stat.h> directly; use <sys/stat.h> instead."
#endif
+#ifndef _BITS_STAT_H
+#define _BITS_STAT_H 1
+
/* Versions of the `struct stat' data structure. */
#define _STAT_VER_KERNEL 0
@@ -206,3 +209,5 @@ struct stat64
# define UTIME_NOW ((1l << 30) - 1l)
# define UTIME_OMIT ((1l << 30) - 2l)
#endif
+
+#endif /* bits/stat.h */
diff --git a/libc/sysdeps/x86_64/cacheinfo.c b/libc/sysdeps/x86_64/cacheinfo.c
index 5b66c62eb..54220379e 100644
--- a/libc/sysdeps/x86_64/cacheinfo.c
+++ b/libc/sysdeps/x86_64/cacheinfo.c
@@ -74,6 +74,7 @@ static const struct intel_02_cache_info
{ 0x0a, 2, 32, M(_SC_LEVEL1_DCACHE_SIZE), 8192 },
{ 0x0c, 4, 32, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
{ 0x0d, 4, 64, M(_SC_LEVEL1_DCACHE_SIZE), 16384 },
+ { 0x0e, 6, 64, M(_SC_LEVEL1_DCACHE_SIZE), 24576 },
{ 0x21, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
{ 0x22, 4, 64, M(_SC_LEVEL3_CACHE_SIZE), 524288 },
{ 0x23, 8, 64, M(_SC_LEVEL3_CACHE_SIZE), 1048576 },
@@ -113,6 +114,7 @@ static const struct intel_02_cache_info
{ 0x7c, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
{ 0x7d, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 2097152 },
{ 0x7f, 2, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
+ { 0x80, 8, 64, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
{ 0x82, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 262144 },
{ 0x83, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 524288 },
{ 0x84, 8, 32, M(_SC_LEVEL2_CACHE_SIZE), 1048576 },
@@ -452,9 +454,10 @@ __cache_sysconf (int name)
}
-/* Half the data cache size for use in memory and string routines, typically
+/* Data cache size for use in memory and string routines, typically
L1 size. */
long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
+long int __x86_64_data_cache_size attribute_hidden = 32 * 1024;
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
@@ -657,7 +660,10 @@ init_cacheinfo (void)
}
if (data > 0)
- __x86_64_data_cache_size_half = data / 2;
+ {
+ __x86_64_data_cache_size_half = data / 2;
+ __x86_64_data_cache_size = data;
+ }
if (shared > 0)
{
diff --git a/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym b/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym
index e2021cdf8..eb1538abc 100644
--- a/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/libc/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -13,5 +13,8 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx)
CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx)
FAMILY_OFFSET offsetof (struct cpu_features, family)
MODEL_OFFSET offsetof (struct cpu_features, model)
+FEATURE_OFFSET offsetof (struct cpu_features, feature)
+FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+FEATURE_INDEX_1
diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.c b/libc/sysdeps/x86_64/multiarch/init-arch.c
index 7823aceb9..50b2a38fb 100644
--- a/libc/sysdeps/x86_64/multiarch/init-arch.c
+++ b/libc/sysdeps/x86_64/multiarch/init-arch.c
@@ -64,7 +64,23 @@ __init_cpu_features (void)
__cpu_features.model += extended_model;
}
else if (__cpu_features.family == 0x06)
- __cpu_features.model += extended_model;
+ {
+ __cpu_features.model += extended_model;
+ switch (__cpu_features.model)
+ {
+ case 0x1a:
+ case 0x1e:
+ case 0x1f:
+ case 0x25:
+ case 0x2e:
+ case 0x2f:
+ /* Rep string instructions are fast on Intel Core i3, i5
+ and i7. */
+ __cpu_features.feature[index_Fast_Rep_String]
+ |= bit_Fast_Rep_String;
+ break;
+ }
+ }
}
/* This spells out "AuthenticAMD". */
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/libc/sysdeps/x86_64/multiarch/init-arch.h b/libc/sysdeps/x86_64/multiarch/init-arch.h
index 0f8f77a8a..69492cb3b 100644
--- a/libc/sysdeps/x86_64/multiarch/init-arch.h
+++ b/libc/sysdeps/x86_64/multiarch/init-arch.h
@@ -16,6 +16,8 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
+#define bit_Fast_Rep_String (1 << 0)
+
#ifdef __ASSEMBLER__
#include <ifunc-defines.h>
@@ -28,6 +30,8 @@
#define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
#define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+
#else /* __ASSEMBLER__ */
#include <sys/param.h>
@@ -39,6 +43,13 @@ enum
COMMON_CPUID_INDEX_MAX
};
+enum
+ {
+ FEATURE_INDEX_1 = 0,
+ /* Keep the following line at the end. */
+ FEATURE_INDEX_MAX
+ };
+
extern struct cpu_features
{
enum
@@ -58,6 +69,7 @@ extern struct cpu_features
} cpuid[COMMON_CPUID_INDEX_MAX];
unsigned int family;
unsigned int model;
+ unsigned int feature[FEATURE_INDEX_MAX];
} __cpu_features attribute_hidden;
@@ -86,4 +98,6 @@ extern const struct cpu_features *__get_cpu_features (void)
#define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
#define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
+#define index_Fast_Rep_String FEATURE_INDEX_1
+
#endif /* __ASSEMBLER__ */
diff --git a/libc/sysdeps/x86_64/multiarch/strlen.S b/libc/sysdeps/x86_64/multiarch/strlen.S
index 509f9c960..f9641131f 100644
--- a/libc/sysdeps/x86_64/multiarch/strlen.S
+++ b/libc/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
__strlen_sse42:
cfi_startproc
CALL_MCOUNT
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
movq %rdi, %r8
andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %esi
- subq %rdi, %rcx
- shll %cl, %esi
- pmovmskb %xmm2, %edx
- andl %esi, %edx
- jnz 1f
-
-2: pcmpistri $0x08, 16(%rdi), %xmm1
- leaq 16(%rdi), %rdi
- jnz 2b
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
leaq (%rdi,%rcx), %rax
subq %r8, %rax
ret
-1: subq %r8, %rdi
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
bsfl %edx, %eax
addq %rdi, %rax
ret
diff --git a/libc/sysdeps/x86_64/strcmp.S b/libc/sysdeps/x86_64/strcmp.S
index 650ec173b..ac3fe1467 100644
--- a/libc/sysdeps/x86_64/strcmp.S
+++ b/libc/sysdeps/x86_64/strcmp.S
@@ -1,5 +1,5 @@
/* Highly optimized version for x86-64.
- Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009
+ Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009, 2010
Free Software Foundation, Inc.
This file is part of the GNU C Library.
Based on i686 version contributed by Ulrich Drepper
@@ -33,6 +33,13 @@
#endif
#ifdef USE_AS_STRNCMP
+/* The simplified code below is not set up to handle strncmp() so far.
+ Should this become necessary it has to be implemented. For now
+ just report the problem. */
+# ifdef NOT_IN_lib
+# error "strncmp not implemented so far"
+# endif
+
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
if the new counter > the old one or is 0. */
# define UPDATE_STRNCMP_COUNTER \
@@ -54,7 +61,7 @@
#ifndef USE_SSSE3
.text
#else
- .section .text.ssse3,"ax",@progbits
+ .section .text.ssse3,"ax",@progbits
#endif
ENTRY (BP_SYM (STRCMP))
@@ -80,13 +87,13 @@ END (BP_SYM (STRCMP))
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
test %rdx, %rdx
je LABEL(strcmp_exitz)
cmp $1, %rdx
je LABEL(Byte0)
mov %rdx, %r11
-#endif
+# endif
mov %esi, %ecx
mov %edi, %eax
/* Use 64bit AND here to avoid long NOP padding. */
@@ -107,10 +114,10 @@ END (BP_SYM (STRCMP))
pmovmskb %xmm1, %edx
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
jnz LABEL(less16bytes) /* If not, find different value or null char */
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz) /* finish comparision */
-#endif
+# endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
@@ -184,10 +191,10 @@ LABEL(loop_ashr_0):
sub $0xffff, %edx
jnz LABEL(exit) /* mismatch or null char seen */
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa (%rsi, %rcx), %xmm1
movdqa (%rdi, %rcx), %xmm2
@@ -198,10 +205,10 @@ LABEL(loop_ashr_0):
pmovmskb %xmm1, %edx
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
jmp LABEL(loop_ashr_0)
@@ -249,13 +256,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -264,10 +271,10 @@ LABEL(gobble_ashr_1):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -278,13 +285,13 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -293,10 +300,10 @@ LABEL(gobble_ashr_1):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
jmp LABEL(loop_ashr_1)
@@ -312,10 +319,10 @@ LABEL(nibble_ashr_1):
test $0xfffe, %edx
jnz LABEL(ashr_1_exittail) /* find null char*/
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $14, %r11
jbe LABEL(ashr_1_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10 /* substract 4K from %r10 */
@@ -334,7 +341,7 @@ LABEL(ashr_1_exittail):
/*
* The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(14~15) n -14 1(15 +(n-14) - n) ashr_2
*/
.p2align 4
@@ -376,13 +383,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -391,10 +398,10 @@ LABEL(gobble_ashr_2):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -406,13 +413,13 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -421,10 +428,10 @@ LABEL(gobble_ashr_2):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -437,10 +444,10 @@ LABEL(nibble_ashr_2):
test $0xfffc, %edx
jnz LABEL(ashr_2_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $13, %r11
jbe LABEL(ashr_2_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -498,13 +505,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -513,10 +520,10 @@ LABEL(gobble_ashr_3):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -528,13 +535,13 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -543,10 +550,10 @@ LABEL(gobble_ashr_3):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -559,10 +566,10 @@ LABEL(nibble_ashr_3):
test $0xfff8, %edx
jnz LABEL(ashr_3_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $12, %r11
jbe LABEL(ashr_3_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -620,13 +627,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -635,10 +642,10 @@ LABEL(gobble_ashr_4):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -650,13 +657,13 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -665,10 +672,10 @@ LABEL(gobble_ashr_4):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -681,10 +688,10 @@ LABEL(nibble_ashr_4):
test $0xfff0, %edx
jnz LABEL(ashr_4_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $11, %r11
jbe LABEL(ashr_4_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -700,7 +707,7 @@ LABEL(ashr_4_exittail):
/*
* The following cases will be handled by ashr_5
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
*/
.p2align 4
LABEL(ashr_5):
@@ -742,13 +749,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -757,10 +764,10 @@ LABEL(gobble_ashr_5):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -772,13 +779,13 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -787,10 +794,10 @@ LABEL(gobble_ashr_5):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -803,10 +810,10 @@ LABEL(nibble_ashr_5):
test $0xffe0, %edx
jnz LABEL(ashr_5_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $10, %r11
jbe LABEL(ashr_5_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -822,7 +829,7 @@ LABEL(ashr_5_exittail):
/*
* The following cases will be handled by ashr_6
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
*/
.p2align 4
LABEL(ashr_6):
@@ -864,13 +871,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -879,10 +886,10 @@ LABEL(gobble_ashr_6):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -894,13 +901,13 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -909,10 +916,10 @@ LABEL(gobble_ashr_6):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -925,10 +932,10 @@ LABEL(nibble_ashr_6):
test $0xffc0, %edx
jnz LABEL(ashr_6_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $9, %r11
jbe LABEL(ashr_6_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -944,7 +951,7 @@ LABEL(ashr_6_exittail):
/*
* The following cases will be handled by ashr_7
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
*/
.p2align 4
LABEL(ashr_7):
@@ -986,13 +993,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1001,10 +1008,10 @@ LABEL(gobble_ashr_7):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1016,13 +1023,13 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1031,10 +1038,10 @@ LABEL(gobble_ashr_7):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1047,10 +1054,10 @@ LABEL(nibble_ashr_7):
test $0xff80, %edx
jnz LABEL(ashr_7_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $8, %r11
jbe LABEL(ashr_7_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1066,7 +1073,7 @@ LABEL(ashr_7_exittail):
/*
* The following cases will be handled by ashr_8
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
*/
.p2align 4
LABEL(ashr_8):
@@ -1108,13 +1115,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1123,10 +1130,10 @@ LABEL(gobble_ashr_8):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1138,13 +1145,13 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1153,10 +1160,10 @@ LABEL(gobble_ashr_8):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1169,10 +1176,10 @@ LABEL(nibble_ashr_8):
test $0xff00, %edx
jnz LABEL(ashr_8_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $7, %r11
jbe LABEL(ashr_8_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1188,7 +1195,7 @@ LABEL(ashr_8_exittail):
/*
* The following cases will be handled by ashr_9
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
*/
.p2align 4
LABEL(ashr_9):
@@ -1230,13 +1237,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1245,10 +1252,10 @@ LABEL(gobble_ashr_9):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1260,13 +1267,13 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1275,10 +1282,10 @@ LABEL(gobble_ashr_9):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3 /* store for next cycle */
@@ -1291,10 +1298,10 @@ LABEL(nibble_ashr_9):
test $0xfe00, %edx
jnz LABEL(ashr_9_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $6, %r11
jbe LABEL(ashr_9_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1310,7 +1317,7 @@ LABEL(ashr_9_exittail):
/*
* The following cases will be handled by ashr_10
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
*/
.p2align 4
LABEL(ashr_10):
@@ -1352,13 +1359,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1367,10 +1374,10 @@ LABEL(gobble_ashr_10):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1382,13 +1389,13 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1397,10 +1404,10 @@ LABEL(gobble_ashr_10):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1413,10 +1420,10 @@ LABEL(nibble_ashr_10):
test $0xfc00, %edx
jnz LABEL(ashr_10_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $5, %r11
jbe LABEL(ashr_10_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1432,7 +1439,7 @@ LABEL(ashr_10_exittail):
/*
* The following cases will be handled by ashr_11
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
*/
.p2align 4
LABEL(ashr_11):
@@ -1474,13 +1481,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1489,10 +1496,10 @@ LABEL(gobble_ashr_11):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1504,13 +1511,13 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1519,10 +1526,10 @@ LABEL(gobble_ashr_11):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1535,10 +1542,10 @@ LABEL(nibble_ashr_11):
test $0xf800, %edx
jnz LABEL(ashr_11_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $4, %r11
jbe LABEL(ashr_11_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1554,7 +1561,7 @@ LABEL(ashr_11_exittail):
/*
* The following cases will be handled by ashr_12
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
*/
.p2align 4
LABEL(ashr_12):
@@ -1596,13 +1603,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1611,10 +1618,10 @@ LABEL(gobble_ashr_12):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1626,13 +1633,13 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1641,10 +1648,10 @@ LABEL(gobble_ashr_12):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1657,10 +1664,10 @@ LABEL(nibble_ashr_12):
test $0xf000, %edx
jnz LABEL(ashr_12_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $3, %r11
jbe LABEL(ashr_12_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1676,7 +1683,7 @@ LABEL(ashr_12_exittail):
/*
* The following cases will be handled by ashr_13
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
*/
.p2align 4
LABEL(ashr_13):
@@ -1718,13 +1725,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1733,10 +1740,10 @@ LABEL(gobble_ashr_13):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1748,13 +1755,13 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1763,10 +1770,10 @@ LABEL(gobble_ashr_13):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1779,10 +1786,10 @@ LABEL(nibble_ashr_13):
test $0xe000, %edx
jnz LABEL(ashr_13_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $2, %r11
jbe LABEL(ashr_13_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1798,7 +1805,7 @@ LABEL(ashr_13_exittail):
/*
* The following cases will be handled by ashr_14
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
*/
.p2align 4
LABEL(ashr_14):
@@ -1840,13 +1847,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1855,10 +1862,10 @@ LABEL(gobble_ashr_14):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1870,13 +1877,13 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1885,10 +1892,10 @@ LABEL(gobble_ashr_14):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1901,10 +1908,10 @@ LABEL(nibble_ashr_14):
test $0xc000, %edx
jnz LABEL(ashr_14_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
cmp $1, %r11
jbe LABEL(ashr_14_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -1920,7 +1927,7 @@ LABEL(ashr_14_exittail):
/*
* The following cases will be handled by ashr_15
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
*/
.p2align 4
LABEL(ashr_15):
@@ -1964,13 +1971,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -1979,10 +1986,10 @@ LABEL(gobble_ashr_15):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -1994,13 +2001,13 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
+# ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
+# else
palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+# endif
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm1
@@ -2009,10 +2016,10 @@ LABEL(gobble_ashr_15):
sub $0xffff, %edx
jnz LABEL(exit)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub $16, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
add $16, %rcx
movdqa %xmm4, %xmm3
@@ -2025,10 +2032,10 @@ LABEL(nibble_ashr_15):
test $0x8000, %edx
jnz LABEL(ashr_15_exittail)
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
test %r11, %r11
je LABEL(ashr_15_exittail)
-#endif
+# endif
pxor %xmm0, %xmm0
sub $0x1000, %r10
@@ -2062,10 +2069,10 @@ LABEL(ret):
LABEL(less16bytes):
bsf %rdx, %rdx /* find and store bit index in %rdx */
-#ifdef USE_AS_STRNCMP
+# ifdef USE_AS_STRNCMP
sub %rdx, %r11
jbe LABEL(strcmp_exitz)
-#endif
+# endif
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax