diff options
author | Harsha Jagasia <harsha.jagasia@amd.com> | 2011-03-04 23:30:08 -0500 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-03-04 23:30:08 -0500 |
commit | 7e4ba49cd365555ddaff2ae8bba7b912464ad6e5 (patch) | |
tree | ec4eaf0ea436e74b584daefdceeb4ab66c52728d | |
parent | 13a804de8f3091e8ccd9b650f61becd6e1304227 (diff) | |
download | glibc-7e4ba49cd365555ddaff2ae8bba7b912464ad6e5.tar.gz |
Enable SSE2 memset for AMD's upcoming Orochi processor.
This patch enables SSE2 memset for AMD's upcoming Orochi processor.
This patch also fixes the following bug:
For misaligned blocks larger than 144 bytes, memset branches into
the integer code path depending on the value of misalignment even if
the startup code chooses the SSE2 code path upfront, when multiarch
is enabled.
-rw-r--r-- | ChangeLog | 16 | ||||
-rw-r--r-- | sysdeps/x86_64/cacheinfo.c | 49 | ||||
-rw-r--r-- | sysdeps/x86_64/memset.S | 68 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.c | 12 |
4 files changed, 94 insertions, 51 deletions
@@ -1,3 +1,17 @@ +2011-03-02 Harsha Jagasia <harsha.jagasia@amd.com> + Ulrich Drepper <drepper@gmail.com> + + * sysdeps/x86_64/memset.S: After aligning destination, code + branches to different locations depending on the value of + misalignment, when multiarch is enabled. Fix this. + +2011-03-02 Harsha Jagasia <harsha.jagasia@amd.com> + + * sysdeps/x86_64/cacheinfo.c (init_cacheinfo): + Set _x86_64_preferred_memory_instruction for AMD processsors. + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): + Set bit_Prefer_SSE_for_memop for AMD processors. + 2011-03-04 Ulrich Drepper <drepper@gmail.com> * libio/fmemopen.c (fmemopen): Optimize a bit. @@ -12,7 +26,7 @@ 2011-02-28 Aurelien Jarno <aurelien@aurel32.net> - * sysdeps/sparc/sparc64/multiarch/memset.S(__bzero): call + * sysdeps/sparc/sparc64/multiarch/memset.S(__bzero): Call __bzero_ultra1 instead of __memset_ultra1. 2011-02-23 Andreas Schwab <schwab@redhat.com> diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index eae54e725a..337444df07 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -1,5 +1,5 @@ /* x86_64 cache info. - Copyright (C) 2003, 2004, 2006, 2007, 2009 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2006,2007,2009,2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -352,11 +352,11 @@ handle_amd (int name) case _SC_LEVEL2_CACHE_ASSOC: switch ((ecx >> 12) & 0xf) - { - case 0: - case 1: - case 2: - case 4: + { + case 0: + case 1: + case 2: + case 4: return (ecx >> 12) & 0xf; case 6: return 8; @@ -376,7 +376,7 @@ handle_amd (int name) return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff); default: return 0; - } + } /* NOTREACHED */ case _SC_LEVEL2_CACHE_LINESIZE: @@ -521,10 +521,10 @@ init_cacheinfo (void) shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid); if (shared <= 0) - { + { /* Try L2 otherwise. 
*/ - level = 2; - shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); + level = 2; + shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); } unsigned int ebx_1; @@ -540,7 +540,7 @@ init_cacheinfo (void) #ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION /* Intel prefers SSSE3 instructions for memory/string routines - if they are avaiable. */ + if they are available. */ if ((ecx & 0x200)) __x86_64_preferred_memory_instruction = 3; else @@ -550,7 +550,7 @@ init_cacheinfo (void) /* Figure out the number of logical threads that share the highest cache level. */ if (max_cpuid >= 4) - { + { int i = 0; /* Query until desired cache level is enumerated. */ @@ -565,7 +565,7 @@ init_cacheinfo (void) if ((eax & 0x1f) == 0) goto intel_bug_no_cache_info; } - while (((eax >> 5) & 0x7) != level); + while (((eax >> 5) & 0x7) != level); threads = (eax >> 14) & 0x3ff; @@ -602,7 +602,7 @@ init_cacheinfo (void) threads += 1; } else - { + { intel_bug_no_cache_info: /* Assume that all logical threads share the highest cache level. */ @@ -612,7 +612,7 @@ init_cacheinfo (void) /* Cap usage of highest cache level to the number of supported threads. */ if (shared > 0 && threads > 0) - shared /= threads; + shared /= threads; } /* This spells out "AuthenticAMD". */ else if (is_amd) @@ -621,6 +621,25 @@ init_cacheinfo (void) long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); +#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION +# ifdef USE_MULTIARCH + eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax; + ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx; + ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; + edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx; +# else + __cpuid (1, eax, ebx, ecx, edx); +# endif + + /* AMD prefers SSSE3 instructions for memory/string routines + if they are avaiable, otherwise it prefers integer + instructions. 
*/ + if ((ecx & 0x200)) + __x86_64_preferred_memory_instruction = 3; + else + __x86_64_preferred_memory_instruction = 0; +#endif + /* Get maximum extended function. */ __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index f6eb71fc7e..d43c7f68b1 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -1,6 +1,6 @@ /* memset/bzero -- set memory area to CH/0 Optimized version for x86-64. - Copyright (C) 2002-2005, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 2002-2005, 2007, 2008, 2011 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,7 +23,7 @@ #define __STOS_LOWER_BOUNDARY $8192 #define __STOS_UPPER_BOUNDARY $65536 - .text + .text #if !defined NOT_IN_libc && !defined USE_MULTIARCH ENTRY(__bzero) mov %rsi,%rdx /* Adjust parameter. */ @@ -417,7 +417,7 @@ L(P4Q0): mov %edx,-0x4(%rdi) retq .balign 16 -#if defined(USE_EXTRA_TABLE) +#ifdef USE_EXTRA_TABLE L(P5QI): mov %rdx,-0x95(%rdi) #endif L(P5QH): mov %rdx,-0x8d(%rdi) @@ -596,6 +596,8 @@ L(A6Q0): mov %dx,-0x6(%rdi) jmp L(aligned_now) L(SSE_pre): +#else +L(aligned_now): #endif #if !defined USE_MULTIARCH || defined USE_SSE2 # fill RegXMM0 with the pattern @@ -606,16 +608,16 @@ L(SSE_pre): jge L(byte32sse2_pre) add %r8,%rdi -#ifndef PIC +# ifndef PIC lea L(SSExDx)(%rip),%r9 jmpq *(%r9,%r8,8) -#else +# else lea L(SSE0Q0)(%rip),%r9 lea L(SSExDx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx lea (%rcx,%r9,1),%r9 jmpq *%r9 -#endif +# endif L(SSE0QB): movdqa %xmm0,-0xb0(%rdi) L(SSE0QA): movdqa %xmm0,-0xa0(%rdi) @@ -881,16 +883,16 @@ L(byte32sse2): lea 0x80(%rdi),%rdi jge L(byte32sse2) add %r8,%rdi -#ifndef PIC +# ifndef PIC lea L(SSExDx)(%rip),%r11 jmpq *(%r11,%r8,8) -#else +# else lea L(SSE0Q0)(%rip),%r11 lea L(SSExDx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx lea (%rcx,%r11,1),%r11 jmpq *%r11 -#endif +# endif .balign 16 L(sse2_nt_move_pre): @@ -916,20 
+918,20 @@ L(sse2_nt_move): jge L(sse2_nt_move) sfence add %r8,%rdi -#ifndef PIC +# ifndef PIC lea L(SSExDx)(%rip),%r11 jmpq *(%r11,%r8,8) -#else +# else lea L(SSE0Q0)(%rip),%r11 lea L(SSExDx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx lea (%rcx,%r11,1),%r11 jmpq *%r11 -#endif +# endif .pushsection .rodata .balign 16 -#ifndef PIC +# ifndef PIC L(SSExDx): .quad L(SSE0Q0), L(SSE1Q0), L(SSE2Q0), L(SSE3Q0) .quad L(SSE4Q0), L(SSE5Q0), L(SSE6Q0), L(SSE7Q0) @@ -979,7 +981,7 @@ L(SSExDx): .quad L(SSE4QB), L(SSE5QB), L(SSE6QB), L(SSE7QB) .quad L(SSE8QB), L(SSE9QB), L(SSE10QB), L(SSE11QB) .quad L(SSE12QB), L(SSE13QB), L(SSE14QB), L(SSE15QB) -#else +# else L(SSExDx): .short L(SSE0Q0) -L(SSE0Q0) .short L(SSE1Q0) -L(SSE0Q0) @@ -1196,14 +1198,14 @@ L(SSExDx): .short L(SSE13QB)-L(SSE0Q0) .short L(SSE14QB)-L(SSE0Q0) .short L(SSE15QB)-L(SSE0Q0) -#endif +# endif .popsection #endif /* !defined USE_MULTIARCH || defined USE_SSE2 */ .balign 16 +#ifndef USE_MULTIARCH L(aligned_now): -#ifndef USE_MULTIARCH cmpl $0x1,__x86_64_preferred_memory_instruction(%rip) jg L(SSE_pre) #endif /* USE_MULTIARCH */ @@ -1246,17 +1248,17 @@ L(8byte_move_loop): L(8byte_move_skip): andl $127,%r8d - lea (%rdi,%r8,1),%rdi + lea (%rdi,%r8,1),%rdi #ifndef PIC - lea L(setPxQx)(%rip),%r11 - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC + lea L(setPxQx)(%rip),%r11 + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC #else - lea L(Got0)(%rip),%r11 + lea L(Got0)(%rip),%r11 lea L(setPxQx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx - lea (%rcx,%r11,1),%r11 - jmpq *%r11 + lea (%rcx,%r11,1),%r11 + jmpq *%r11 #endif .balign 16 @@ -1290,16 +1292,16 @@ L(8byte_stos_skip): ja L(8byte_nt_move) andl $7,%r8d - lea (%rdi,%r8,1),%rdi + lea (%rdi,%r8,1),%rdi #ifndef PIC - lea L(setPxQx)(%rip),%r11 - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC + lea L(setPxQx)(%rip),%r11 + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC #else - lea L(Got0)(%rip),%r11 + lea L(Got0)(%rip),%r11 lea L(setPxQx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx 
lea (%rcx,%r11,1),%r11 - jmpq *%r11 + jmpq *%r11 #endif .balign 16 @@ -1338,16 +1340,16 @@ L(8byte_nt_move_loop): L(8byte_nt_move_skip): andl $127,%r8d - lea (%rdi,%r8,1),%rdi + lea (%rdi,%r8,1),%rdi #ifndef PIC - lea L(setPxQx)(%rip),%r11 - jmpq *(%r11,%r8,8) # old scheme remained for nonPIC + lea L(setPxQx)(%rip),%r11 + jmpq *(%r11,%r8,8) # old scheme remained for nonPIC #else - lea L(Got0)(%rip),%r11 + lea L(Got0)(%rip),%r11 lea L(setPxQx)(%rip),%rcx movswq (%rcx,%r8,2),%rcx - lea (%rcx,%r11,1),%r11 - jmpq *%r11 + lea (%rcx,%r11,1),%r11 + jmpq *%r11 #endif END (memset) diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index f0d2bb7d14..34ec2df2d5 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -1,6 +1,6 @@ /* Initialize CPU feature data. This file is part of the GNU C Library. - Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2008, 2009, 2010, 2011 Free Software Foundation, Inc. Contributed by Ulrich Drepper <drepper@redhat.com>. The GNU C Library is free software; you can redistribute it and/or @@ -60,7 +60,7 @@ __init_cpu_features (void) get_common_indeces (&family, &model); /* Intel processors prefer SSE instruction for memory/string - routines if they are avaiable. */ + routines if they are available. */ __cpu_features.feature[index_Prefer_SSE_for_memop] |= bit_Prefer_SSE_for_memop; @@ -107,6 +107,14 @@ __init_cpu_features (void) kind = arch_kind_amd; get_common_indeces (&family, &model); + + unsigned int ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx; + + /* AMD processors prefer SSE instructions for memory/string routines + if they are available, otherwise they prefer integer instructions. */ + if ((ecx & 0x200)) + __cpu_features.feature[index_Prefer_SSE_for_memop] + |= bit_Prefer_SSE_for_memop; } else kind = arch_kind_other; |