summaryrefslogtreecommitdiff
path: root/lib/freebl/mpi/mpcpucache.c
diff options
context:
space:
mode:
authorKai Engert <kaie@kuix.de>2013-02-28 12:44:50 +0100
committerKai Engert <kaie@kuix.de>2013-02-28 12:44:50 +0100
commit3ecd967b2a9e23403935e2bc932597f7e03e7f24 (patch)
tree4b0f054f0354c2dbe401f86d864c04c6034c1621 /lib/freebl/mpi/mpcpucache.c
parentf45b9ca74a609e0521d0cc4b7fc91603774992df (diff)
downloadnss-hg-3ecd967b2a9e23403935e2bc932597f7e03e7f24.tar.gz
Bug 845556, reorganize NSS directory layout, moving files, very large changeset! r=wtc
Diffstat (limited to 'lib/freebl/mpi/mpcpucache.c')
-rw-r--r--lib/freebl/mpi/mpcpucache.c813
1 files changed, 813 insertions, 0 deletions
diff --git a/lib/freebl/mpi/mpcpucache.c b/lib/freebl/mpi/mpcpucache.c
new file mode 100644
index 000000000..9a4a9d30c
--- /dev/null
+++ b/lib/freebl/mpi/mpcpucache.c
@@ -0,0 +1,813 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "mpi.h"
+
+/*
+ * This file implements a single function: s_mpi_getProcessorLineSize();
+ * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line
+ * if a cache exists, or zero if there is no cache. If more than one
+ * cache line exists, it should return the smallest line size (which is
+ * usually the L1 cache).
+ *
+ * mp_modexp uses this information to make sure that private key information
+ * isn't being leaked through the cache.
+ *
+ * Currently the file returns good data for most modern x86 processors, and
+ * reasonable data on 64-bit ppc processors. All other processors are assumed
+ * to have a cache line size of 32 bytes unless modified by target.mk.
+ *
+ */
+
+#if defined(i386) || defined(__i386) || defined(__X86__) || defined (_M_IX86) || defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
+/* X86 processors have special instructions that tell us about the cache */
+#include "string.h"
+
+#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
+#define AMD_64 1
+#endif
+
+/* Generic CPUID function */
+#if defined(AMD_64)
+
+#if defined(__GNUC__)
+
+void freebl_cpuid(unsigned long op, unsigned long *eax,
+ unsigned long *ebx, unsigned long *ecx,
+ unsigned long *edx)
+{
+ __asm__("cpuid\n\t"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (op));
+}
+
+#elif defined(_MSC_VER)
+
+#include <intrin.h>
+
+void freebl_cpuid(unsigned long op, unsigned long *eax,
+ unsigned long *ebx, unsigned long *ecx,
+ unsigned long *edx)
+{
+ int intrinsic_out[4];
+
+ __cpuid(intrinsic_out, op);
+ *eax = intrinsic_out[0];
+ *ebx = intrinsic_out[1];
+ *ecx = intrinsic_out[2];
+ *edx = intrinsic_out[3];
+}
+
+#endif
+
+#else /* !defined(AMD_64) */
+
+/* x86 */
+
+#if defined(__GNUC__)
+void freebl_cpuid(unsigned long op, unsigned long *eax,
+ unsigned long *ebx, unsigned long *ecx,
+ unsigned long *edx)
+{
+/* sigh GCC isn't smart enough to save the ebx PIC register on it's own
+ * in this case, so do it by hand. Use edi to store ebx and pass the
+ * value returned in ebx from cpuid through edi. */
+ __asm__("mov %%ebx,%%edi\n\t"
+ "cpuid\n\t"
+ "xchgl %%ebx,%%edi\n\t"
+ : "=a" (*eax),
+ "=D" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (op));
+}
+
+/*
+ * try flipping a processor flag to determine CPU type
+ */
+static unsigned long changeFlag(unsigned long flag)
+{
+ unsigned long changedFlags, originalFlags;
+ __asm__("pushfl\n\t" /* get the flags */
+ "popl %0\n\t"
+ "movl %0,%1\n\t" /* save the original flags */
+ "xorl %2,%0\n\t" /* flip the bit */
+ "pushl %0\n\t" /* set the flags */
+ "popfl\n\t"
+ "pushfl\n\t" /* get the flags again (for return) */
+ "popl %0\n\t"
+ "pushl %1\n\t" /* restore the original flags */
+ "popfl\n\t"
+ : "=r" (changedFlags),
+ "=r" (originalFlags),
+ "=r" (flag)
+ : "2" (flag));
+ return changedFlags ^ originalFlags;
+}
+
+#elif defined(_MSC_VER)
+
+/*
+ * windows versions of the above assembler
+ */
+#define wcpuid __asm __emit 0fh __asm __emit 0a2h
+void freebl_cpuid(unsigned long op, unsigned long *Reax,
+ unsigned long *Rebx, unsigned long *Recx, unsigned long *Redx)
+{
+ unsigned long Leax, Lebx, Lecx, Ledx;
+ __asm {
+ pushad
+ mov eax,op
+ wcpuid
+ mov Leax,eax
+ mov Lebx,ebx
+ mov Lecx,ecx
+ mov Ledx,edx
+ popad
+ }
+ *Reax = Leax;
+ *Rebx = Lebx;
+ *Recx = Lecx;
+ *Redx = Ledx;
+}
+
+static unsigned long changeFlag(unsigned long flag)
+{
+ unsigned long changedFlags, originalFlags;
+ __asm {
+ push eax
+ push ebx
+ pushfd /* get the flags */
+ pop eax
+ push eax /* save the flags on the stack */
+ mov originalFlags,eax /* save the original flags */
+ mov ebx,flag
+ xor eax,ebx /* flip the bit */
+ push eax /* set the flags */
+ popfd
+ pushfd /* get the flags again (for return) */
+ pop eax
+ popfd /* restore the original flags */
+ mov changedFlags,eax
+ pop ebx
+ pop eax
+ }
+ return changedFlags ^ originalFlags;
+}
+#endif
+
+#endif
+
+#if !defined(AMD_64)
+#define AC_FLAG 0x40000
+#define ID_FLAG 0x200000
+
+/* 386 processors can't flip the AC_FLAG, intel AP Note AP-485 */
+static int is386()
+{
+ return changeFlag(AC_FLAG) == 0;
+}
+
+/* 486 processors can't flip the ID_FLAG, intel AP Note AP-485 */
+static int is486()
+{
+ return changeFlag(ID_FLAG) == 0;
+}
+#endif
+
+
+/*
+ * table for Intel Cache.
+ * See Intel Application Note AP-485 for more information
+ */
+
+typedef unsigned char CacheTypeEntry;
+
+typedef enum {
+ Cache_NONE = 0,
+ Cache_UNKNOWN = 1,
+ Cache_TLB = 2,
+ Cache_TLBi = 3,
+ Cache_TLBd = 4,
+ Cache_Trace = 5,
+ Cache_L1 = 6,
+ Cache_L1i = 7,
+ Cache_L1d = 8,
+ Cache_L2 = 9 ,
+ Cache_L2i = 10 ,
+ Cache_L2d = 11 ,
+ Cache_L3 = 12 ,
+ Cache_L3i = 13,
+ Cache_L3d = 14
+} CacheType;
+
+struct _cache {
+ CacheTypeEntry type;
+ unsigned char lineSize;
+};
+static const struct _cache CacheMap[256] = {
+/* 00 */ {Cache_NONE, 0 },
+/* 01 */ {Cache_TLBi, 0 },
+/* 02 */ {Cache_TLBi, 0 },
+/* 03 */ {Cache_TLBd, 0 },
+/* 04 */ {Cache_TLBd, },
+/* 05 */ {Cache_UNKNOWN, 0 },
+/* 06 */ {Cache_L1i, 32 },
+/* 07 */ {Cache_UNKNOWN, 0 },
+/* 08 */ {Cache_L1i, 32 },
+/* 09 */ {Cache_UNKNOWN, 0 },
+/* 0a */ {Cache_L1d, 32 },
+/* 0b */ {Cache_UNKNOWN, 0 },
+/* 0c */ {Cache_L1d, 32 },
+/* 0d */ {Cache_UNKNOWN, 0 },
+/* 0e */ {Cache_UNKNOWN, 0 },
+/* 0f */ {Cache_UNKNOWN, 0 },
+/* 10 */ {Cache_UNKNOWN, 0 },
+/* 11 */ {Cache_UNKNOWN, 0 },
+/* 12 */ {Cache_UNKNOWN, 0 },
+/* 13 */ {Cache_UNKNOWN, 0 },
+/* 14 */ {Cache_UNKNOWN, 0 },
+/* 15 */ {Cache_UNKNOWN, 0 },
+/* 16 */ {Cache_UNKNOWN, 0 },
+/* 17 */ {Cache_UNKNOWN, 0 },
+/* 18 */ {Cache_UNKNOWN, 0 },
+/* 19 */ {Cache_UNKNOWN, 0 },
+/* 1a */ {Cache_UNKNOWN, 0 },
+/* 1b */ {Cache_UNKNOWN, 0 },
+/* 1c */ {Cache_UNKNOWN, 0 },
+/* 1d */ {Cache_UNKNOWN, 0 },
+/* 1e */ {Cache_UNKNOWN, 0 },
+/* 1f */ {Cache_UNKNOWN, 0 },
+/* 20 */ {Cache_UNKNOWN, 0 },
+/* 21 */ {Cache_UNKNOWN, 0 },
+/* 22 */ {Cache_L3, 64 },
+/* 23 */ {Cache_L3, 64 },
+/* 24 */ {Cache_UNKNOWN, 0 },
+/* 25 */ {Cache_L3, 64 },
+/* 26 */ {Cache_UNKNOWN, 0 },
+/* 27 */ {Cache_UNKNOWN, 0 },
+/* 28 */ {Cache_UNKNOWN, 0 },
+/* 29 */ {Cache_L3, 64 },
+/* 2a */ {Cache_UNKNOWN, 0 },
+/* 2b */ {Cache_UNKNOWN, 0 },
+/* 2c */ {Cache_L1d, 64 },
+/* 2d */ {Cache_UNKNOWN, 0 },
+/* 2e */ {Cache_UNKNOWN, 0 },
+/* 2f */ {Cache_UNKNOWN, 0 },
+/* 30 */ {Cache_L1i, 64 },
+/* 31 */ {Cache_UNKNOWN, 0 },
+/* 32 */ {Cache_UNKNOWN, 0 },
+/* 33 */ {Cache_UNKNOWN, 0 },
+/* 34 */ {Cache_UNKNOWN, 0 },
+/* 35 */ {Cache_UNKNOWN, 0 },
+/* 36 */ {Cache_UNKNOWN, 0 },
+/* 37 */ {Cache_UNKNOWN, 0 },
+/* 38 */ {Cache_UNKNOWN, 0 },
+/* 39 */ {Cache_L2, 64 },
+/* 3a */ {Cache_UNKNOWN, 0 },
+/* 3b */ {Cache_L2, 64 },
+/* 3c */ {Cache_L2, 64 },
+/* 3d */ {Cache_UNKNOWN, 0 },
+/* 3e */ {Cache_UNKNOWN, 0 },
+/* 3f */ {Cache_UNKNOWN, 0 },
+/* 40 */ {Cache_L2, 0 },
+/* 41 */ {Cache_L2, 32 },
+/* 42 */ {Cache_L2, 32 },
+/* 43 */ {Cache_L2, 32 },
+/* 44 */ {Cache_L2, 32 },
+/* 45 */ {Cache_L2, 32 },
+/* 46 */ {Cache_UNKNOWN, 0 },
+/* 47 */ {Cache_UNKNOWN, 0 },
+/* 48 */ {Cache_UNKNOWN, 0 },
+/* 49 */ {Cache_UNKNOWN, 0 },
+/* 4a */ {Cache_UNKNOWN, 0 },
+/* 4b */ {Cache_UNKNOWN, 0 },
+/* 4c */ {Cache_UNKNOWN, 0 },
+/* 4d */ {Cache_UNKNOWN, 0 },
+/* 4e */ {Cache_UNKNOWN, 0 },
+/* 4f */ {Cache_UNKNOWN, 0 },
+/* 50 */ {Cache_TLBi, 0 },
+/* 51 */ {Cache_TLBi, 0 },
+/* 52 */ {Cache_TLBi, 0 },
+/* 53 */ {Cache_UNKNOWN, 0 },
+/* 54 */ {Cache_UNKNOWN, 0 },
+/* 55 */ {Cache_UNKNOWN, 0 },
+/* 56 */ {Cache_UNKNOWN, 0 },
+/* 57 */ {Cache_UNKNOWN, 0 },
+/* 58 */ {Cache_UNKNOWN, 0 },
+/* 59 */ {Cache_UNKNOWN, 0 },
+/* 5a */ {Cache_UNKNOWN, 0 },
+/* 5b */ {Cache_TLBd, 0 },
+/* 5c */ {Cache_TLBd, 0 },
+/* 5d */ {Cache_TLBd, 0 },
+/* 5e */ {Cache_UNKNOWN, 0 },
+/* 5f */ {Cache_UNKNOWN, 0 },
+/* 60 */ {Cache_UNKNOWN, 0 },
+/* 61 */ {Cache_UNKNOWN, 0 },
+/* 62 */ {Cache_UNKNOWN, 0 },
+/* 63 */ {Cache_UNKNOWN, 0 },
+/* 64 */ {Cache_UNKNOWN, 0 },
+/* 65 */ {Cache_UNKNOWN, 0 },
+/* 66 */ {Cache_L1d, 64 },
+/* 67 */ {Cache_L1d, 64 },
+/* 68 */ {Cache_L1d, 64 },
+/* 69 */ {Cache_UNKNOWN, 0 },
+/* 6a */ {Cache_UNKNOWN, 0 },
+/* 6b */ {Cache_UNKNOWN, 0 },
+/* 6c */ {Cache_UNKNOWN, 0 },
+/* 6d */ {Cache_UNKNOWN, 0 },
+/* 6e */ {Cache_UNKNOWN, 0 },
+/* 6f */ {Cache_UNKNOWN, 0 },
+/* 70 */ {Cache_Trace, 1 },
+/* 71 */ {Cache_Trace, 1 },
+/* 72 */ {Cache_Trace, 1 },
+/* 73 */ {Cache_UNKNOWN, 0 },
+/* 74 */ {Cache_UNKNOWN, 0 },
+/* 75 */ {Cache_UNKNOWN, 0 },
+/* 76 */ {Cache_UNKNOWN, 0 },
+/* 77 */ {Cache_UNKNOWN, 0 },
+/* 78 */ {Cache_UNKNOWN, 0 },
+/* 79 */ {Cache_L2, 64 },
+/* 7a */ {Cache_L2, 64 },
+/* 7b */ {Cache_L2, 64 },
+/* 7c */ {Cache_L2, 64 },
+/* 7d */ {Cache_UNKNOWN, 0 },
+/* 7e */ {Cache_UNKNOWN, 0 },
+/* 7f */ {Cache_UNKNOWN, 0 },
+/* 80 */ {Cache_UNKNOWN, 0 },
+/* 81 */ {Cache_UNKNOWN, 0 },
+/* 82 */ {Cache_L2, 32 },
+/* 83 */ {Cache_L2, 32 },
+/* 84 */ {Cache_L2, 32 },
+/* 85 */ {Cache_L2, 32 },
+/* 86 */ {Cache_L2, 64 },
+/* 87 */ {Cache_L2, 64 },
+/* 88 */ {Cache_UNKNOWN, 0 },
+/* 89 */ {Cache_UNKNOWN, 0 },
+/* 8a */ {Cache_UNKNOWN, 0 },
+/* 8b */ {Cache_UNKNOWN, 0 },
+/* 8c */ {Cache_UNKNOWN, 0 },
+/* 8d */ {Cache_UNKNOWN, 0 },
+/* 8e */ {Cache_UNKNOWN, 0 },
+/* 8f */ {Cache_UNKNOWN, 0 },
+/* 90 */ {Cache_UNKNOWN, 0 },
+/* 91 */ {Cache_UNKNOWN, 0 },
+/* 92 */ {Cache_UNKNOWN, 0 },
+/* 93 */ {Cache_UNKNOWN, 0 },
+/* 94 */ {Cache_UNKNOWN, 0 },
+/* 95 */ {Cache_UNKNOWN, 0 },
+/* 96 */ {Cache_UNKNOWN, 0 },
+/* 97 */ {Cache_UNKNOWN, 0 },
+/* 98 */ {Cache_UNKNOWN, 0 },
+/* 99 */ {Cache_UNKNOWN, 0 },
+/* 9a */ {Cache_UNKNOWN, 0 },
+/* 9b */ {Cache_UNKNOWN, 0 },
+/* 9c */ {Cache_UNKNOWN, 0 },
+/* 9d */ {Cache_UNKNOWN, 0 },
+/* 9e */ {Cache_UNKNOWN, 0 },
+/* 9f */ {Cache_UNKNOWN, 0 },
+/* a0 */ {Cache_UNKNOWN, 0 },
+/* a1 */ {Cache_UNKNOWN, 0 },
+/* a2 */ {Cache_UNKNOWN, 0 },
+/* a3 */ {Cache_UNKNOWN, 0 },
+/* a4 */ {Cache_UNKNOWN, 0 },
+/* a5 */ {Cache_UNKNOWN, 0 },
+/* a6 */ {Cache_UNKNOWN, 0 },
+/* a7 */ {Cache_UNKNOWN, 0 },
+/* a8 */ {Cache_UNKNOWN, 0 },
+/* a9 */ {Cache_UNKNOWN, 0 },
+/* aa */ {Cache_UNKNOWN, 0 },
+/* ab */ {Cache_UNKNOWN, 0 },
+/* ac */ {Cache_UNKNOWN, 0 },
+/* ad */ {Cache_UNKNOWN, 0 },
+/* ae */ {Cache_UNKNOWN, 0 },
+/* af */ {Cache_UNKNOWN, 0 },
+/* b0 */ {Cache_TLBi, 0 },
+/* b1 */ {Cache_UNKNOWN, 0 },
+/* b2 */ {Cache_UNKNOWN, 0 },
+/* b3 */ {Cache_TLBd, 0 },
+/* b4 */ {Cache_UNKNOWN, 0 },
+/* b5 */ {Cache_UNKNOWN, 0 },
+/* b6 */ {Cache_UNKNOWN, 0 },
+/* b7 */ {Cache_UNKNOWN, 0 },
+/* b8 */ {Cache_UNKNOWN, 0 },
+/* b9 */ {Cache_UNKNOWN, 0 },
+/* ba */ {Cache_UNKNOWN, 0 },
+/* bb */ {Cache_UNKNOWN, 0 },
+/* bc */ {Cache_UNKNOWN, 0 },
+/* bd */ {Cache_UNKNOWN, 0 },
+/* be */ {Cache_UNKNOWN, 0 },
+/* bf */ {Cache_UNKNOWN, 0 },
+/* c0 */ {Cache_UNKNOWN, 0 },
+/* c1 */ {Cache_UNKNOWN, 0 },
+/* c2 */ {Cache_UNKNOWN, 0 },
+/* c3 */ {Cache_UNKNOWN, 0 },
+/* c4 */ {Cache_UNKNOWN, 0 },
+/* c5 */ {Cache_UNKNOWN, 0 },
+/* c6 */ {Cache_UNKNOWN, 0 },
+/* c7 */ {Cache_UNKNOWN, 0 },
+/* c8 */ {Cache_UNKNOWN, 0 },
+/* c9 */ {Cache_UNKNOWN, 0 },
+/* ca */ {Cache_UNKNOWN, 0 },
+/* cb */ {Cache_UNKNOWN, 0 },
+/* cc */ {Cache_UNKNOWN, 0 },
+/* cd */ {Cache_UNKNOWN, 0 },
+/* ce */ {Cache_UNKNOWN, 0 },
+/* cf */ {Cache_UNKNOWN, 0 },
+/* d0 */ {Cache_UNKNOWN, 0 },
+/* d1 */ {Cache_UNKNOWN, 0 },
+/* d2 */ {Cache_UNKNOWN, 0 },
+/* d3 */ {Cache_UNKNOWN, 0 },
+/* d4 */ {Cache_UNKNOWN, 0 },
+/* d5 */ {Cache_UNKNOWN, 0 },
+/* d6 */ {Cache_UNKNOWN, 0 },
+/* d7 */ {Cache_UNKNOWN, 0 },
+/* d8 */ {Cache_UNKNOWN, 0 },
+/* d9 */ {Cache_UNKNOWN, 0 },
+/* da */ {Cache_UNKNOWN, 0 },
+/* db */ {Cache_UNKNOWN, 0 },
+/* dc */ {Cache_UNKNOWN, 0 },
+/* dd */ {Cache_UNKNOWN, 0 },
+/* de */ {Cache_UNKNOWN, 0 },
+/* df */ {Cache_UNKNOWN, 0 },
+/* e0 */ {Cache_UNKNOWN, 0 },
+/* e1 */ {Cache_UNKNOWN, 0 },
+/* e2 */ {Cache_UNKNOWN, 0 },
+/* e3 */ {Cache_UNKNOWN, 0 },
+/* e4 */ {Cache_UNKNOWN, 0 },
+/* e5 */ {Cache_UNKNOWN, 0 },
+/* e6 */ {Cache_UNKNOWN, 0 },
+/* e7 */ {Cache_UNKNOWN, 0 },
+/* e8 */ {Cache_UNKNOWN, 0 },
+/* e9 */ {Cache_UNKNOWN, 0 },
+/* ea */ {Cache_UNKNOWN, 0 },
+/* eb */ {Cache_UNKNOWN, 0 },
+/* ec */ {Cache_UNKNOWN, 0 },
+/* ed */ {Cache_UNKNOWN, 0 },
+/* ee */ {Cache_UNKNOWN, 0 },
+/* ef */ {Cache_UNKNOWN, 0 },
+/* f0 */ {Cache_UNKNOWN, 0 },
+/* f1 */ {Cache_UNKNOWN, 0 },
+/* f2 */ {Cache_UNKNOWN, 0 },
+/* f3 */ {Cache_UNKNOWN, 0 },
+/* f4 */ {Cache_UNKNOWN, 0 },
+/* f5 */ {Cache_UNKNOWN, 0 },
+/* f6 */ {Cache_UNKNOWN, 0 },
+/* f7 */ {Cache_UNKNOWN, 0 },
+/* f8 */ {Cache_UNKNOWN, 0 },
+/* f9 */ {Cache_UNKNOWN, 0 },
+/* fa */ {Cache_UNKNOWN, 0 },
+/* fb */ {Cache_UNKNOWN, 0 },
+/* fc */ {Cache_UNKNOWN, 0 },
+/* fd */ {Cache_UNKNOWN, 0 },
+/* fe */ {Cache_UNKNOWN, 0 },
+/* ff */ {Cache_UNKNOWN, 0 }
+};
+
+
+/*
+ * use the above table to determine the CacheEntryLineSize.
+ */
+static void
+getIntelCacheEntryLineSize(unsigned long val, int *level,
+ unsigned long *lineSize)
+{
+ CacheType type;
+
+ type = CacheMap[val].type;
+ /* only interested in data caches */
+ /* NOTE val = 0x40 is a special value that means no L2 or L3 cache.
+ * this data check has the side effect of rejecting that entry. If
+ * that wasn't the case, we could have to reject it explicitly */
+ if (CacheMap[val].lineSize == 0) {
+ return;
+ }
+ /* look at the caches, skip types we aren't interested in.
+ * if we already have a value for a lower level cache, skip the
+ * current entry */
+ if ((type == Cache_L1)|| (type == Cache_L1d)) {
+ *level = 1;
+ *lineSize = CacheMap[val].lineSize;
+ } else if ((*level >= 2) && ((type == Cache_L2) || (type == Cache_L2d))) {
+ *level = 2;
+ *lineSize = CacheMap[val].lineSize;
+ } else if ((*level >= 3) && ((type == Cache_L3) || (type == Cache_L3d))) {
+ *level = 3;
+ *lineSize = CacheMap[val].lineSize;
+ }
+ return;
+}
+
+
+static void
+getIntelRegisterCacheLineSize(unsigned long val,
+ int *level, unsigned long *lineSize)
+{
+ getIntelCacheEntryLineSize(val >> 24 & 0xff, level, lineSize);
+ getIntelCacheEntryLineSize(val >> 16 & 0xff, level, lineSize);
+ getIntelCacheEntryLineSize(val >> 8 & 0xff, level, lineSize);
+ getIntelCacheEntryLineSize(val & 0xff, level, lineSize);
+}
+
+/*
+ * returns '0' if no recognized cache is found, or if the cache
+ * information is supported by this processor
+ */
+static unsigned long
+getIntelCacheLineSize(int cpuidLevel)
+{
+ int level = 4;
+ unsigned long lineSize = 0;
+ unsigned long eax, ebx, ecx, edx;
+ int repeat, count;
+
+ if (cpuidLevel < 2) {
+ return 0;
+ }
+
+ /* command '2' of the cpuid is intel's cache info call. Each byte of the
+ * 4 registers contain a potential descriptor for the cache. The CacheMap
+ * table maps the cache entry with the processor cache. Register 'al'
+ * contains a count value that cpuid '2' needs to be called in order to
+ * find all the cache descriptors. Only registers with the high bit set
+ * to 'zero' have valid descriptors. This code loops through all the
+ * required calls to cpuid '2' and passes any valid descriptors it finds
+ * to the getIntelRegisterCacheLineSize code, which breaks the registers
+ * down into their component descriptors. In the end the lineSize of the
+ * lowest level cache data cache is returned. */
+ freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
+ repeat = eax & 0xf;
+ for (count = 0; count < repeat; count++) {
+ if ((eax & 0x80000000) == 0) {
+ getIntelRegisterCacheLineSize(eax & 0xffffff00, &level, &lineSize);
+ }
+ if ((ebx & 0x80000000) == 0) {
+ getIntelRegisterCacheLineSize(ebx, &level, &lineSize);
+ }
+ if ((ecx & 0x80000000) == 0) {
+ getIntelRegisterCacheLineSize(ecx, &level, &lineSize);
+ }
+ if ((edx & 0x80000000) == 0) {
+ getIntelRegisterCacheLineSize(edx, &level, &lineSize);
+ }
+ if (count+1 != repeat) {
+ freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
+ }
+ }
+ return lineSize;
+}
+
+/*
+ * returns '0' if the cache info is not supported by this processor.
+ * This is based on the AMD extended cache commands for cpuid.
+ * (see "AMD Processor Recognition Application Note" Publication 20734).
+ * Some other processors use the identical scheme.
+ * (see "Processor Recognition, Transmeta Corporation").
+ */
+static unsigned long
+getOtherCacheLineSize(unsigned long cpuidLevel)
+{
+ unsigned long lineSize = 0;
+ unsigned long eax, ebx, ecx, edx;
+
+ /* get the Extended CPUID level */
+ freebl_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+ cpuidLevel = eax;
+
+ if (cpuidLevel >= 0x80000005) {
+ freebl_cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ lineSize = ecx & 0xff; /* line Size, L1 Data Cache */
+ }
+ return lineSize;
+}
+
+static const char * const manMap[] = {
+#define INTEL 0
+ "GenuineIntel",
+#define AMD 1
+ "AuthenticAMD",
+#define CYRIX 2
+ "CyrixInstead",
+#define CENTAUR 2
+ "CentaurHauls",
+#define NEXGEN 3
+ "NexGenDriven",
+#define TRANSMETA 4
+ "GenuineTMx86",
+#define RISE 5
+ "RiseRiseRise",
+#define UMC 6
+ "UMC UMC UMC ",
+#define SIS 7
+ "Sis Sis Sis ",
+#define NATIONAL 8
+ "Geode by NSC",
+};
+
+static const int n_manufacturers = sizeof(manMap)/sizeof(manMap[0]);
+
+
+#define MAN_UNKNOWN 9
+
+#if !defined(AMD_64)
+#define SSE2_FLAG (1<<26)
+unsigned long
+s_mpi_is_sse2()
+{
+ unsigned long eax, ebx, ecx, edx;
+ int manufacturer = MAN_UNKNOWN;
+ int i;
+ char string[13];
+
+ if (is386() || is486()) {
+ return 0;
+ }
+ freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
+ /* string holds the CPU's manufacturer ID string - a twelve
+ * character ASCII string stored in ebx, edx, ecx, and
+ * the 32-bit extended feature flags are in edx, ecx.
+ */
+ *(int *)string = ebx;
+ *(int *)&string[4] = (int)edx;
+ *(int *)&string[8] = (int)ecx;
+ string[12] = 0;
+
+ /* has no SSE2 extensions */
+ if (eax == 0) {
+ return 0;
+ }
+
+ for (i=0; i < n_manufacturers; i++) {
+ if ( strcmp(manMap[i],string) == 0) {
+ manufacturer = i;
+ break;
+ }
+ }
+
+ freebl_cpuid(1,&eax,&ebx,&ecx,&edx);
+ return (edx & SSE2_FLAG) == SSE2_FLAG;
+}
+#endif
+
+unsigned long
+s_mpi_getProcessorLineSize()
+{
+ unsigned long eax, ebx, ecx, edx;
+ unsigned long cpuidLevel;
+ unsigned long cacheLineSize = 0;
+ int manufacturer = MAN_UNKNOWN;
+ int i;
+ char string[65];
+
+#if !defined(AMD_64)
+ if (is386()) {
+ return 0; /* 386 had no cache */
+ } if (is486()) {
+ return 32; /* really? need more info */
+ }
+#endif
+
+ /* Pentium, cpuid command is available */
+ freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
+ cpuidLevel = eax;
+ /* string holds the CPU's manufacturer ID string - a twelve
+ * character ASCII string stored in ebx, edx, ecx, and
+ * the 32-bit extended feature flags are in edx, ecx.
+ */
+ *(int *)string = ebx;
+ *(int *)&string[4] = (int)edx;
+ *(int *)&string[8] = (int)ecx;
+ string[12] = 0;
+
+ manufacturer = MAN_UNKNOWN;
+ for (i=0; i < n_manufacturers; i++) {
+ if ( strcmp(manMap[i],string) == 0) {
+ manufacturer = i;
+ }
+ }
+
+ if (manufacturer == INTEL) {
+ cacheLineSize = getIntelCacheLineSize(cpuidLevel);
+ } else {
+ cacheLineSize = getOtherCacheLineSize(cpuidLevel);
+ }
+ /* doesn't support cache info based on cpuid. This means
+ * an old pentium class processor, which have cache lines of
+ * 32. If we learn differently, we can use a switch based on
+ * the Manufacturer id */
+ if (cacheLineSize == 0) {
+ cacheLineSize = 32;
+ }
+ return cacheLineSize;
+}
+#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
+#endif
+
+#if defined(__ppc64__)
+/*
+ * Sigh, The PPC has some really nice features to help us determine cache
+ * size, since it had lots of direct control functions to do so. The POWER
+ * processor even has an instruction to do this, but it was dropped in
+ * PowerPC. Unfortunately most of them are not available in user mode.
+ *
+ * The dcbz function would be a great way to determine cache line size except
+ * 1) it only works on write-back memory (it throws an exception otherwise),
+ * and 2) because so many mac programs 'knew' the processor cache size was
+ * 32 bytes, they used this instruction as a fast 'zero 32 bytes'. Now the new
+ * G5 processor has 128 byte cache, but dcbz only clears 32 bytes to keep
+ * these programs happy. dcbzl work if 64 bit instructions are supported.
+ * If you know 64 bit instructions are supported, and that stack is
+ * write-back, you can use this code.
+ */
+#include "memory.h"
+
+/* clear the cache line that contains 'array' */
+static inline void dcbzl(char *array)
+{
+ register char *a asm("r2") = array;
+ __asm__ __volatile__( "dcbzl %0,r0" : "=r" (a): "0"(a) );
+}
+
+
+#define PPC_DO_ALIGN(x,y) ((char *)\
+ ((((long long) (x))+((y)-1))&~((y)-1)))
+
+#define PPC_MAX_LINE_SIZE 256
+unsigned long
+s_mpi_getProcessorLineSize()
+{
+ char testArray[2*PPC_MAX_LINE_SIZE+1];
+ char *test;
+ int i;
+
+ /* align the array on a maximum line size boundary, so we
+ * know we are starting to clear from the first address */
+ test = PPC_DO_ALIGN(testArray, PPC_MAX_LINE_SIZE);
+ /* set all the values to 1's */
+ memset(test, 0xff, PPC_MAX_LINE_SIZE);
+ /* clear one cache block starting at 'test' */
+ dcbzl(test);
+
+ /* find the size of the cleared area, that's our block size */
+ for (i=PPC_MAX_LINE_SIZE; i != 0; i = i/2) {
+ if (test[i-1] == 0) {
+ return i;
+ }
+ }
+ return 0;
+}
+
+#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
+#endif
+
+
+/*
+ * put other processor and platform specific cache code here
+ * return the smallest cache line size in bytes on the processor
+ * (usually the L1 cache). If the OS has a call, this would be
+ * a greate place to put it.
+ *
+ * If there is no cache, return 0;
+ *
+ * define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED so the generic functions
+ * below aren't compiled.
+ *
+ */
+
+
+/* target.mk can define MPI_CACHE_LINE_SIZE if it's common for the family or
+ * OS */
+#if defined(MPI_CACHE_LINE_SIZE) && !defined(MPI_GET_PROCESSOR_LINE_SIZE_DEFINED)
+
+unsigned long
+s_mpi_getProcessorLineSize()
+{
+ return MPI_CACHE_LINE_SIZE;
+}
+#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
+#endif
+
+
+/* If no way to get the processor cache line size has been defined, assume
+ * it's 32 bytes (most common value, does not significantly impact performance)
+ */
+#ifndef MPI_GET_PROCESSOR_LINE_SIZE_DEFINED
+unsigned long
+s_mpi_getProcessorLineSize()
+{
+ return 32;
+}
+#endif
+
+#ifdef TEST_IT
+#include <stdio.h>
+
+main()
+{
+ printf("line size = %d\n", s_mpi_getProcessorLineSize());
+}
+#endif