Diffstat (limited to 'gcc/config/spu/cachemgr.c')
-rw-r--r--  gcc/config/spu/cachemgr.c  |  438
1 file changed, 438 insertions, 0 deletions
diff --git a/gcc/config/spu/cachemgr.c b/gcc/config/spu/cachemgr.c
new file mode 100644
index 00000000000..e7abd5e62db
--- /dev/null
+++ b/gcc/config/spu/cachemgr.c
@@ -0,0 +1,438 @@
+/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#include <spu_mfcio.h>
+#include <spu_internals.h>
+#include <spu_intrinsics.h>
+#include <spu_cache.h>
+
+extern unsigned long long __ea_local_store;
+extern char __cache_tag_array_size;
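+/* The address of __cache_tag_array_size, not its contents, encodes the
+ size in bytes of the cache tag array; it is presumably defined by the
+ linker script.  SET_MASK and CACHE_LINES below are derived from it. */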
+
+#define LINE_SIZE 128
+#define TAG_MASK (LINE_SIZE - 1)
+
+#define WAYS 4
+#define SET_MASK ((int) &__cache_tag_array_size - LINE_SIZE)
+
+#define CACHE_LINES ((int) &__cache_tag_array_size / \
+ sizeof (struct __cache_tag_array) * WAYS)
+
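+/* One set of the 4-way set-associative software cache.  Each way holds
+ the (possibly 64-bit) tag split into tag_lo/tag_hi, the local-store
+ address of the cached data, an LRU/flags word (reserved), and a 128-bit
+ dirty mask with one bit per byte of the 128-byte line. */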
+struct __cache_tag_array
+{
+ unsigned int tag_lo[WAYS];
+ unsigned int tag_hi[WAYS];
+ void *base[WAYS];
+ int reserved[WAYS];
+ vector unsigned short dirty_bits[WAYS];
+};
+
+extern struct __cache_tag_array __cache_tag_array[];
+extern char __cache[];
+
+/* In order to make the code seem a little cleaner, and to avoid having
+ 64/32 bit ifdefs all over the place, we use macros. */
+
+#ifdef __EA64__
+typedef unsigned long long addr;
+
+#define CHECK_TAG(_entry, _way, _tag) \
+ ((_entry)->tag_lo[(_way)] == ((_tag) & 0xFFFFFFFF) \
+ && (_entry)->tag_hi[(_way)] == ((_tag) >> 32))
+
+#define GET_TAG(_entry, _way) \
+ ((unsigned long long)(_entry)->tag_hi[(_way)] << 32 \
+ | (unsigned long long)(_entry)->tag_lo[(_way)])
+
+#define SET_TAG(_entry, _way, _tag) \
+ (_entry)->tag_lo[(_way)] = (_tag) & 0xFFFFFFFF; \
+ (_entry)->tag_hi[(_way)] = (_tag) >> 32
+
+#else /*__EA32__*/
+typedef unsigned long addr;
+
+#define CHECK_TAG(_entry, _way, _tag) \
+ ((_entry)->tag_lo[(_way)] == (_tag))
+
+#define GET_TAG(_entry, _way) \
+ ((_entry)->tag_lo[(_way)])
+
+#define SET_TAG(_entry, _way, _tag) \
+ (_entry)->tag_lo[(_way)] = (_tag)
+
+#endif
+
+/* In GET_ENTRY, we cast away the high 32 bits of the address;
+ only the low 32 bits are needed to select the set. */
+
+#define GET_ENTRY(_addr) \
+ ((struct __cache_tag_array *) \
+ si_to_uint (si_a (si_and (si_from_uint ((unsigned int) (addr) (_addr)), \
+ si_from_uint (SET_MASK)), \
+ si_from_uint ((unsigned int) __cache_tag_array))))
+
+#define GET_CACHE_LINE(_addr, _way) \
+ ((void *) (__cache + ((_addr) & SET_MASK) * WAYS) + ((_way) * LINE_SIZE))
+
+#define CHECK_DIRTY(_vec) (si_to_uint (si_orx ((qword) (_vec))))
+#define SET_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] = 1)
+#define CHECK_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] == 1)
+
+#define LS_FLAG 0x80000000
+#define SET_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] |= LS_FLAG)
+#define CHECK_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] & LS_FLAG)
+#define GET_LRU(_entry, _way) ((_entry)->reserved[(_way)] & ~LS_FLAG)
+
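+/* MFC tag IDs range from 0 to 31, so 32 means "not yet reserved";
+ __cache_fill reserves a tag on first use. */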
+static int dma_tag = 32;
+
+static void
+__cache_evict_entry (struct __cache_tag_array *entry, int way)
+{
+ addr tag = GET_TAG (entry, way);
+
+ if (CHECK_DIRTY (entry->dirty_bits[way]) && !CHECK_IS_LS (entry, way))
+ {
+#ifdef NONATOMIC
+ /* Non-atomic writes. */
+ unsigned int oldmask, mach_stat;
+ char *line = ((void *) 0);
+
+ /* Enter critical section. */
+ mach_stat = spu_readch (SPU_RdMachStat);
+ spu_idisable ();
+
+ /* Issue DMA request. */
+ line = GET_CACHE_LINE (entry->tag_lo[way], way);
+ mfc_put (line, tag, LINE_SIZE, dma_tag, 0, 0);
+
+ /* Wait for DMA completion. */
+ oldmask = mfc_read_tag_mask ();
+ mfc_write_tag_mask (1 << dma_tag);
+ mfc_read_tag_status_all ();
+ mfc_write_tag_mask (oldmask);
+
+ /* Leave critical section. */
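+ /* Bit 0 of the saved machine status is the interrupt-enable bit, so
+ interrupts are re-enabled only if they were enabled on entry. */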
+ if (__builtin_expect (mach_stat & 1, 0))
+ spu_ienable ();
+#else
+ /* Allocate a buffer large enough that we know it has 128 bytes
+ that are 128 byte aligned (for DMA). */
+
+ char buffer[LINE_SIZE + 127];
+ qword *buf_ptr = (qword *) (((unsigned int) (buffer) + 127) & ~127);
+ qword *line = GET_CACHE_LINE (entry->tag_lo[way], way);
+ qword bits;
+ unsigned int mach_stat;
+
+ /* Enter critical section. */
+ mach_stat = spu_readch (SPU_RdMachStat);
+ spu_idisable ();
+
+ do
+ {
+ /* We atomically read the current memory into a buffer,
+ modify the dirty bytes in the buffer, and write it
+ back.  If the writeback fails, loop and try again. */
+
+ mfc_getllar (buf_ptr, tag, 0, 0);
+ mfc_read_atomic_status ();
+
+ /* We write 16 dirty bytes into the buffer at a time using
+ fsmb, which takes its mask from the least significant
+ 16 bits of word 0, so we load the dirty bits and rotate
+ them so that the first bit of the bitmap is in the first
+ bit that fsmb will use. */
+
+ bits = (qword) entry->dirty_bits[way];
+ bits = si_rotqbyi (bits, -2);
+
+ /* si_fsmb creates the mask of dirty bytes.
+ Use selb to nab the appropriate bits. */
+ buf_ptr[0] = si_selb (buf_ptr[0], line[0], si_fsmb (bits));
+
+ /* Rotate the dirty bits to the next 16-byte section of the line. */
+ bits = si_rotqbyi (bits, 2);
+
+ buf_ptr[1] = si_selb (buf_ptr[1], line[1], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[2] = si_selb (buf_ptr[2], line[2], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[3] = si_selb (buf_ptr[3], line[3], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[4] = si_selb (buf_ptr[4], line[4], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[5] = si_selb (buf_ptr[5], line[5], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[6] = si_selb (buf_ptr[6], line[6], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+ buf_ptr[7] = si_selb (buf_ptr[7], line[7], si_fsmb (bits));
+ bits = si_rotqbyi (bits, 2);
+
+ mfc_putllc (buf_ptr, tag, 0, 0);
+ }
+ while (mfc_read_atomic_status ());
+
+ /* Leave critical section. */
+ if (__builtin_expect (mach_stat & 1, 0))
+ spu_ienable ();
+#endif
+ }
+
+ /* In any case, mark the lo tag with 1, which denotes an empty entry. */
+ SET_EMPTY (entry, way);
+ entry->dirty_bits[way] = (vector unsigned short) si_from_uint (0);
+}
+
+void
+__cache_evict (__ea void *ea)
+{
+ addr tag = (addr) ea & ~TAG_MASK;
+ struct __cache_tag_array *entry = GET_ENTRY (ea);
+ int i = 0;
+
+ /* Cycle through all the ways the address could be cached in
+ and evict any way whose tag matches. */
+
+ for (i = 0; i < WAYS; i++)
+ if (CHECK_TAG (entry, i, tag))
+ __cache_evict_entry (entry, i);
+}
+
+static void *
+__cache_fill (int way, addr tag)
+{
+ unsigned int oldmask, mach_stat;
+ char *line = ((void *) 0);
+
+ /* Reserve our DMA tag. */
+ if (dma_tag == 32)
+ dma_tag = mfc_tag_reserve ();
+
+ /* Enter critical section. */
+ mach_stat = spu_readch (SPU_RdMachStat);
+ spu_idisable ();
+
+ /* Issue DMA request. */
+ line = GET_CACHE_LINE (tag, way);
+ mfc_get (line, tag, LINE_SIZE, dma_tag, 0, 0);
+
+ /* Wait for DMA completion. */
+ oldmask = mfc_read_tag_mask ();
+ mfc_write_tag_mask (1 << dma_tag);
+ mfc_read_tag_status_all ();
+ mfc_write_tag_mask (oldmask);
+
+ /* Leave critical section. */
+ if (__builtin_expect (mach_stat & 1, 0))
+ spu_ienable ();
+
+ return (void *) line;
+}
+
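+/* Handle a miss: WAY is the index of the first empty way found by the
+ caller, or WAYS if every way of the set is occupied, in which case the
+ least recently used way is evicted and reused. */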
+static void
+__cache_miss (__ea void *ea, struct __cache_tag_array *entry, int way)
+{
+
+ addr tag = (addr) ea & ~TAG_MASK;
+ unsigned int lru = 0;
+ int i = 0;
+ int idx = 0;
+
+ /* If way >= WAYS, there are no empty slots, so we must evict
+ the least recently used entry. */
+ if (way >= WAYS)
+ {
+ for (i = 0; i < WAYS; i++)
+ {
+ if (GET_LRU (entry, i) > lru)
+ {
+ lru = GET_LRU (entry, i);
+ idx = i;
+ }
+ }
+ __cache_evict_entry (entry, idx);
+ way = idx;
+ }
+
+ /* Set the empty entry's tag and fill its cache line. */
+
+ SET_TAG (entry, way, tag);
+ entry->reserved[way] = 0;
+
+ /* Check if the address is just an effective address within the
+ SPU's local store. */
+
+ /* Because the LS is not 256k aligned, we can't simply AND with
+ a mask here to compare, so we must check the whole 256k
+ (0x40000 byte) range. */
+
+ if ((addr) ea >= (addr) __ea_local_store
+ && (addr) ea < (addr) (__ea_local_store + 0x40000))
+ {
+ SET_IS_LS (entry, way);
+ entry->base[way] =
+ (void *) ((unsigned int) ((addr) ea -
+ (addr) __ea_local_store) & ~0x7f);
+ }
+ else
+ {
+ entry->base[way] = __cache_fill (way, tag);
+ }
+}
+
+void *
+__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty)
+{
+#ifdef __EA64__
+ unsigned int tag_hi;
+ qword etag_hi;
+#endif
+ unsigned int tag_lo;
+ struct __cache_tag_array *entry;
+
+ qword etag_lo;
+ qword equal;
+ qword bit_mask;
+ qword way;
+
+ /* In this first chunk we merely compute the entry pointer and the tag. */
+
+ entry = GET_ENTRY (ea);
+
+#ifndef __EA64__
+ tag_lo =
+ si_to_uint (si_andc
+ (si_shufb
+ (si_from_uint ((addr) ea), si_from_uint (0),
+ si_from_uint (0x00010203)), si_from_uint (TAG_MASK)));
+#else
+ tag_lo =
+ si_to_uint (si_andc
+ (si_shufb
+ (si_from_ullong ((addr) ea), si_from_uint (0),
+ si_from_uint (0x04050607)), si_from_uint (TAG_MASK)));
+
+ tag_hi =
+ si_to_uint (si_shufb
+ (si_from_ullong ((addr) ea), si_from_uint (0),
+ si_from_uint (0x00010203)));
+#endif
+
+ /* Increment LRU in reserved bytes. */
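+ /* Offset 48 within the entry is the reserved[] quadword, so this
+ single quadword add bumps the LRU count of all four ways at once. */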
+ si_stqd (si_ai (si_lqd (si_from_ptr (entry), 48), 1),
+ si_from_ptr (entry), 48);
+
+missreturn:
+ /* Check if the entry's lo_tag is equal to the address' lo_tag. */
+ etag_lo = si_lqd (si_from_ptr (entry), 0);
+ equal = si_ceq (etag_lo, si_from_uint (tag_lo));
+#ifdef __EA64__
+ /* And the high tag too. */
+ etag_hi = si_lqd (si_from_ptr (entry), 16);
+ equal = si_and (equal, (si_ceq (etag_hi, si_from_uint (tag_hi))));
+#endif
+
+ if ((si_to_uint (si_orx (equal)) == 0))
+ goto misshandler;
+
+ if (n_bytes_dirty)
+ {
+ /* way = 0x40,0x50,0x60,0x70 for each way, which is also the
+ offset of the appropriate dirty bits. */
+ way = si_shli (si_clz (si_gbb (equal)), 2);
+
+ /* To create the bit_mask, we set it to all 1s (uint -1), then
+ shift it left by (LINE_SIZE - n_bytes_dirty) bits. */
+
+ bit_mask = si_from_uint (-1);
+
+ bit_mask =
+ si_shlqby (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) / 8));
+
+ bit_mask =
+ si_shlqbi (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) % 8));
+
+ /* Rotate it around to the correct offset. */
+ bit_mask =
+ si_rotqby (bit_mask,
+ si_from_uint (-1 * ((addr) ea & TAG_MASK) / 8));
+
+ bit_mask =
+ si_rotqbi (bit_mask,
+ si_from_uint (-1 * ((addr) ea & TAG_MASK) % 8));
+
+ /* Update the dirty bits. */
+ si_stqx (si_or (si_lqx (si_from_ptr (entry), way), bit_mask),
+ si_from_ptr (entry), way);
+ }
+
+ /* We've definitely found the right entry; clear the matching way's
+ LRU count (reserved) to 0 while maintaining its LS flag (MSB). */
+
+ si_stqd (si_andc
+ (si_lqd (si_from_ptr (entry), 48),
+ si_and (equal, si_from_uint (~(LS_FLAG)))),
+ si_from_ptr (entry), 48);
+
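+ /* Select the matching way's base pointer (offset 32 is the base[]
+ quadword) and add EA's offset within the cache line. */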
+ return (void *)
+ si_to_uint (si_a
+ (si_orx
+ (si_and (si_lqd (si_from_ptr (entry), 32), equal)),
+ si_from_uint (((unsigned int) (addr) ea) & TAG_MASK)));
+
+misshandler:
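+ /* Find the first empty way (tag_lo == 1); if none is empty the
+ computed index is WAYS and __cache_miss will evict the LRU way. */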
+ equal = si_ceqi (etag_lo, 1);
+ __cache_miss (ea, entry, (si_to_uint (si_clz (si_gbb (equal))) - 16) >> 2);
+ goto missreturn;
+}
+
+void *
+__cache_fetch (__ea void *ea)
+{
+ return __cache_fetch_dirty (ea, 0);
+}
+
+void
+__cache_touch (__ea void *ea __attribute__ ((unused)))
+{
+ /* NO-OP for now. */
+}
+
+void __cache_flush (void) __attribute__ ((destructor));
+void
+__cache_flush (void)
+{
+ struct __cache_tag_array *entry = __cache_tag_array;
+ unsigned int i;
+ int j;
+
+ /* Cycle through each cache entry and evict all used ways. */
+
+ for (i = 0; i < CACHE_LINES / WAYS; i++)
+ {
+ for (j = 0; j < WAYS; j++)
+ if (!CHECK_EMPTY (entry, j))
+ __cache_evict_entry (entry, j);
+
+ entry++;
+ }
+}