author     Liu Xinyun <xinyun.liu@intel.com>           2010-09-25 14:56:38 +0800
committer  Søren Sandmann Pedersen <ssp@redhat.com>    2010-09-27 23:44:09 -0400
commit     ba69989374fe9cbe5151c5aac7b824da0806f94a (patch)
tree       3a5386a03e88a37101e487f419f3bd96ce054465
parent     56777f3f675869806cd30bcd21a5b39d788507cb (diff)
download   pixman-ba69989374fe9cbe5151c5aac7b824da0806f94a.tar.gz
Remove cache prefetch code.
Cache prefetching decreases performance, especially on Atom, so remove
this code. The experiment below compares the two builds; a sketch of the
removed prefetch helpers follows the diffstat.

old: 0.19.5-with-cache-prefetch
new: 0.19.5-without-cache-prefetch

CPU: Intel Atom N270@1.6GHz
OS:  MeeGo (32 bits)

Speedups
========
image-rgba poppler-0               17125.68 (17279.58 0.92%)  -> 14765.36 (15926.49 3.54%): 1.16x speedup
image-rgba ocitysmap-0              9008.25 (9040.41 7.50%)   ->  8277.94 (8343.09 5.44%):  1.09x speedup
image-rgba xfce4-terminal-a1-0     18020.76 (18230.68 0.97%)  -> 16703.77 (16712.42 1.22%): 1.08x speedup
image-rgba gnome-terminal-vim-0    25081.38 (25133.38 0.24%)  -> 23407.47 (23652.98 0.54%): 1.07x speedup
image-rgba firefox-talos-gfx-0     57916.97 (57973.20 0.11%)  -> 54556.64 (54624.55 0.39%): 1.06x speedup
image-rgba firefox-planet-gnome-0 102377.47 (103496.63 0.70%) -> 96816.65 (97075.54 0.15%): 1.06x speedup
image-rgba swfdec-giant-steps-0    12376.24 (12616.84 1.02%)  -> 11705.30 (11825.20 1.06%): 1.06x speedup

CPU: Intel Core(TM)2 Duo CPU T9600@2.80GHz
OS:  Ubuntu 10.04 (64 bits)

Speedups
========
image-rgba ocitysmap-0              2671.46 (2691.82 8.55%) -> 2296.20 (2307.26 5.77%): 1.16x speedup
image-rgba swfdec-giant-steps-0     1614.55 (1615.18 1.68%) -> 1532.84 (1538.52 0.72%): 1.05x speedup

Signed-off-by: Liu Xinyun <xinyun.liu@intel.com>
Signed-off-by: Chen Miaobo <miaobo.chen@intel.com>
-rw-r--r--  pixman/pixman-sse2.c | 659 ------------------------------------
1 file changed, 0 insertions(+), 659 deletions(-)
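For context, here is a minimal sketch of the pattern this commit deletes,
reconstructed from the hunks below (it is not part of the patch itself).
The SSE2 fast paths issued _mm_prefetch hints with the T0 locality hint
for the source, destination, and optionally-NULL mask pointers; since one
__m128i is 16 bytes, addr + 4 points 64 bytes (one cache line) ahead.

    #include <emmintrin.h>  /* __m128i */
    #include <xmmintrin.h>  /* _mm_prefetch, _MM_HINT_T0 */

    /* Hint the CPU to pull the line containing *addr into all cache
     * levels (temporal locality hint T0). */
    static inline void
    cache_prefetch (__m128i *addr)
    {
        _mm_prefetch ((void const *) addr, _MM_HINT_T0);
    }

    /* Prefetch the next cache line: 4 x 16-byte __m128i = 64 bytes. */
    static inline void
    cache_prefetch_next (__m128i *addr)
    {
        _mm_prefetch ((void const *) (addr + 4), _MM_HINT_T0);
    }

    /* Prefetching NULL is very slow on some systems, so the optional
     * mask pointer is guarded. */
    static inline void
    maybe_prefetch_next (__m128i *addr)
    {
        if (addr)
            cache_prefetch_next (addr);
    }

The benchmark numbers above suggest that on an in-order core such as Atom
these explicit hints cost more in issue slots than they save in latency,
and the hardware prefetcher already tracks these purely sequential access
patterns, so deleting them is a net win.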
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 33d71ee..112a8c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -357,34 +357,6 @@ in_over_2x128 (__m128i* src_lo,
over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}
-static force_inline void
-cache_prefetch (__m128i* addr)
-{
- _mm_prefetch ((void const*)addr, _MM_HINT_T0);
-}
-
-static force_inline void
-cache_prefetch_next (__m128i* addr)
-{
- _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
-}
-
-/* prefetching NULL is very slow on some systems. don't do that. */
-
-static force_inline void
-maybe_prefetch (__m128i* addr)
-{
- if (addr)
- cache_prefetch (addr);
-}
-
-static force_inline void
-maybe_prefetch_next (__m128i* addr)
-{
- if (addr)
- cache_prefetch_next (addr);
-}
-
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
@@ -649,11 +621,6 @@ core_combine_over_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w && ((unsigned long)pd & 15))
{
@@ -667,18 +634,8 @@ core_combine_over_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure about
* the address alignment.
*/
@@ -740,11 +697,6 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_alpha_lo, xmm_alpha_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
/* Align dst on a 16-byte boundary */
while (w &&
((unsigned long)pd & 15))
@@ -759,18 +711,8 @@ core_combine_over_reverse_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -842,11 +784,6 @@ core_combine_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -859,18 +796,8 @@ core_combine_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -916,11 +843,6 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -933,18 +855,8 @@ core_combine_reverse_in_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -985,11 +897,6 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1006,21 +913,11 @@ core_combine_reverse_out_u_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1067,11 +964,6 @@ core_combine_out_u_sse2 (uint32_t* pd,
const uint32_t* pm,
int w)
{
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
uint32_t s = combine1 (ps, pm);
@@ -1087,21 +979,11 @@ core_combine_out_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i xmm_src_lo, xmm_src_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1167,11 +1049,6 @@ core_combine_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1184,18 +1061,8 @@ core_combine_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1264,11 +1131,6 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1281,18 +1143,8 @@ core_combine_reverse_atop_u_sse2 (uint32_t* pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1365,11 +1217,6 @@ core_combine_xor_u_sse2 (uint32_t* dst,
__m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && ((unsigned long) pd & 15))
{
s = combine1 (ps, pm);
@@ -1382,18 +1229,8 @@ core_combine_xor_u_sse2 (uint32_t* dst,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1450,11 +1287,6 @@ core_combine_add_u_sse2 (uint32_t* dst,
const uint32_t* ps = src;
const uint32_t* pm = mask;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1468,20 +1300,10 @@ core_combine_add_u_sse2 (uint32_t* dst,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
__m128i s;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
s = combine4 ((__m128i*)ps, (__m128i*)pm);
save_128_aligned (
@@ -1536,11 +1358,6 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
uint32_t pack_cmp;
__m128i xmm_src, xmm_dst;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = combine1 (ps, pm);
@@ -1553,18 +1370,8 @@ core_combine_saturate_u_sse2 (uint32_t * pd,
pm++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- maybe_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- maybe_prefetch_next ((__m128i*)pm);
-
xmm_dst = load_128_aligned ((__m128i*)pd);
xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -1637,11 +1444,6 @@ core_combine_src_ca_sse2 (uint32_t* pd,
__m128i xmm_mask_lo, xmm_mask_hi;
__m128i xmm_dst_lo, xmm_dst_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1651,18 +1453,8 @@ core_combine_src_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1718,11 +1510,6 @@ core_combine_over_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1733,18 +1520,8 @@ core_combine_over_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1807,11 +1584,6 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1822,18 +1594,8 @@ core_combine_over_reverse_ca_sse2 (uint32_t* pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1885,11 +1647,6 @@ core_combine_in_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1904,18 +1661,8 @@ core_combine_in_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -1973,11 +1720,6 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -1992,18 +1734,8 @@ core_combine_in_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2059,11 +1791,6 @@ core_combine_out_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2078,18 +1805,8 @@ core_combine_out_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2148,11 +1865,6 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2168,18 +1880,8 @@ core_combine_out_reverse_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2258,11 +1960,6 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2273,18 +1970,8 @@ core_combine_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2364,11 +2051,6 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2379,18 +2061,8 @@ core_combine_reverse_atop_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2473,11 +2145,6 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2488,18 +2155,8 @@ core_combine_xor_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
@@ -2562,11 +2219,6 @@ core_combine_add_ca_sse2 (uint32_t * pd,
__m128i xmm_dst_lo, xmm_dst_hi;
__m128i xmm_mask_lo, xmm_mask_hi;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
s = *ps++;
@@ -2580,18 +2232,8 @@ core_combine_add_ca_sse2 (uint32_t * pd,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)ps);
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)ps);
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_src_hi = load_128_unaligned ((__m128i*)ps);
xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
xmm_dst_hi = load_128_aligned ((__m128i*)pd);
@@ -2971,9 +2613,6 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -2986,13 +2625,8 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
w--;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -3062,9 +2696,6 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -3079,14 +2710,8 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_565_128_4x128 (xmm_dst,
@@ -3177,10 +2802,6 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3200,16 +2821,8 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3316,10 +2929,6 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
dst_line += dst_stride;
mask_line += mask_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w && (unsigned long)pd & 15)
{
m = *pm++;
@@ -3340,16 +2949,8 @@ sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)pd);
- cache_prefetch ((__m128i*)pm);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)pd);
- cache_prefetch_next ((__m128i*)pm);
-
xmm_mask = load_128_unaligned ((__m128i*)pm);
pack_cmp =
@@ -3447,10 +3048,6 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = *src++;
@@ -3467,16 +3064,8 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3556,25 +3145,16 @@ sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
*dst++ = *src++ | 0xff000000;
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
-
while (w >= 16)
{
__m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
-
xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
@@ -3646,10 +3226,6 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w && (unsigned long)dst & 15)
{
uint32_t s = (*src++) | 0xff000000;
@@ -3666,16 +3242,8 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)src);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)src);
-
xmm_src = _mm_or_si128 (
load_128_unaligned ((__m128i*)src), mask_ff000000);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -3815,10 +3383,6 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -3834,17 +3398,9 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
/* It's a 8 pixel loop */
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* I'm loading unaligned because I'm not sure
* about the address alignment.
*/
@@ -3954,10 +3510,6 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -3978,16 +3530,8 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4099,7 +3643,6 @@ pixman_fill_sse2 (uint32_t *bits,
return FALSE;
}
- cache_prefetch ((__m128i*)byte_line);
xmm_def = create_mask_2x32_128 (data, data);
while (height--)
@@ -4109,8 +3652,6 @@ pixman_fill_sse2 (uint32_t *bits,
byte_line += stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 1 && ((unsigned long)d & 1))
{
*(uint8_t *)d = data;
@@ -4133,12 +3674,8 @@ pixman_fill_sse2 (uint32_t *bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 128)
{
- cache_prefetch (((__m128i*)d) + 12);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4154,8 +3691,6 @@ pixman_fill_sse2 (uint32_t *bits,
if (w >= 64)
{
- cache_prefetch (((__m128i*)d) + 8);
-
save_128_aligned ((__m128i*)(d), xmm_def);
save_128_aligned ((__m128i*)(d + 16), xmm_def);
save_128_aligned ((__m128i*)(d + 32), xmm_def);
@@ -4165,8 +3700,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)d);
-
if (w >= 32)
{
save_128_aligned ((__m128i*)(d), xmm_def);
@@ -4184,8 +3717,6 @@ pixman_fill_sse2 (uint32_t *bits,
w -= 16;
}
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = data;
@@ -4265,10 +3796,6 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
uint8_t m = *mask++;
@@ -4288,16 +3815,8 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
m = *((uint32_t*)mask);
if (srca == 0xff && m == 0xffffffff)
@@ -4410,10 +3929,6 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
m = *mask++;
@@ -4434,16 +3949,8 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*) dst);
unpack_565_128_4x128 (xmm_dst,
&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
@@ -4570,10 +4077,6 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4587,16 +4090,8 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -4715,10 +4210,6 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && (unsigned long)dst & 15)
{
s = *src++;
@@ -4731,16 +4222,8 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src_hi = load_128_unaligned ((__m128i*)src);
opaque = is_opaque (xmm_src_hi);
@@ -4845,10 +4328,6 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask_line += mask_stride;
dst_line += dst_stride;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = *(uint32_t *) mask;
@@ -4870,16 +4349,8 @@ sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
mask++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 8)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
/* First round */
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5001,10 +4472,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5018,16 +4485,8 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5121,9 +4580,6 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
d = (uint32_t) *dst;
@@ -5135,14 +4591,8 @@ sse2_composite_in_n_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -5214,10 +4664,6 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
src_line += src_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
s = (uint32_t) *src++;
@@ -5229,16 +4675,8 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_src = load_128_unaligned ((__m128i*)src);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5321,10 +4759,6 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
mask_line += mask_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
m = (uint32_t) *mask++;
@@ -5338,16 +4772,8 @@ sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)mask);
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)mask);
- cache_prefetch_next ((__m128i*)dst);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
xmm_dst = load_128_aligned ((__m128i*)dst);
@@ -5440,9 +4866,6 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst_line += dst_stride;
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w && ((unsigned long)dst & 15))
{
*dst = (uint8_t)_mm_cvtsi64_si32 (
@@ -5454,14 +4877,8 @@ sse2_composite_add_n_8 (pixman_implementation_t *imp,
dst++;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
while (w >= 16)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)dst);
-
save_128_aligned (
(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
@@ -5519,10 +4936,6 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
dst = dst_line;
src = src_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
src_line += src_stride;
w = width;
@@ -5644,9 +5057,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
return FALSE;
}
- cache_prefetch ((__m128i*)src_bytes);
- cache_prefetch ((__m128i*)dst_bytes);
-
while (height--)
{
int w;
@@ -5656,9 +5066,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
dst_bytes += dst_stride;
w = byte_width;
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 2 && ((unsigned long)d & 3))
{
*(uint16_t *)d = *(uint16_t *)s;
@@ -5676,17 +5083,10 @@ pixman_blt_sse2 (uint32_t *src_bits,
d += 4;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 64)
{
__m128i xmm0, xmm1, xmm2, xmm3;
- /* 128 bytes ahead */
- cache_prefetch (((__m128i*)s) + 8);
- cache_prefetch (((__m128i*)d) + 8);
-
xmm0 = load_128_unaligned ((__m128i*)(s));
xmm1 = load_128_unaligned ((__m128i*)(s + 16));
xmm2 = load_128_unaligned ((__m128i*)(s + 32));
@@ -5702,9 +5102,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
w -= 64;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 16)
{
save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
@@ -5714,9 +5111,6 @@ pixman_blt_sse2 (uint32_t *src_bits,
s += 16;
}
- cache_prefetch_next ((__m128i*)s);
- cache_prefetch_next ((__m128i*)d);
-
while (w >= 4)
{
*(uint32_t *)d = *(uint32_t *)s;
@@ -5809,11 +5203,6 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w && (unsigned long)dst & 15)
{
s = 0xff000000 | *src++;
@@ -5833,18 +5222,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)src);
- cache_prefetch ((__m128i*)dst);
- cache_prefetch ((__m128i*)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)src);
- cache_prefetch_next ((__m128i*)dst);
- cache_prefetch_next ((__m128i*)mask);
-
m = *(uint32_t*) mask;
xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
@@ -5955,11 +5334,6 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -5994,18 +5368,8 @@ sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
m = *(uint32_t *) mask;
if (m)
@@ -6117,9 +5481,6 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
{
dst = dst_line;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i*)dst);
-
dst_line += dst_stride;
w = width;
@@ -6135,15 +5496,10 @@ sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
dst++;
}
- cache_prefetch ((__m128i*)dst);
-
while (w >= 4)
{
__m128i tmp_lo, tmp_hi;
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i*)(dst + 4));
-
xmm_dst = load_128_aligned ((__m128i*)dst);
unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
@@ -6224,11 +5580,6 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w = width;
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w && (unsigned long)dst & 15)
{
uint32_t sa;
@@ -6263,18 +5614,8 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
w--;
}
- /* call prefetch hint to optimize cache load*/
- cache_prefetch ((__m128i *)src);
- cache_prefetch ((__m128i *)dst);
- cache_prefetch ((__m128i *)mask);
-
while (w >= 4)
{
- /* fill cache line with next memory */
- cache_prefetch_next ((__m128i *)src);
- cache_prefetch_next ((__m128i *)dst);
- cache_prefetch_next ((__m128i *)mask);
-
xmm_mask = load_128_unaligned ((__m128i*)mask);
if (!is_transparent (xmm_mask))