summary | refs | log | tree | commit | diff
path: root/liboil
diff options
context:
space:
mode:
Diffstat (limited to 'liboil')
-rw-r--r-- liboil/colorspace/argb_paint.c | 20
-rw-r--r-- liboil/colorspace/argb_paint_i386.c | 16
-rw-r--r-- liboil/conv/conv_3dnow.c | 43
-rw-r--r-- liboil/conv/conv_bitstuff.c | 21
-rw-r--r-- liboil/conv/conv_misc.c | 40
-rw-r--r-- liboil/simdpack/diffsquaresum_f64.c | 25
6 files changed, 118 insertions, 47 deletions
diff --git a/liboil/colorspace/argb_paint.c b/liboil/colorspace/argb_paint.c
index 6c7598a..7bbb168 100644
--- a/liboil/colorspace/argb_paint.c
+++ b/liboil/colorspace/argb_paint.c
@@ -35,8 +35,8 @@
OIL_DEFINE_CLASS (argb_paint_u8, "uint8_t *i_4xn, uint8_t *s1_4, uint8_t *s2_n, int n");
-#define imult(a,b) (((a)*(b) + (((a)*(b)) >> 8))>>8)
-#define apply(a,b,c) (imult(a,255-c) + imult(b,c))
+#define div255(x) (((x + 128) + ((x + 128)>>8))>>8)
+#define blend(x,y,a) div255((x)*(a) + (y)*(255-(a)))
static void
argb_paint_u8_ref (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
@@ -44,10 +44,10 @@ argb_paint_u8_ref (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
int i;
for(i=0;i<n;i++){
- dest[0] = apply(dest[0],color[0],alpha[0]);
- dest[1] = apply(dest[1],color[1],alpha[0]);
- dest[2] = apply(dest[2],color[2],alpha[0]);
- dest[3] = apply(dest[3],color[3],alpha[0]);
+ dest[0] = blend(color[0],dest[0],alpha[0]);
+ dest[1] = blend(color[1],dest[1],alpha[0]);
+ dest[2] = blend(color[2],dest[2],alpha[0]);
+ dest[3] = blend(color[3],dest[3],alpha[0]);
dest+=4;
alpha++;
}
@@ -68,10 +68,10 @@ argb_paint_u8_fast (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
dest[2] = color[2];
dest[3] = color[3];
} else {
- dest[0] = apply(dest[0],color[0],alpha[0]);
- dest[1] = apply(dest[1],color[1],alpha[0]);
- dest[2] = apply(dest[2],color[2],alpha[0]);
- dest[3] = apply(dest[3],color[3],alpha[0]);
+ dest[0] = blend(color[0],dest[0],alpha[0]);
+ dest[1] = blend(color[1],dest[1],alpha[0]);
+ dest[2] = blend(color[2],dest[2],alpha[0]);
+ dest[3] = blend(color[3],dest[3],alpha[0]);
}
dest+=4;
alpha++;
diff --git a/liboil/colorspace/argb_paint_i386.c b/liboil/colorspace/argb_paint_i386.c
index f6785a4..3753589 100644
--- a/liboil/colorspace/argb_paint_i386.c
+++ b/liboil/colorspace/argb_paint_i386.c
@@ -35,11 +35,12 @@
OIL_DECLARE_CLASS (argb_paint_u8);
-#define imult(a,b) (((a)*(b) + (((a)*(b)) >> 8))>>8)
-#define apply(a,b,c) (imult(a,255-c) + imult(b,c))
+#define div255(x) (((x + 128) + ((x + 128)>>8))>>8)
+#define blend(x,y,a) div255((x)*(a) + (y)*(255-(a)))
static short constants[][4] = {
- { 255, 255, 255, 255 }
+ { 255, 255, 255, 255 },
+ { 128, 128, 128, 128 }
};
static void
@@ -55,10 +56,10 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
" movq (%0), %%mm1\n"
" punpcklbw %%mm0, %%mm1\n"
" movb (%2), %%al\n"
- " je 3f\n"
+ " je 4f\n"
" cmpl $255, %1\n"
" jne 2f\n"
- " movd %%mm3, (%0)\n"
+ " movq %%mm3, %%mm2\n"
" jmp 3f\n"
"2:\n"
" movd %1, %%mm2\n"
@@ -68,13 +69,15 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
" pmullw %%mm1, %%mm4\n"
" pmullw %%mm3, %%mm2\n"
" paddw %%mm4, %%mm2\n"
+ " paddw 8(%4), %%mm2\n"
" movq %%mm2, %%mm1\n"
" psrlw $8, %%mm1\n"
" paddw %%mm1, %%mm2\n"
" psrlw $8, %%mm2\n"
+ "3: \n"
" packuswb %%mm0, %%mm2\n"
" movd %%mm2, (%0)\n"
- "3:\n"
+ "4:\n"
" add $4, %0\n"
" add $1, %2\n"
" decl %3\n"
@@ -85,3 +88,4 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n)
}
OIL_DEFINE_IMPL_FULL (argb_paint_u8_mmx, argb_paint_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_SSE);
+
diff --git a/liboil/conv/conv_3dnow.c b/liboil/conv/conv_3dnow.c
index 54da52f..af7df49 100644
--- a/liboil/conv/conv_3dnow.c
+++ b/liboil/conv/conv_3dnow.c
@@ -34,45 +34,52 @@
#ifdef __GNUC__
/* suboptimal */
-static void conv_f32_s32_3dnow(float *dst, int dst_stride, int32_t *src, int src_stride, int n)
+static void
+conv_f32_s16_3dnow(float *dst, int dst_stride, int16_t *src, int src_stride,
+ int n)
{
int i;
- if (n & 1)
- *dst++ = (float) *src++;
- n /= 2;
-
for(i=0;i<n;i++){
asm volatile(
- " pi2fd 0(%0), %%mm0 \n"
- " movq %%mm0, 0(%1) \n"
+ " xor %%eax, %%eax \n"
+ " movw 0(%0), %%eax \n"
+ " movd %%eax, %%mm0 \n"
+ " pi2fd 0(%0), %%mm0 \n"
+ " movd %%mm0, 0(%1) \n"
:
- : "a" (src), "c" (dst)
- : "mm0"
+ : "r" (src), "r" (dst)
+ : "eax", "mm0"
);
dst = OIL_OFFSET(dst, dst_stride);
src = OIL_OFFSET(src, src_stride);
}
asm volatile ("emms");
}
-OIL_DEFINE_IMPL_FULL(conv_f32_s32_3dnow, conv_f32_s32,
+OIL_DEFINE_IMPL_FULL(conv_f32_s16_3dnow, conv_f32_s16,
OIL_IMPL_FLAG_3DNOW);
/* suboptimal */
-static void conv_s32_f32_3dnow (int32_t *dst, int dst_stride, float *src, int src_stride, int n)
+static void
+conv_s32_f32_3dnow (int32_t *dst, int dst_stride, float *src, int src_stride,
+ int n)
{
int i;
-
- if (n & 1)
- *src++ = (int32_t) *dst++;
- n /= 2;
+ const float constants[][2] = {
+ { -0.5, -0.5 },
+ { -1.0, -1.0 }
+ };
for(i=0;i<n;i++){
asm volatile(
- " pf2id 0(%0), %%mm0 \n"
- " movq %%mm0, 0(%1) \n"
+ " movq 0(%0), %%mm0 \n"
+ " pfadd 0(%2), %%mm0 \n"
+ " pf2id %%mm0, %%mm1 \n"
+ " pfcmpgt 0(%2), %%mm0 \n"
+ " paddd %%mm0, %%mm1 \n"
+ " movd %%mm1, 0(%1) \n"
:
- : "a" (src), "c" (dst)
+ : "r" (src), "r" (dst), "r" (constants)
: "mm0"
);
dst = OIL_OFFSET(dst, dst_stride);
diff --git a/liboil/conv/conv_bitstuff.c b/liboil/conv/conv_bitstuff.c
index 82c7b9d..a535bfe 100644
--- a/liboil/conv/conv_bitstuff.c
+++ b/liboil/conv/conv_bitstuff.c
@@ -29,6 +29,7 @@
#include "config.h"
#endif
#include <liboil/liboilfunction.h>
+#include <conv.h>
#ifdef HAVE_IEEE754_H
@@ -49,7 +50,7 @@ static void conv_f32_u8_bitstuff(float *dst, int dest_stride, uint8_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f32_u8_bitstuff, conv_f32_u8_class);
+OIL_DEFINE_IMPL(conv_f32_u8_bitstuff, conv_f32_u8);
static void conv_f32_s8_bitstuff(float *dst, int dest_stride, int8_t *src,
int src_stride, int n)
@@ -66,7 +67,7 @@ static void conv_f32_s8_bitstuff(float *dst, int dest_stride, int8_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f32_s8_bitstuff, conv_f32_s8_class);
+OIL_DEFINE_IMPL(conv_f32_s8_bitstuff, conv_f32_s8);
static void conv_f32_u16_bitstuff(float *dst, int dest_stride, uint16_t *src,
int src_stride, int n)
@@ -83,7 +84,7 @@ static void conv_f32_u16_bitstuff(float *dst, int dest_stride, uint16_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f32_u16_bitstuff, conv_f32_u16_class);
+OIL_DEFINE_IMPL(conv_f32_u16_bitstuff, conv_f32_u16);
static void conv_f32_s16_bitstuff(float *dst, int dest_stride, int16_t *src,
int src_stride, int n)
@@ -100,7 +101,7 @@ static void conv_f32_s16_bitstuff(float *dst, int dest_stride, int16_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f32_s16_bitstuff, conv_f32_s16_class);
+OIL_DEFINE_IMPL(conv_f32_s16_bitstuff, conv_f32_s16);
#define signbit_S32(x) (((uint32_t)(x))>>31)
@@ -124,7 +125,7 @@ static void conv_s16_f32_bitstuff(int16_t *dst, int dest_stride, float *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_s16_f32_bitstuff, conv_s16_f32_class);
+OIL_DEFINE_IMPL(conv_s16_f32_bitstuff, conv_s16_f32);
#if 0
@@ -143,7 +144,7 @@ static void conv_f64_u8_bitstuff(float *dst, int dest_stride, uint8_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f64_u8_bitstuff, conv_f64_u8_class);
+OIL_DEFINE_IMPL(conv_f64_u8_bitstuff, conv_f64_u8);
static void conv_f64_s8_bitstuff(float *dst, int dest_stride, int8_t *src,
int src_stride, int n)
@@ -160,7 +161,7 @@ static void conv_f64_s8_bitstuff(float *dst, int dest_stride, int8_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f64_s8_bitstuff, conv_f64_s8_class);
+OIL_DEFINE_IMPL(conv_f64_s8_bitstuff, conv_f64_s8);
static void conv_f64_u16_bitstuff(float *dst, int dest_stride, uint16_t *src,
int src_stride, int n)
@@ -177,7 +178,7 @@ static void conv_f64_u16_bitstuff(float *dst, int dest_stride, uint16_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f64_u16_bitstuff, conv_f64_u16_class);
+OIL_DEFINE_IMPL(conv_f64_u16_bitstuff, conv_f64_u16);
static void conv_f64_s16_bitstuff(float *dst, int dest_stride, int16_t *src,
int src_stride, int n)
@@ -194,7 +195,7 @@ static void conv_f64_s16_bitstuff(float *dst, int dest_stride, int16_t *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f64_s16_bitstuff, conv_f64_s16_class);
+OIL_DEFINE_IMPL(conv_f64_s16_bitstuff, conv_f64_s16);
#endif
/* This implementation is slightly inaccurate */
@@ -225,7 +226,7 @@ static void conv_s16_f64_bitstuff(int16_t *dst, int dest_stride, float *src,
OIL_INCREMENT (src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_s16_f64_bitstuff, conv_s16_f64_class);
+OIL_DEFINE_IMPL(conv_s16_f64_bitstuff, conv_s16_f64);
#endif
diff --git a/liboil/conv/conv_misc.c b/liboil/conv/conv_misc.c
index 73898be..b32250e 100644
--- a/liboil/conv/conv_misc.c
+++ b/liboil/conv/conv_misc.c
@@ -38,6 +38,43 @@ static void
conv_f64_s16_table(double *dest, int dest_stride, short *src,
int src_stride, int n)
{
+ static double ints_high[256];
+ static double ints_low[256];
+ static int init = 0;
+ int i;
+ unsigned int idx;
+ if(!init){
+ for(i=0;i<256;i++){
+ ints_high[i]=256.0*((i<128)?i:i-256);
+ ints_low[i]=i;
+ }
+ init = 1;
+ }
+
+ if(n&1){
+ idx = (unsigned short)*src;
+ *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)];
+ OIL_INCREMENT(dest, dest_stride);
+ OIL_INCREMENT(src, src_stride);
+ n-=1;
+ }
+ for(i=0;i<n;i+=2){
+ idx = (unsigned short)*src;
+ *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)];
+ OIL_INCREMENT(dest, dest_stride);
+ OIL_INCREMENT(src, src_stride);
+ idx = (unsigned short)*src;
+ *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)];
+ OIL_INCREMENT(dest, dest_stride);
+ OIL_INCREMENT(src, src_stride);
+ }
+}
+OIL_DEFINE_IMPL(conv_f64_s16_table, conv_f64_s16);
+
+static void
+conv_f32_s16_table(float *dest, int dest_stride, short *src,
+ int src_stride, int n)
+{
static float ints_high[256];
static float ints_low[256];
static int init = 0;
@@ -69,6 +106,7 @@ conv_f64_s16_table(double *dest, int dest_stride, short *src,
OIL_INCREMENT(src, src_stride);
}
}
-OIL_DEFINE_IMPL(conv_f64_s16_table, conv_f64_s16);
+OIL_DEFINE_IMPL(conv_f32_s16_table, conv_f32_s16);
+
diff --git a/liboil/simdpack/diffsquaresum_f64.c b/liboil/simdpack/diffsquaresum_f64.c
index 633f90a..f503ccf 100644
--- a/liboil/simdpack/diffsquaresum_f64.c
+++ b/liboil/simdpack/diffsquaresum_f64.c
@@ -47,7 +47,8 @@ diffsquaresum_f64_ref(double *dest, double *src1, int sstr1, double *src2,
int i;
for(i=0;i<n;i++){
- x = OIL_OFFSET(src1, i*sstr1) - OIL_OFFSET(src2, i*sstr2);
+ x = OIL_GET(src1, i*sstr1, double) -
+ OIL_GET(src2, i*sstr2, double);
x = x*x;
tmp = sum;
sum += x;
@@ -68,7 +69,8 @@ diffsquaresum_f64_i10_simple(double *dest, double *src1, int sstr1, double *src2
int i;
for(i=0;i<n;i++){
- x = OIL_OFFSET(src1, i*sstr1) - OIL_OFFSET(src2, i*sstr2);
+ x = OIL_GET(src1, i*sstr1, double) -
+ OIL_GET(src2, i*sstr2, double);
x = x*x;
sum += x;
}
@@ -78,6 +80,25 @@ diffsquaresum_f64_i10_simple(double *dest, double *src1, int sstr1, double *src2
OIL_DEFINE_IMPL (diffsquaresum_f64_i10_simple, diffsquaresum_f64);
static void
+diffsquaresum_f64_i10_fast(double *dest, double *src1, int sstr1, double *src2,
+ int sstr2, int n)
+{
+ double sum0 = 0;
+ double x;
+
+ while(n>0){
+ x = *src1 - *src2;
+ sum0 += x * x;
+ OIL_INCREMENT (src1, sstr1);
+ OIL_INCREMENT (src2, sstr2);
+ n--;
+ }
+
+ *dest = sum0;
+}
+OIL_DEFINE_IMPL (diffsquaresum_f64_i10_fast, diffsquaresum_f64);
+
+static void
diffsquaresum_f64_i10_unroll2(double *dest, double *src1, int sstr1, double *src2,
int sstr2, int n)
{