diff options
Diffstat (limited to 'liboil')
-rw-r--r-- | liboil/colorspace/argb_paint.c | 20 | ||||
-rw-r--r-- | liboil/colorspace/argb_paint_i386.c | 16 | ||||
-rw-r--r-- | liboil/conv/conv_3dnow.c | 43 | ||||
-rw-r--r-- | liboil/conv/conv_bitstuff.c | 21 | ||||
-rw-r--r-- | liboil/conv/conv_misc.c | 40 | ||||
-rw-r--r-- | liboil/simdpack/diffsquaresum_f64.c | 25 |
6 files changed, 118 insertions, 47 deletions
diff --git a/liboil/colorspace/argb_paint.c b/liboil/colorspace/argb_paint.c index 6c7598a..7bbb168 100644 --- a/liboil/colorspace/argb_paint.c +++ b/liboil/colorspace/argb_paint.c @@ -35,8 +35,8 @@ OIL_DEFINE_CLASS (argb_paint_u8, "uint8_t *i_4xn, uint8_t *s1_4, uint8_t *s2_n, int n"); -#define imult(a,b) (((a)*(b) + (((a)*(b)) >> 8))>>8) -#define apply(a,b,c) (imult(a,255-c) + imult(b,c)) +#define div255(x) (((x + 128) + ((x + 128)>>8))>>8) +#define blend(x,y,a) div255((x)*(a) + (y)*(255-(a))) static void argb_paint_u8_ref (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) @@ -44,10 +44,10 @@ argb_paint_u8_ref (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) int i; for(i=0;i<n;i++){ - dest[0] = apply(dest[0],color[0],alpha[0]); - dest[1] = apply(dest[1],color[1],alpha[0]); - dest[2] = apply(dest[2],color[2],alpha[0]); - dest[3] = apply(dest[3],color[3],alpha[0]); + dest[0] = blend(color[0],dest[0],alpha[0]); + dest[1] = blend(color[1],dest[1],alpha[0]); + dest[2] = blend(color[2],dest[2],alpha[0]); + dest[3] = blend(color[3],dest[3],alpha[0]); dest+=4; alpha++; } @@ -68,10 +68,10 @@ argb_paint_u8_fast (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) dest[2] = color[2]; dest[3] = color[3]; } else { - dest[0] = apply(dest[0],color[0],alpha[0]); - dest[1] = apply(dest[1],color[1],alpha[0]); - dest[2] = apply(dest[2],color[2],alpha[0]); - dest[3] = apply(dest[3],color[3],alpha[0]); + dest[0] = blend(color[0],dest[0],alpha[0]); + dest[1] = blend(color[1],dest[1],alpha[0]); + dest[2] = blend(color[2],dest[2],alpha[0]); + dest[3] = blend(color[3],dest[3],alpha[0]); } dest+=4; alpha++; diff --git a/liboil/colorspace/argb_paint_i386.c b/liboil/colorspace/argb_paint_i386.c index f6785a4..3753589 100644 --- a/liboil/colorspace/argb_paint_i386.c +++ b/liboil/colorspace/argb_paint_i386.c @@ -35,11 +35,12 @@ OIL_DECLARE_CLASS (argb_paint_u8); -#define imult(a,b) (((a)*(b) + (((a)*(b)) >> 8))>>8) -#define apply(a,b,c) (imult(a,255-c) + imult(b,c)) +#define div255(x) (((x + 128) + ((x + 128)>>8))>>8) +#define blend(x,y,a) div255((x)*(a) + (y)*(255-(a))) static short constants[][4] = { - { 255, 255, 255, 255 } + { 255, 255, 255, 255 }, + { 128, 128, 128, 128 } }; static void @@ -55,10 +56,10 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) " movq (%0), %%mm1\n" " punpcklbw %%mm0, %%mm1\n" " movb (%2), %%al\n" - " je 3f\n" + " je 4f\n" " cmpl $255, %1\n" " jne 2f\n" - " movd %%mm3, (%0)\n" + " movq %%mm3, %%mm2\n" " jmp 3f\n" "2:\n" " movd %1, %%mm2\n" @@ -68,13 +69,15 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) " pmullw %%mm1, %%mm4\n" " pmullw %%mm3, %%mm2\n" " paddw %%mm4, %%mm2\n" + " paddw 8(%4), %%mm2\n" " movq %%mm2, %%mm1\n" " psrlw $8, %%mm1\n" " paddw %%mm1, %%mm2\n" " psrlw $8, %%mm2\n" + "3: \n" " packuswb %%mm0, %%mm2\n" " movd %%mm2, (%0)\n" - "3:\n" + "4:\n" " add $4, %0\n" " add $1, %2\n" " decl %3\n" @@ -85,3 +88,4 @@ argb_paint_u8_mmx (uint8_t *dest, uint8_t *color, uint8_t *alpha, int n) } OIL_DEFINE_IMPL_FULL (argb_paint_u8_mmx, argb_paint_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_SSE); + diff --git a/liboil/conv/conv_3dnow.c b/liboil/conv/conv_3dnow.c index 54da52f..af7df49 100644 --- a/liboil/conv/conv_3dnow.c +++ b/liboil/conv/conv_3dnow.c @@ -34,45 +34,52 @@ #ifdef __GNUC__ /* suboptimal */ -static void conv_f32_s32_3dnow(float *dst, int dst_stride, int32_t *src, int src_stride, int n) +static void +conv_f32_s16_3dnow(float *dst, int dst_stride, int16_t *src, int src_stride, + int n) { int i; - if (n & 1) - *dst++ = (float) *src++; - n /= 2; - for(i=0;i<n;i++){ asm volatile( - " pi2fd 0(%0), %%mm0 \n" - " movq %%mm0, 0(%1) \n" + " xor %%eax, %%eax \n" + " movw 0(%0), %%eax \n" + " movd %%eax, %%mm0 \n" + " pi2fd 0(%0), %%mm0 \n" + " movd %%mm0, 0(%1) \n" : - : "a" (src), "c" (dst) - : "mm0" + : "r" (src), "r" (dst) + : "eax", "mm0" ); dst = OIL_OFFSET(dst, dst_stride); src = OIL_OFFSET(src, src_stride); } asm volatile ("emms"); } -OIL_DEFINE_IMPL_FULL(conv_f32_s32_3dnow, conv_f32_s32, +OIL_DEFINE_IMPL_FULL(conv_f32_s16_3dnow, conv_f32_s16, OIL_IMPL_FLAG_3DNOW); /* suboptimal */ -static void conv_s32_f32_3dnow (int32_t *dst, int dst_stride, float *src, int src_stride, int n) +static void +conv_s32_f32_3dnow (int32_t *dst, int dst_stride, float *src, int src_stride, + int n) { int i; - - if (n & 1) - *src++ = (int32_t) *dst++; - n /= 2; + const float constants[][2] = { + { -0.5, -0.5 }, + { -1.0, -1.0 } + }; for(i=0;i<n;i++){ asm volatile( - " pf2id 0(%0), %%mm0 \n" - " movq %%mm0, 0(%1) \n" + " movq 0(%0), %%mm0 \n" + " pfadd 0(%2), %%mm0 \n" + " pf2id %%mm0, %%mm1 \n" + " pfcmpgt 0(%2), %%mm0 \n" + " paddd %%mm0, %%mm1 \n" + " movd %%mm1, 0(%1) \n" : - : "a" (src), "c" (dst) + : "r" (src), "r" (dst), "r" (constants) : "mm0" ); dst = OIL_OFFSET(dst, dst_stride); diff --git a/liboil/conv/conv_bitstuff.c b/liboil/conv/conv_bitstuff.c index 82c7b9d..a535bfe 100644 --- a/liboil/conv/conv_bitstuff.c +++ b/liboil/conv/conv_bitstuff.c @@ -29,6 +29,7 @@ #include "config.h" #endif #include <liboil/liboilfunction.h> +#include <conv.h> #ifdef HAVE_IEEE754_H @@ -49,7 +50,7 @@ static void conv_f32_u8_bitstuff(float *dst, int dest_stride, uint8_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f32_u8_bitstuff, conv_f32_u8_class); +OIL_DEFINE_IMPL(conv_f32_u8_bitstuff, conv_f32_u8); static void conv_f32_s8_bitstuff(float *dst, int dest_stride, int8_t *src, int src_stride, int n) @@ -66,7 +67,7 @@ static void conv_f32_s8_bitstuff(float *dst, int dest_stride, int8_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f32_s8_bitstuff, conv_f32_s8_class); +OIL_DEFINE_IMPL(conv_f32_s8_bitstuff, conv_f32_s8); static void conv_f32_u16_bitstuff(float *dst, int dest_stride, uint16_t *src, int src_stride, int n) @@ -83,7 +84,7 @@ static void conv_f32_u16_bitstuff(float *dst, int dest_stride, uint16_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f32_u16_bitstuff, conv_f32_u16_class); +OIL_DEFINE_IMPL(conv_f32_u16_bitstuff, conv_f32_u16); static void conv_f32_s16_bitstuff(float *dst, int dest_stride, int16_t *src, int src_stride, int n) @@ -100,7 +101,7 @@ static void conv_f32_s16_bitstuff(float *dst, int dest_stride, int16_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f32_s16_bitstuff, conv_f32_s16_class); +OIL_DEFINE_IMPL(conv_f32_s16_bitstuff, conv_f32_s16); #define signbit_S32(x) (((uint32_t)(x))>>31) @@ -124,7 +125,7 @@ static void conv_s16_f32_bitstuff(int16_t *dst, int dest_stride, float *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_s16_f32_bitstuff, conv_s16_f32_class); +OIL_DEFINE_IMPL(conv_s16_f32_bitstuff, conv_s16_f32); #if 0 @@ -143,7 +144,7 @@ static void conv_f64_u8_bitstuff(float *dst, int dest_stride, uint8_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f64_u8_bitstuff, conv_f64_u8_class); +OIL_DEFINE_IMPL(conv_f64_u8_bitstuff, conv_f64_u8); static void conv_f64_s8_bitstuff(float *dst, int dest_stride, int8_t *src, int src_stride, int n) @@ -160,7 +161,7 @@ static void conv_f64_s8_bitstuff(float *dst, int dest_stride, int8_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f64_s8_bitstuff, conv_f64_s8_class); +OIL_DEFINE_IMPL(conv_f64_s8_bitstuff, conv_f64_s8); static void conv_f64_u16_bitstuff(float *dst, int dest_stride, uint16_t *src, int src_stride, int n) @@ -177,7 +178,7 @@ static void conv_f64_u16_bitstuff(float *dst, int dest_stride, uint16_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f64_u16_bitstuff, conv_f64_u16_class); +OIL_DEFINE_IMPL(conv_f64_u16_bitstuff, conv_f64_u16); static void conv_f64_s16_bitstuff(float *dst, int dest_stride, int16_t *src, int src_stride, int n) @@ -194,7 +195,7 @@ static void conv_f64_s16_bitstuff(float *dst, int dest_stride, int16_t *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f64_s16_bitstuff, conv_f64_s16_class); +OIL_DEFINE_IMPL(conv_f64_s16_bitstuff, conv_f64_s16); #endif /* This implementation is slightly inaccurate */ @@ -225,7 +226,7 @@ static void conv_s16_f64_bitstuff(int16_t *dst, int dest_stride, float *src, OIL_INCREMENT (src, src_stride); } } -OIL_DEFINE_IMPL(conv_s16_f64_bitstuff, conv_s16_f64_class); +OIL_DEFINE_IMPL(conv_s16_f64_bitstuff, conv_s16_f64); #endif diff --git a/liboil/conv/conv_misc.c b/liboil/conv/conv_misc.c index 73898be..b32250e 100644 --- a/liboil/conv/conv_misc.c +++ b/liboil/conv/conv_misc.c @@ -38,6 +38,43 @@ static void conv_f64_s16_table(double *dest, int dest_stride, short *src, int src_stride, int n) { + static double ints_high[256]; + static double ints_low[256]; + static int init = 0; + int i; + unsigned int idx; + if(!init){ + for(i=0;i<256;i++){ + ints_high[i]=256.0*((i<128)?i:i-256); + ints_low[i]=i; + } + init = 1; + } + + if(n&1){ + idx = (unsigned short)*src; + *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)]; + OIL_INCREMENT(dest, dest_stride); + OIL_INCREMENT(src, src_stride); + n-=1; + } + for(i=0;i<n;i+=2){ + idx = (unsigned short)*src; + *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)]; + OIL_INCREMENT(dest, dest_stride); + OIL_INCREMENT(src, src_stride); + idx = (unsigned short)*src; + *dest = ints_high[(idx>>8)] + ints_low[(idx&0xff)]; + OIL_INCREMENT(dest, dest_stride); + OIL_INCREMENT(src, src_stride); + } +} +OIL_DEFINE_IMPL(conv_f64_s16_table, conv_f64_s16); + +static void +conv_f32_s16_table(float *dest, int dest_stride, short *src, + int src_stride, int n) +{ static float ints_high[256]; static float ints_low[256]; static int init = 0; @@ -69,6 +106,7 @@ conv_f64_s16_table(double *dest, int dest_stride, short *src, OIL_INCREMENT(src, src_stride); } } -OIL_DEFINE_IMPL(conv_f64_s16_table, conv_f64_s16); +OIL_DEFINE_IMPL(conv_f32_s16_table, conv_f32_s16); + diff --git a/liboil/simdpack/diffsquaresum_f64.c b/liboil/simdpack/diffsquaresum_f64.c index 633f90a..f503ccf 100644 --- a/liboil/simdpack/diffsquaresum_f64.c +++ b/liboil/simdpack/diffsquaresum_f64.c @@ -47,7 +47,8 @@ diffsquaresum_f64_ref(double *dest, double *src1, int sstr1, double *src2, int i; for(i=0;i<n;i++){ - x = OIL_OFFSET(src1, i*sstr1) - OIL_OFFSET(src2, i*sstr2); + x = OIL_GET(src1, i*sstr1, double) - + OIL_GET(src2, i*sstr2, double); x = x*x; tmp = sum; sum += x; @@ -68,7 +69,8 @@ diffsquaresum_f64_i10_simple(double *dest, double *src1, int sstr1, double *src2 int i; for(i=0;i<n;i++){ - x = OIL_OFFSET(src1, i*sstr1) - OIL_OFFSET(src2, i*sstr2); + x = OIL_GET(src1, i*sstr1, double) - + OIL_GET(src2, i*sstr2, double); x = x*x; sum += x; } @@ -78,6 +80,25 @@ diffsquaresum_f64_i10_simple(double *dest, double *src1, int sstr1, double *src2 OIL_DEFINE_IMPL (diffsquaresum_f64_i10_simple, diffsquaresum_f64); static void +diffsquaresum_f64_i10_fast(double *dest, double *src1, int sstr1, double *src2, + int sstr2, int n) +{ + double sum0 = 0; + double x; + + while(n>0){ + x = *src1 - *src2; + sum0 += x * x; + OIL_INCREMENT (src1, sstr1); + OIL_INCREMENT (src2, sstr2); + n--; + } + + *dest = sum0; +} +OIL_DEFINE_IMPL (diffsquaresum_f64_i10_fast, diffsquaresum_f64); + +static void diffsquaresum_f64_i10_unroll2(double *dest, double *src1, int sstr1, double *src2, int sstr2, int n) { |