| author | David Schleef <ds@schleef.org> | 2005-08-15 21:33:39 +0000 |
|---|---|---|
| committer | David Schleef <ds@schleef.org> | 2005-08-15 21:33:39 +0000 |
| commit | ec572e49fb3423767ab7d562c5ef0aa2ad4ff38b (patch) | |
| tree | 2aa40503eca7623296b431e79d22e6e4720e2be8 | |
| parent | 1ef601312c0634d55f1a098be769eae436dd0b92 (diff) | |
| download | liboil-ec572e49fb3423767ab7d562c5ef0aa2ad4ff38b.tar.gz | |
* configure.ac: Add some AltiVec Theora code.
* liboil/Makefile.am:
* liboil/powerpc/Makefile.am:
* liboil/powerpc/fdct8x8theora_altivec.c: (fdct8x8theora_altivec):
* liboil/powerpc/recon8x8_altivec.c: (recon8x8_intra_altivec),
(recon8x8_inter_altivec), (recon8x8_inter2_altivec):
* liboil/powerpc/recon8x8_ppc.c: (recon8x8_intra_ppc),
(recon8x8_inter_ppc), (recon8x8_inter2_ppc):
* liboil/colorspace/composite.c: Fix bug in ADD operator.
* liboil/dct/fdct8x8theora_i386.c:
* liboil/simdpack/average2_u8.c: (average2_u8_trick),
(average2_u8_unroll4): Fix n%4!=0 problems noticed by thomasvs.
* liboil/simdpack/scalarmult_i386.c: (scalarmult_f32_sse): Fix
n%4!=0 problems (a sketch of this tail-loop pattern follows the log below).
* testsuite/stride.c: (main): Use a random n to test possible
endpoint problems.
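The n%4 fixes in this commit share one shape: run the unrolled (or SIMD) body only while at least four elements remain, then finish the leftover n % 4 elements with a plain scalar loop (average2_u8_trick peels them off before the main loop instead, which works equally well). Below is a minimal standalone sketch of that pattern in C; the function name scale_f32 and its signature are illustrative stand-ins, not liboil's API.

```c
#include <stddef.h>

/* Multiply n floats by k. The unrolled body only runs while at
 * least 4 elements remain; the scalar tail handles the n % 4
 * leftovers that an unguarded unrolled loop would skip or overrun. */
static void
scale_f32 (float *dest, const float *src, float k, size_t n)
{
  size_t i;

  /* unrolled-by-4 main loop */
  for (i = 0; i + 4 <= n; i += 4) {
    dest[i + 0] = k * src[i + 0];
    dest[i + 1] = k * src[i + 1];
    dest[i + 2] = k * src[i + 2];
    dest[i + 3] = k * src[i + 3];
  }
  /* scalar tail: the n % 4 remaining elements */
  for (; i < n; i++)
    dest[i] = k * src[i];
}
```

The testsuite/stride.c change guards exactly this class of bug: by picking a random n (and m) per test, the harness routinely hits sizes with n % 4 != 0 and catches endpoint problems like the ones fixed here.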
-rw-r--r-- | ChangeLog | 20
-rw-r--r-- | configure.ac | 1
-rw-r--r-- | liboil/Makefile.am | 3
-rw-r--r-- | liboil/colorspace/composite.c | 2
-rw-r--r-- | liboil/dct/fdct8x8theora_i386.c | 1
-rw-r--r-- | liboil/powerpc/Makefile.am | 18
-rw-r--r-- | liboil/powerpc/fdct8x8theora_altivec.c | 522
-rw-r--r-- | liboil/powerpc/recon8x8_altivec.c | 716
-rw-r--r-- | liboil/powerpc/recon8x8_ppc.c | 526
-rw-r--r-- | liboil/simdpack/average2_u8.c | 80
-rw-r--r-- | liboil/simdpack/scalarmult_i386.c | 6
-rw-r--r-- | testsuite/stride.c | 2
12 files changed, 1835 insertions(+), 62 deletions(-)
@@ -1,5 +1,25 @@ 2005-08-15 David Schleef <ds@schleef.org> + * configure.ac: Add some altivec theora code + * liboil/Makefile.am: + * liboil/powerpc/Makefile.am: + * liboil/powerpc/fdct8x8theora_altivec.c: (fdct8x8theora_altivec): + * liboil/powerpc/recon8x8_altivec.c: (recon8x8_intra_altivec), + (recon8x8_inter_altivec), (recon8x8_inter2_altivec): + * liboil/powerpc/recon8x8_ppc.c: (recon8x8_intra_ppc), + (recon8x8_inter_ppc), (recon8x8_inter2_ppc): + + * liboil/colorspace/composite.c: Fix bug in ADD operator. + * liboil/dct/fdct8x8theora_i386.c: + * liboil/simdpack/average2_u8.c: (average2_u8_trick), + (average2_u8_unroll4): Fix n%4!=0 problems noticed by thomasvs. + * liboil/simdpack/scalarmult_i386.c: (scalarmult_f32_sse): Fix + n%4!=0 problems. + * testsuite/stride.c: (main): use a random n to test possible + endpoint problems. + +2005-08-15 David Schleef <ds@schleef.org> + * liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuinfo): SSE2 implies MMXEXT in both codepaths. diff --git a/configure.ac b/configure.ac index c0e4a5e..9bb9331 100644 --- a/configure.ac +++ b/configure.ac @@ -184,6 +184,7 @@ liboil/jpeg/Makefile liboil/math/Makefile liboil/md5/Makefile liboil/motovec/Makefile +liboil/powerpc/Makefile liboil/ref/Makefile liboil/simdpack/Makefile liboil/sse/Makefile diff --git a/liboil/Makefile.am b/liboil/Makefile.am index b9bf45a..530ca28 100644 --- a/liboil/Makefile.am +++ b/liboil/Makefile.am @@ -1,7 +1,7 @@ pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil -SUBDIRS = colorspace conv copy dct fb i386 jpeg math md5 motovec ref simdpack sse utf8 +SUBDIRS = colorspace conv copy dct fb i386 jpeg math md5 motovec powerpc ref simdpack sse utf8 lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la @@ -33,6 +33,7 @@ liboilfunctions_la_LIBADD = \ math/libmath.la \ md5/libmd5.la \ motovec/libmotovec.la \ + powerpc/libpowerpc.la \ ref/libref.la \ simdpack/libsimdpack.la \ sse/libsse.la \ diff --git a/liboil/colorspace/composite.c b/liboil/colorspace/composite.c index 6d9f4ea..5fbbae7 100644 --- a/liboil/colorspace/composite.c +++ b/liboil/colorspace/composite.c @@ -36,7 +36,7 @@ #include <liboil/liboildebug.h> #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) -#define COMPOSITE_ADD(d,s) ((d) + (s)) +#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m)) static void diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c index 6126adb..7d8bce3 100644 --- a/liboil/dct/fdct8x8theora_i386.c +++ b/liboil/dct/fdct8x8theora_i386.c @@ -45,6 +45,7 @@ #include <liboil/dct/dct.h> #include <math.h> +/* FIXME this causes problems on old gcc */ static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; diff --git a/liboil/powerpc/Makefile.am b/liboil/powerpc/Makefile.am new file mode 100644 index 0000000..f49fcc5 --- /dev/null +++ b/liboil/powerpc/Makefile.am @@ -0,0 +1,18 @@ + +noinst_LTLIBRARIES = libpowerpc.la + +sources = \ + recon8x8_ppc.c \ + fdct8x8theora_altivec.c \ + recon8x8_altivec.c + +if HAVE_CPU_POWERPC +powerpc_sources = $(sources) +else +powerpc_sources = +endif + +libpowerpc_la_SOURCES = \ + $(powerpc_sources) +libpowerpc_la_CFLAGS = $(LIBOIL_CFLAGS) -fasm-blocks + diff --git a/liboil/powerpc/fdct8x8theora_altivec.c b/liboil/powerpc/fdct8x8theora_altivec.c new file mode 100644 index 
0000000..0bb2467 --- /dev/null +++ b/liboil/powerpc/fdct8x8theora_altivec.c @@ -0,0 +1,522 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: fdct8x8theora_altivec.c,v 1.1 2005-08-15 21:33:39 ds Exp $ + + ********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/liboilfuncs.h> +#include <liboil/dct/dct.h> +#include <math.h> + +extern vector signed short idctConst; +extern vector unsigned char vPerm1; +extern vector unsigned char vPerm2; +extern vector unsigned char vPerm3; + +OIL_DECLARE_CLASS(fdct8x8theora); + +static void +fdct8x8theora_altivec(int16_t *src, int16_t *dest) +{ + (void) src; + (void) dest; + + asm + { + lwz r10,vPerm1 + xor r7,r7,r7 + + lwz r8,vPerm2 + + lwz r9,vPerm3 + + lvx v30,r10,r7 + + lvx v31,r8,r7 + + lvx v29,r9,r7 + + + lwz r9,idctConst + xor r7,r7,r7 + xor r8,r8,r8 + +//trying cache hints +// lis r8,0x1001 //Block Size = 16, Block Count = 1, Block Stride = 0 +// dstst r5,r8,0 +// dst r4,r8,1 +// dst r3,r8,2 + + lvx v8,r9,r7 + xor r8,r8,r8 + + lvx v10,r3,r8 //row 0 + vsplth v0,v8,0 + addi r8,r8,16 + + lvx v11,r3,r8 //row 1 + vsplth v1,v8,1 + addi r8,r8,16 + + lvx v12,r3,r8 //row 2 + vsplth v2,v8,2 + addi r8,r8,16 + + lvx v13,r3,r8 //row 3 + vsplth v3,v8,3 + addi r8,r8,16 + + lvx v14,r3,r8 //row 4 + vsplth v4,v8,4 + addi r8,r8,16 + + lvx v15,r3,r8 //row 5 + vsplth v5,v8,5 + addi r8,r8,16 + + lvx v16,r3,r8 //row 6 + vsplth v6,v8,6 + addi r8,r8,16 + + lvx v17,r3,r8 //row 7 + vsplth v7,v8,7 + + // on entry + //00 01 02 03 04 05 06 07 + //10 11 12 13 14 15 16 17 + //20 21 22 23 24 25 26 27 + //30 31 32 33 34 35 36 37 + //40 41 42 43 44 45 46 47 + //50 51 52 53 54 55 56 57 + //60 61 62 63 64 65 66 67 + //70 71 72 73 74 75 76 77 +//start of transpose + vmrghh v18,v10,v11 + vmrglh v19,v10,v11 + vmrghh v20,v12,v13 + vmrglh v21,v12,v13 + vmrghh v22,v14,v15 + vmrglh v23,v14,v15 + vmrghh v24,v16,v17 + vmrglh v25,v16,v17 + + vmrghw v8,v18,v20 + vmrghw v9,v22,v24 + vmrghw v26,v19,v21 + vmrghw v27,v23,v25 + vmrglw v18,v18,v20 + vmrglw v22,v22,v24 + vmrglw v19,v19,v21 + vmrglw v23,v23,v25 + + vperm v10,v8,v9,v30 //00 10 20 30 40 50 60 70 + vperm v11,v8,v9,v31 //01 11 21 31 41 51 61 71 + vperm v12,v18,v22,v30 //02 12 22 32 42 52 62 72 + vperm v13,v18,v22,v31 //03 13 23 33 43 53 63 73 + vperm v20,v26,v27,v30 //04 14 24 34 44 54 64 74 + vperm v21,v26,v27,v31 //05 15 25 35 45 55 65 75 + vperm v22,v19,v23,v30 //06 16 26 36 46 56 66 76 + vperm v23,v19,v23,v31 //07 17 27 37 47 57 67 77 +//end of transpose + +//~~~~~~~~~~ start cut here + vsubuhm v14,v10,v23 //id07 + vsubuhm v15,v11,v12 //id12 + vsubuhm v16,v13,v20 //id34 + vsubuhm v17,v21,v22 //id56 + + vadduhm v10,v10,v23 //is07 + vadduhm v11,v11,v12 //is12 + vadduhm v12,v13,v20 //is34 + vadduhm v13,v21,v22 //is56 + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// some precalulations + vspltish v28,15 + vadduhm v18,v10,v12 //is0734 + vadduhm v19,v11,v13 //is1256 + + vsubuhm v20,v11,v13 //(is12 - is56) + vmulesh v22,v20,v4 + vmulosh v23,v20,v4 + vperm v8,v22,v23,v29 //(c4s4 * (is12 - is56)) - (is12 - is56) + vadduhm v8,v8,v20 //c4s4 * (is12 - is56) + vsrh v20,v20,v28 //get sign bit + vadduhm v8,v8,v20 //add in sign bit aka icommon_product1 + + vadduhm v20,v15,v17 //(id12 + id56) + vmulesh v22,v20,v4 + vmulosh v23,v20,v4 + vperm v9,v22,v23,v29 //(c4s4 * (is12 + is56)) - (is12 + is56) + vadduhm v9,v9,v20 //c4s4 * (is12 + is56) + vsrh 
v20,v20,v28 //get sign bit + vadduhm v9,v9,v20 //add in sign bit aka icommon_product2 + + vsubuhm v20,v15,v17 //irot_input_x = id12 - id56 + vsubuhm v21,v10,v12 //irot_input_y = is07 - is34 + + vadduhm v22,v14,v8 //irot_input_x = icommon_product1 + id07 + vadduhm v23,v16,v9 //irot_input_y = icommon_product2 + id34 + vxor v24,v24,v24 + vsubuhm v23,v24,v23 //irot_input_y = -(icommon_product2 + id34) + + vsubuhm v24,v14,v8 //irot_input_x = id07 - icommon_product1 + vsubuhm v25,v16,v9 //irot_input_y = id34 - icommon_product2 + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[0], ip[4] + vadduhm v8,v18,v19 //(is0734 + is1256) + vsubuhm v9,v18,v19 //(is0734 - is1256) + + vmulesh v18,v8,v4 + vmulosh v19,v8,v4 + vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256) + vadduhm v18,v18,v8 //(c4s4 * (is0734 + is1256)) + vsrh v8,v8,v28 + vadduhm v10,v18,v8 //add in sign bit aka ip[0] + + vmulesh v18,v9,v4 + vmulosh v19,v9,v4 + vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256) + vadduhm v18,v18,v9 //(c4s4 * (is0734 + is1256)) + vsrh v9,v9,v28 + vadduhm v14,v18,v9 //add in sign bit aka ip[4] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[2], ip[6] + vmulesh v18,v20,v6 + vmulosh v19,v20,v6 + vperm v18,v18,v19,v29 //(c6s2 * (irot_input_x)) + vsrh v8,v20,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v21,v2 + vmulosh v19,v21,v2 + vperm v18,v18,v19,v29 //(c2s6 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v21 //(c2s6 * (irot_input_y)) + vsrh v9,v21,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v12,v8,v9 //ip[2] + + vmulesh v18,v21,v6 + vmulosh v19,v21,v6 + vperm v18,v18,v19,v29 //(c6s2 * (irot_input_y)) + vsrh v8,v21,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v20,v2 + vmulosh v19,v20,v2 + vperm v18,v18,v19,v29 //(c2s6 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v20 //(c2s6 * (irot_input_x)) + vsrh v9,v20,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v16,v8,v9 //ip[6] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[1], ip[7] + vmulesh v18,v22,v1 + vmulosh v19,v22,v1 + vperm v18,v18,v19,v29 //(c1s7 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v22 //(c1s7 * (irot_input_x)) + vsrh v8,v22,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v23,v7 + vmulosh v19,v23,v7 + vperm v18,v18,v19,v29 //(c7s1 * (irot_input_y)) + vsrh v9,v23,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v11,v8,v9 //ip[1] + + vmulesh v18,v22,v7 + vmulosh v19,v22,v7 + vperm v18,v18,v19,v29 //(c7s1 * (irot_input_x)) + vsrh v8,v22,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v23,v1 + vmulosh v19,v23,v1 + vperm v18,v18,v19,v29 //(c1s7 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v23 //(c1s7 * (irot_input_7)) + vsrh v9,v23,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v17,v8,v9 //ip[7] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[3], ip[5] + vmulesh v18,v24,v3 + vmulosh v19,v24,v3 + vperm v18,v18,v19,v29 //(c3s5 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v24 //(c3s5 * (irot_input_x)) + vsrh v8,v24,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v25,v5 + vmulosh v19,v25,v5 + vperm v18,v18,v19,v29 //(c5s3 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v25 //(c5s3 * (irot_input_y)) + vsrh v9,v25,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v13,v8,v9 //ip[3] + + vmulesh v18,v24,v5 + vmulosh v19,v24,v5 + vperm v18,v18,v19,v29 //(c5s3 * (irot_input_x)) - 
irot_input_x + vadduhm v18,v18,v24 //(c5s3 * (irot_input_x)) + vsrh v8,v24,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v25,v3 + vmulosh v19,v25,v3 + vperm v18,v18,v19,v29 //(c3s5 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v25 //(c3s5 * (irot_input_y)) + vsrh v9,v25,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v15,v8,v9 //ip[5] +//~~~~~~~~~~ end cut here + +//~~~~~~~~~~~~~~~ transpose back +//start of transpose + vmrghh v18,v10,v11 //00 01 10 11 20 21 30 31 + vmrglh v19,v10,v11 //40 41 50 51 60 61 70 71 + vmrghh v20,v12,v13 //02 03 12 13 22 23 32 33 + vmrglh v21,v12,v13 //42 43 52 53 62 63 72 73 + vmrghh v22,v14,v15 //04 05 14 15 24 25 34 35 + vmrglh v23,v14,v15 //44 45 54 55 64 65 74 75 + vmrghh v24,v16,v17 //06 07 16 17 26 27 36 37 + vmrglh v25,v16,v17 //46 47 56 57 66 67 76 77 + + vmrghw v8,v18,v20 //00 01 02 03 10 11 12 13 + vmrghw v9,v22,v24 //04 05 06 07 14 15 16 17 + vmrghw v26,v19,v21 //40 41 42 43 50 51 52 53 + vmrghw v27,v23,v25 //44 45 46 47 54 55 56 57 + vmrglw v18,v18,v20 //20 21 22 23 30 31 32 33 + vmrglw v22,v22,v24 //24 25 26 27 34 35 36 37 + vmrglw v19,v19,v21 //60 61 62 63 70 71 72 73 + vmrglw v23,v23,v25 //64 65 66 67 74 75 76 77 + + vperm v10,v8,v9,v30 //00 01 02 03 04 05 06 07 + vperm v11,v8,v9,v31 //10 11 12 13 14 15 16 17 + vperm v12,v18,v22,v30 //20 21 22 23 24 25 26 27 + vperm v13,v18,v22,v31 //30 31 32 33 34 35 36 37 + vperm v20,v26,v27,v30 //40 41 42 43 44 45 46 47 + vperm v21,v26,v27,v31 //50 51 52 53 54 55 56 57 + vperm v22,v19,v23,v30 //60 61 62 63 64 65 66 67 + vperm v23,v19,v23,v31 //70 71 72 73 74 75 76 77 +//end of transpose +//~~~~~~~~~~ start cut here + vsubuhm v14,v10,v23 //id07 + vsubuhm v15,v11,v12 //id12 + vsubuhm v16,v13,v20 //id34 + vsubuhm v17,v21,v22 //id56 + + vadduhm v10,v10,v23 //is07 + vadduhm v11,v11,v12 //is12 + vadduhm v12,v13,v20 //is34 + vadduhm v13,v21,v22 //is56 + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// some precalulations + vspltish v28,15 + vadduhm v18,v10,v12 //is0734 + vadduhm v19,v11,v13 //is1256 + + vsubuhm v20,v11,v13 //(is12 - is56) + vmulesh v22,v20,v4 + vmulosh v23,v20,v4 + vperm v8,v22,v23,v29 //(c4s4 * (is12 - is56)) - (is12 - is56) + vadduhm v8,v8,v20 //c4s4 * (is12 - is56) + vsrh v20,v20,v28 //get sign bit + vadduhm v8,v8,v20 //add in sign bit aka icommon_product1 + + vadduhm v20,v15,v17 //(id12 + id56) + vmulesh v22,v20,v4 + vmulosh v23,v20,v4 + vperm v9,v22,v23,v29 //(c4s4 * (is12 + is56)) - (is12 + is56) + vadduhm v9,v9,v20 //c4s4 * (is12 + is56) + vsrh v20,v20,v28 //get sign bit + vadduhm v9,v9,v20 //add in sign bit aka icommon_product2 + + vsubuhm v20,v15,v17 //irot_input_x = id12 - id56 + vsubuhm v21,v10,v12 //irot_input_y = is07 - is34 + + vadduhm v22,v14,v8 //irot_input_x = icommon_product1 + id07 + vadduhm v23,v16,v9 //irot_input_y = icommon_product2 + id34 + vxor v24,v24,v24 + vsubuhm v23,v24,v23 //irot_input_y = -(icommon_product2 + id34) + + vsubuhm v24,v14,v8 //irot_input_x = id07 - icommon_product1 + vsubuhm v25,v16,v9 //irot_input_y = id34 - icommon_product2 + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[0], ip[4] + vadduhm v8,v18,v19 //(is0734 + is1256) + vsubuhm v9,v18,v19 //(is0734 - is1256) + + vmulesh v18,v8,v4 + vmulosh v19,v8,v4 + vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256) + vadduhm v18,v18,v8 //(c4s4 * (is0734 + is1256)) + vsrh v8,v8,v28 + vadduhm v10,v18,v8 //add in sign bit aka ip[0] + + vmulesh v18,v9,v4 + vmulosh v19,v9,v4 + vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256) + 
vadduhm v18,v18,v9 //(c4s4 * (is0734 + is1256)) + vsrh v9,v9,v28 + vadduhm v14,v18,v9 //add in sign bit aka ip[4] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[2], ip[6] + vmulesh v18,v20,v6 + vmulosh v19,v20,v6 + vperm v18,v18,v19,v29 //(c6s2 * (irot_input_x)) + vsrh v8,v20,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v21,v2 + vmulosh v19,v21,v2 + vperm v18,v18,v19,v29 //(c2s6 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v21 //(c2s6 * (irot_input_y)) + vsrh v9,v21,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v12,v8,v9 //ip[2] + + vmulesh v18,v21,v6 + vmulosh v19,v21,v6 + vperm v18,v18,v19,v29 //(c6s2 * (irot_input_y)) + vsrh v8,v21,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v20,v2 + vmulosh v19,v20,v2 + vperm v18,v18,v19,v29 //(c2s6 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v20 //(c2s6 * (irot_input_x)) + vsrh v9,v20,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v16,v8,v9 //ip[6] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[1], ip[7] + vmulesh v18,v22,v1 + vmulosh v19,v22,v1 + vperm v18,v18,v19,v29 //(c1s7 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v22 //(c1s7 * (irot_input_x)) + vsrh v8,v22,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v23,v7 + vmulosh v19,v23,v7 + vperm v18,v18,v19,v29 //(c7s1 * (irot_input_y)) + vsrh v9,v23,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v11,v8,v9 //ip[1] + + vmulesh v18,v22,v7 + vmulosh v19,v22,v7 + vperm v18,v18,v19,v29 //(c7s1 * (irot_input_x)) + vsrh v8,v22,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v23,v1 + vmulosh v19,v23,v1 + vperm v18,v18,v19,v29 //(c1s7 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v23 //(c1s7 * (irot_input_7)) + vsrh v9,v23,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v17,v8,v9 //ip[7] + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// ip[3], ip[5] + vmulesh v18,v24,v3 + vmulosh v19,v24,v3 + vperm v18,v18,v19,v29 //(c3s5 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v24 //(c3s5 * (irot_input_x)) + vsrh v8,v24,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v25,v5 + vmulosh v19,v25,v5 + vperm v18,v18,v19,v29 //(c5s3 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v25 //(c5s3 * (irot_input_y)) + vsrh v9,v25,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vsubuhm v13,v8,v9 //ip[3] + + vmulesh v18,v24,v5 + vmulosh v19,v24,v5 + vperm v18,v18,v19,v29 //(c5s3 * (irot_input_x)) - irot_input_x + vadduhm v18,v18,v24 //(c5s3 * (irot_input_x)) + vsrh v8,v24,v28 + vadduhm v8,v18,v8 //add in sign bit aka temp1 + vmulesh v18,v25,v3 + vmulosh v19,v25,v3 + vperm v18,v18,v19,v29 //(c3s5 * (irot_input_y)) - irot_input_y + vadduhm v18,v18,v25 //(c3s5 * (irot_input_y)) + vsrh v9,v25,v28 + vadduhm v9,v18,v9 //add in sign bit aka temp2 + vadduhm v15,v8,v9 //ip[5] +//~~~~~~~~~~ end cut here + +//~~~~~~~~~~ write to destination + xor r8,r8,r8 + + stvx v10,r4,r8 + addi r8,r8,16 + + stvx v11,r4,r8 + addi r8,r8,16 + + stvx v12,r4,r8 + addi r8,r8,16 + + stvx v13,r4,r8 + addi r8,r8,16 + + stvx v14,r4,r8 + addi r8,r8,16 + + stvx v15,r4,r8 + addi r8,r8,16 + + stvx v16,r4,r8 + addi r8,r8,16 + + stvx v17,r4,r8 + } + +} +OIL_DEFINE_IMPL_REF (fdct8x8theora_altivec, fdct8x8theora, OIL_IMPL_FLAG_ALTIVEC); diff --git a/liboil/powerpc/recon8x8_altivec.c b/liboil/powerpc/recon8x8_altivec.c new file mode 100644 index 0000000..1d85b2a --- /dev/null +++ b/liboil/powerpc/recon8x8_altivec.c @@ -0,0 +1,716 @@ +/* + * LIBOIL - Library 
of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + these functions are copied from + http://svn.xiph.org/trunk/vp32/CoreLibs/CDXV/Vp31/Common/mac/OptFunctionsPPC.c +*/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/simdpack/simdpack.h> + +OIL_DECLARE_CLASS (recon8x8_intra); +OIL_DECLARE_CLASS (recon8x8_inter); +OIL_DECLARE_CLASS (recon8x8_inter2); + +static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL; + +static void + /* r3, r4, r5 */ +recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds) +{ + asm { + //trying cache hints + lis r6,0x0108 + or r6,r6,r5 + dstst r3,r6,0 + + vspltish v1,7 + + vspltish v8,1 + xor r6,r6,r6 + + lvx v0,r4,r6 //get 8 shorts + vslh v8,v8,v1 //now have 128 + addi r6,r6,16 + + lvx v1,r4,r6 //get 8 shorts + vaddshs v0,v0,v8 //+=128 + addi r6,r6,16 + + lvx v2,r4,r6 //get 8 shorts + vaddshs v1,v1,v8 //+=128 + addi r6,r6,16 + vpkshus v0,v0,v0 //convert to bytes + + lvx v3,r4,r6 //get 8 shorts + vaddshs v2,v2,v8 //+=128 + addi r6,r6,16 + vpkshus v1,v1,v1 //convert to bytes + + lvx v4,r4,r6 //get 8 shorts + vaddshs v3,v3,v8 //+=128 + addi r6,r6,16 + vpkshus v2,v2,v2 //convert to bytes + + lvx v5,r4,r6 //get 8 shorts + vaddshs v4,v4,v8 //+=128 + addi r6,r6,16 + vpkshus v3,v3,v3 //convert to bytes + + lvx v6,r4,r6 //get 8 shorts + vaddshs v5,v5,v8 //+=128 + addi r6,r6,16 + vpkshus v4,v4,v4 //convert to bytes + + lvx v7,r4,r6 //get 8 shorts + xor r6,r6,r6 + vaddshs v6,v6,v8 //+=128 + vpkshus v5,v5,v5 //convert to bytes + + lvsr v9,r3,r6 //load alignment vector for stores + vaddshs v7,v7,v8 //+=128 + vpkshus v6,v6,v6 //convert to bytes + + vpkshus v7,v7,v7 //convert to bytes + + li r7,4 + vperm v0,v0,v0,v9 + + stvewx v0,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v0,r3,r7 + add r7,r7,r5 + vperm v1,v1,v1,v9 + + stvewx v1,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v1,r3,r7 + add r7,r7,r5 + vperm v2,v2,v2,v9 + + stvewx v2,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v2,r3,r7 + add r7,r7,r5 + vperm v3,v3,v3,v9 + + stvewx v3,r3,r6 + add r6,r6,r5 + 
+ lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v3,r3,r7 + add r7,r7,r5 + vperm v4,v4,v4,v9 + + stvewx v4,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v4,r3,r7 + add r7,r7,r5 + vperm v5,v5,v5,v9 + + stvewx v5,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v5,r3,r7 + add r7,r7,r5 + vperm v6,v6,v6,v9 + + stvewx v6,r3,r6 + add r6,r6,r5 + + lvsr v9,r3,r6 //load alignment vector for stores + + stvewx v6,r3,r7 + add r7,r7,r5 + vperm v7,v7,v7,v9 + + stvewx v7,r3,r6 + + stvewx v7,r3,r7 + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC); + +static void /* r3, r4, r5, r6 */ +recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss) +{ + asm + { + //trying cache hints + lis r7,0x0108 + or r7,r7,r6 + dstst r3,r7,0 + + xor r7,r7,r7 + li r8,16 + + lvsl v8,r4,r7 //load alignment vector for refs + vxor v9,v9,v9 + + lvx v10,r4,r7 //get 8 refs + add r7,r7,r6 + + lvx v0,r4,r8 //need another 16 bytes for misaligned data -- 0 + add r8,r8,r6 + + lvx v11,r4,r7 //get 8 refs + vperm v10,v10,v0,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v1,r4,r8 //need another 16 bytes for misaligned data -- 1 + add r8,r8,r6 + + lvx v12,r4,r7 //get 8 refs + vperm v11,v11,v1,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v2,r4,r8 //need another 16 bytes for misaligned data -- 2 + add r8,r8,r6 + + lvx v13,r4,r7 //get 8 refs + vperm v12,v12,v2,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v3,r4,r8 //need another 16 bytes for misaligned data -- 3 + add r8,r8,r6 + + lvx v14,r4,r7 //get 8 refs + vperm v13,v13,v3,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v4,r4,r8 //need another 16 bytes for misaligned data -- 4 + add r8,r8,r6 + + lvx v15,r4,r7 //get 8 refs + vperm v14,v14,v4,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v5,r4,r8 //need another 16 bytes for misaligned data -- 5 + add r8,r8,r6 + + lvx v16,r4,r7 //get 8 refs + vperm v15,v15,v5,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + add r7,r7,r6 + + lvx v6,r4,r8 //need another 16 bytes for misaligned data -- 6 + add r8,r8,r6 + + lvx v17,r4,r7 //get 8 refs + vperm v16,v16,v6,v8 + + lvsl v8,r4,r7 //load alignment vector for refs + xor r7,r7,r7 + + lvx v7,r4,r8 //need another 16 bytes for misaligned data -- 7 + add r8,r8,r6 + + lvx v0,r5,r7 //get 8 shorts + vperm v17,v17,v7,v8 + addi r7,r7,16 + + lvx v1,r5,r7 //get 8 shorts + vmrghb v10,v9,v10 //unsigned byte -> unsigned half + addi r7,r7,16 + + lvx v2,r5,r7 //get 8 shorts + vmrghb v11,v9,v11 //unsigned byte -> unsigned half + vaddshs v0,v0,v10 + addi r7,r7,16 + + lvx v3,r5,r7 //get 8 shorts + vmrghb v12,v9,v12 //unsigned byte -> unsigned half + vaddshs v1,v1,v11 + addi r7,r7,16 + + lvx v4,r5,r7 //get 8 shorts + vmrghb v13,v9,v13 //unsigned byte -> unsigned half + vaddshs v2,v2,v12 + addi r7,r7,16 + + lvx v5,r5,r7 //get 8 shorts + vmrghb v14,v9,v14 //unsigned byte -> unsigned half + vaddshs v3,v3,v13 + addi r7,r7,16 + + lvx v6,r5,r7 //get 8 shorts + vmrghb v15,v9,v15 //unsigned byte -> unsigned half + vaddshs v4,v4,v14 + addi r7,r7,16 + + lvx v7,r5,r7 //get 8 shorts + vmrghb v16,v9,v16 //unsigned byte -> unsigned half + vaddshs v5,v5,v15 + + vmrghb v17,v9,v17 //unsigned byte -> unsigned half + vaddshs v6,v6,v16 + + vpkshus v0,v0,v0 + vaddshs v7,v7,v17 + + vpkshus v1,v1,v1 + xor r7,r7,r7 + + vpkshus v2,v2,v2 + + 
vpkshus v3,v3,v3 + + vpkshus v4,v4,v4 + + vpkshus v5,v5,v5 + + vpkshus v6,v6,v6 + + lvsr v9,r3,r7 //load alignment vector for stores + vpkshus v7,v7,v7 + + li r8,4 + vperm v0,v0,v0,v9 //adjust for writes + + stvewx v0,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v0,r3,r8 + add r8,r8,r6 + vperm v1,v1,v1,v9 + + stvewx v1,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v1,r3,r8 + add r8,r8,r6 + vperm v2,v2,v2,v9 + + stvewx v2,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v2,r3,r8 + add r8,r8,r6 + vperm v3,v3,v3,v9 + + stvewx v3,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v3,r3,r8 + add r8,r8,r6 + vperm v4,v4,v4,v9 + + stvewx v4,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v4,r3,r8 + add r8,r8,r6 + vperm v5,v5,v5,v9 + + stvewx v5,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v5,r3,r8 + add r8,r8,r6 + vperm v6,v6,v6,v9 + + stvewx v6,r3,r7 + add r7,r7,r6 + + lvsr v9,r3,r7 //load alignment vector for stores + + stvewx v6,r3,r8 + add r8,r8,r6 + vperm v7,v7,v7,v9 + + stvewx v7,r3,r7 + + stvewx v7,r3,r8 + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC); + +static void /* r3, r4, r5, r6, r7 */ +recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss) +{ + asm + { + //trying cache hints + lis r8,0x0108 + or r8,r8,r7 + dstst r3,r8,0 + + xor r8,r8,r8 + li r9,16 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + vxor v9,v9,v9 + + lvx v10,r4,r8 //get 8 RefPtr1 -- 0 + add r8,r8,r7 + + lvx v0,r4,r9 //need another 16 bytes for misaligned data -- 0 + add r9,r9,r7 + + lvx v11,r4,r8 //get 8 RefPtr1 -- 1 + vperm v10,v10,v0,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v1,r4,r9 //need another 16 bytes for misaligned data -- 1 + vmrghb v10,v9,v10 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v12,r4,r8 //get 8 RefPtr1 -- 2 + vperm v11,v11,v1,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v2,r4,r9 //need another 16 bytes for misaligned data -- 2 + vmrghb v11,v9,v11 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v13,r4,r8 //get 8 RefPtr1 -- 3 + vperm v12,v12,v2,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v3,r4,r9 //need another 16 bytes for misaligned data -- 3 + vmrghb v12,v9,v12 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v14,r4,r8 //get 8 RefPtr1 -- 4 + vperm v13,v13,v3,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v4,r4,r9 //need another 16 bytes for misaligned data -- 4 + vmrghb v13,v9,v13 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v15,r4,r8 //get 8 RefPtr1 -- 5 + vperm v14,v14,v4,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v5,r4,r9 //need another 16 bytes for misaligned data -- 5 + vmrghb v14,v9,v14 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v16,r4,r8 //get 8 RefPtr1 -- 6 + vperm v15,v15,v5,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v6,r4,r9 //need another 16 bytes for misaligned data -- 6 + vmrghb v15,v9,v15 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v17,r4,r8 //get 8 RefPtr1 -- 7 + vperm v16,v16,v6,v8 + + lvsl v8,r4,r8 //load alignment vector for RefPtr1 + add r8,r8,r7 + + lvx v7,r4,r9 //need another 16 bytes for 
misaligned data -- 7 + vmrghb v16,v9,v16 //unsigned byte -> unsigned half + add r9,r9,r7 +//-------- + vperm v17,v17,v7,v8 + xor r8,r8,r8 + li r9,16 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + vmrghb v17,v9,v17 //unsigned byte -> unsigned half + + lvx v20,r5,r8 //get 8 RefPtr2 -- 0 + add r8,r8,r7 + + lvx v0,r5,r9 //need another 16 bytes for misaligned data -- 0 + add r9,r9,r7 + + lvx v21,r5,r8 //get 8 RefPtr2 -- 1 + vperm v20,v20,v0,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v1,r5,r9 //need another 16 bytes for misaligned data -- 1 + vmrghb v20,v9,v20 //unsigned byte -> unsigned half + add r9,r9,r7 + + lvx v22,r5,r8 //get 8 RefPtr2 -- 2 + vperm v21,v21,v1,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v2,r5,r9 //need another 16 bytes for misaligned data -- 2 + vmrghb v21,v9,v21 //unsigned byte -> unsigned half + vadduhm v10,v10,v20 + add r9,r9,r7 + + lvx v23,r5,r8 //get 8 RefPtr2 -- 3 + vperm v22,v22,v2,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v3,r5,r9 //need another 16 bytes for misaligned data -- 3 + vmrghb v22,v9,v22 //unsigned byte -> unsigned half + vadduhm v11,v11,v21 + add r9,r9,r7 + + lvx v24,r5,r8 //get 8 RefPtr2 -- 4 + vperm v23,v23,v3,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v4,r5,r9 //need another 16 bytes for misaligned data -- 4 + vmrghb v23,v9,v23 //unsigned byte -> unsigned half + vadduhm v12,v12,v22 + add r9,r9,r7 + + lvx v25,r5,r8 //get 8 RefPtr2 -- 5 + vperm v24,v24,v4,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v5,r5,r9 //need another 16 bytes for misaligned data -- 5 + vmrghb v24,v9,v24 //unsigned byte -> unsigned half + vadduhm v13,v13,v23 + add r9,r9,r7 + + lvx v26,r5,r8 //get 8 RefPtr2 -- 6 + vperm v25,v25,v5,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v6,r5,r9 //need another 16 bytes for misaligned data -- 6 + vmrghb v25,v9,v25 //unsigned byte -> unsigned half + vadduhm v14,v14,v24 + add r9,r9,r7 + + lvx v27,r5,r8 //get 8 RefPtr2 -- 7 + vperm v26,v26,v6,v18 + + lvsl v18,r5,r8 //load alignment vector for RefPtr2 + add r8,r8,r7 + + lvx v7,r5,r9 //need another 16 bytes for misaligned data -- 7 + vmrghb v26,v9,v26 //unsigned byte -> unsigned half + vadduhm v15,v15,v25 + add r9,r9,r7 + + vperm v27,v27,v7,v18 + xor r8,r8,r8 + + vmrghb v27,v9,v27 //unsigned byte -> unsigned half + vadduhm v16,v16,v26 + + vadduhm v17,v17,v27 + vspltish v8,1 +//-------- + lvx v0,r6,r8 //get 8 shorts + vsrh v10,v10,v8 + addi r8,r8,16 + + lvx v1,r6,r8 //get 8 shorts + vsrh v11,v11,v8 + addi r8,r8,16 + + lvx v2,r6,r8 //get 8 shorts + vsrh v12,v12,v8 + addi r8,r8,16 + + lvx v3,r6,r8 //get 8 shorts + vsrh v13,v13,v8 + addi r8,r8,16 + + lvx v4,r6,r8 //get 8 shorts + vsrh v14,v14,v8 + addi r8,r8,16 + + lvx v5,r6,r8 //get 8 shorts + vsrh v15,v15,v8 + addi r8,r8,16 + + lvx v6,r6,r8 //get 8 shorts + vsrh v16,v16,v8 + addi r8,r8,16 + + lvx v7,r6,r8 //get 8 shorts + vsrh v17,v17,v8 + xor r8,r8,r8 +//-------- + lvsr v9,r3,r8 //load alignment vector for stores + vaddshs v0,v0,v10 + + vaddshs v1,v1,v11 + vpkshus v0,v0,v0 + + vaddshs v2,v2,v12 + vpkshus v1,v1,v1 + + vaddshs v3,v3,v13 + vpkshus v2,v2,v2 + + vaddshs v4,v4,v14 + vpkshus v3,v3,v3 + + vaddshs v5,v5,v15 + vpkshus v4,v4,v4 + + vaddshs v6,v6,v16 + vpkshus v5,v5,v5 + + vaddshs v7,v7,v17 + vpkshus v6,v6,v6 + + vpkshus v7,v7,v7 + + li r9,4 + vperm v0,v0,v0,v9 //adjust for writes + + stvewx v0,r3,r8 + add 
r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v0,r3,r9 + add r9,r9,r7 + vperm v1,v1,v1,v9 + + stvewx v1,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v1,r3,r9 + add r9,r9,r7 + vperm v2,v2,v2,v9 + + stvewx v2,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v2,r3,r9 + add r9,r9,r7 + vperm v3,v3,v3,v9 + + stvewx v3,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v3,r3,r9 + add r9,r9,r7 + vperm v4,v4,v4,v9 + + stvewx v4,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v4,r3,r9 + add r9,r9,r7 + vperm v5,v5,v5,v9 + + stvewx v5,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v5,r3,r9 + add r9,r9,r7 + vperm v6,v6,v6,v9 + + stvewx v6,r3,r8 + add r8,r8,r7 + + lvsr v9,r3,r8 //load alignment vector for stores + + stvewx v6,r3,r9 + add r9,r9,r7 + vperm v7,v7,v7,v9 + + stvewx v7,r3,r8 + + stvewx v7,r3,r9 + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC); diff --git a/liboil/powerpc/recon8x8_ppc.c b/liboil/powerpc/recon8x8_ppc.c new file mode 100644 index 0000000..53c74b5 --- /dev/null +++ b/liboil/powerpc/recon8x8_ppc.c @@ -0,0 +1,526 @@ +/* + * LIBOIL - Library of Optimized Inner Loops + * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + these functions are copied from + http://svn.xiph.org/trunk/vp32/CoreLibs/CDXV/Vp31/Common/mac/OptFunctionsPPC.c +*/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <liboil/liboilfunction.h> +#include <liboil/simdpack/simdpack.h> + +OIL_DECLARE_CLASS (recon8x8_intra); +OIL_DECLARE_CLASS (recon8x8_inter); +OIL_DECLARE_CLASS (recon8x8_inter2); + +static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL; + +static void + /* r3, r4, r5 */ +recon8x8_intra_ppc (uint8_t *dest, int16_t *change, int ds) +{ + asm + { + lwz r0,0(r3) ;//preload cache + mr r12,r4 + + addi r12,r12,128 ;//end ptr + +doLoop1: + lha r7,0(r4) + + lha r8,2(r4) + addi r7,r7,128 + + lha r9,4(r4) + addi r8,r8,128 + andi. 
r0,r7,0xff00 + beq+ L1 + + srawi r0,r7,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r7,r0,0xff ;//now have 00 or ff + +L1: + lha r10,6(r4) + addi r9,r9,128 + andi. r0,r8,0xff00 + beq+ L2 + + srawi r0,r8,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r8,r0,0xff ;//now have 00 or ff + +L2: + lha r31,8(r4) + addi r10,r10,128 + andi. r0,r9,0xff00 + beq+ L3 + + srawi r0,r9,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r9,r0,0xff ;//now have 00 or ff + +L3: + lha r30,10(r4) + andi. r0,r10,0xff00 + beq+ L4 + + srawi r0,r10,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r10,r0,0xff ;//now have 00 or ff + +L4: + lha r29,12(r4) + insrwi r10,r7,8,0 + addi r31,r31,128 + + lwz r27,0(r3) ;//preload cache with dest + addi r30,r30,128 + andi. r0,r31,0xff00 + beq+ L5 + + srawi r0,r31,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r31,r0,0xff ;//now have 00 or ff + +L5: + lha r28,14(r4) + addi r29,r29,128 + andi. r0,r30,0xff00 + beq+ L6 + + srawi r0,r30,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r30,r0,0xff ;//now have 00 or ff + +L6: + addi r28,r28,128 + andi. r0,r29,0xff00 + beq+ L7 + + srawi r0,r29,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r29,r0,0xff ;//now have 00 or ff + +L7: + insrwi r10,r8,8,8 + andi. r0,r28,0xff00 + beq+ L8 + + srawi r0,r28,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r28,r0,0xff ;//now have 00 or ff + +L8: + insrwi r10,r9,8,16 + insrwi r28,r31,8,0 + + stw r10,0(r3) + insrwi r28,r30,8,8 + addi r4,r4,16 + + cmpw r4,r12 + insrwi r28,r29,8,16 + + stw r28,4(r3) + add r3,r3,r5 ;//add in stride + bne doLoop1 + + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_intra_ppc, recon8x8_intra, OIL_IMPL_FLAG_ASM); + +static void /* r3, r4, r5, r6 */ +recon8x8_inter_ppc (uint8_t *dest, uint8_t *src, int16_t *change, int dss) +{ + asm + { + mr r26,r4 + mr r4,r5 ;//same reg usage as intra + + lwz r0,0(r3) ;//preload cache + mr r12,r4 + + addi r12,r12,128 ;//end ptr + mr r5,r6 ;//same reg usage as intra + +doLoop1: + lha r7,0(r4) + + lbz r25,0(r26) + + lha r8,2(r4) + add r7,r7,r25 + + lbz r25,1(r26) + + lha r9,4(r4) + add r8,r8,r25 + andi. r0,r7,0xff00 + beq+ L1 + + srawi r0,r7,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r7,r0,0xff ;//now have 00 or ff + +L1: + lbz r25,2(r26) + + lha r10,6(r4) + add r9,r9,r25 + andi. r0,r8,0xff00 + beq+ L2 + + srawi r0,r8,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r8,r0,0xff ;//now have 00 or ff + +L2: + lbz r25,3(r26) + + lha r31,8(r4) + add r10,r10,r25 + andi. r0,r9,0xff00 + beq+ L3 + + srawi r0,r9,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r9,r0,0xff ;//now have 00 or ff + +L3: + lha r30,10(r4) + andi. r0,r10,0xff00 + beq+ L4 + + srawi r0,r10,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r10,r0,0xff ;//now have 00 or ff + +L4: + lbz r25,4(r26) + + + lha r29,12(r4) + insrwi r10,r7,8,0 + add r31,r31,r25 + + lbz r25,5(r26) + + lwz r27,0(r3) ;//preload cache with dest + add r30,r30,r25 + andi. r0,r31,0xff00 + beq+ L5 + + srawi r0,r31,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r31,r0,0xff ;//now have 00 or ff + +L5: + lbz r25,6(r26) + + lha r28,14(r4) + add r29,r29,r25 + andi. r0,r30,0xff00 + beq+ L6 + + srawi r0,r30,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. 
r30,r0,0xff ;//now have 00 or ff + +L6: + lbz r25,7(r26) + add r26,r26,r5 + + add r28,r28,r25 + andi. r0,r29,0xff00 + beq+ L7 + + srawi r0,r29,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r29,r0,0xff ;//now have 00 or ff + +L7: + insrwi r10,r8,8,8 + andi. r0,r28,0xff00 + beq+ L8 + + srawi r0,r28,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r28,r0,0xff ;//now have 00 or ff + +L8: + insrwi r10,r9,8,16 + insrwi r28,r31,8,0 + + stw r10,0(r3) + insrwi r28,r30,8,8 + addi r4,r4,16 + + cmpw r4,r12 + insrwi r28,r29,8,16 + + stw r28,4(r3) + add r3,r3,r5 ;//add in stride + bne doLoop1 + + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter_ppc, recon8x8_inter, OIL_IMPL_FLAG_ASM); + +static void /* r3, r4, r5, r6, r7 */ +recon8x8_inter2_ppc (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss) +{ + asm + { + mr r26,r4 + mr r4,r6 ;//same reg usage as intra + + lwz r0,0(r3) ;//preload cache + mr r25,r5 + mr r12,r4 + + addi r12,r12,128 ;//end ptr + mr r5,r7 ;//same reg usage as intra + + li r24,0x0101 + li r23,0xfefe + + insrwi r23,r23,16,0 ;//0xfefefefe + insrwi r24,r24,16,0 ;//0x01010101 + +doLoop1: + lwz r22,0(r26) ;//get 4 ref pels + + lwz r21,0(r25) ;//get 4 src pels + + lha r7,0(r4) + and r20,r22,r21 + + lha r8,2(r4) + and r21,r21,r23 ;//mask low bits + and r22,r22,r23 ;//mask low bits + + srwi r21,r21,1 + srwi r22,r22,1 + + and r20,r20,r24 ;//save low bits + add r21,r21,r22 + + lwz r22,4(r26) ;//get 4 ref pels +// or r20,r21,r20 ;//add in hot fudge + add r20,r21,r20 ;//add in hot fudge + +//xor r20,r20,r20 + + lwz r21,4(r25) ;//get 4 src pels + rlwinm r19,r20,8,24,31 + rlwinm r18,r20,16,24,31 + + add r7,r7,r19 + + lha r9,4(r4) + add r8,r8,r18 + andi. r0,r7,0xff00 + beq+ L1 + + srawi r0,r7,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r7,r0,0xff ;//now have 00 or ff + +L1: + rlwinm r19,r20,24,24,31 + rlwinm r18,r20,0,24,31 + + lha r10,6(r4) + add r9,r9,r19 + andi. r0,r8,0xff00 + beq+ L2 + + srawi r0,r8,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r8,r0,0xff ;//now have 00 or ff + +L2: + lha r31,8(r4) + add r10,r10,r18 + andi. r0,r9,0xff00 + beq+ L3 + + srawi r0,r9,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r9,r0,0xff ;//now have 00 or ff + +L3: + lha r30,10(r4) + andi. r0,r10,0xff00 + beq+ L4 + + srawi r0,r10,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r10,r0,0xff ;//now have 00 or ff + +L4: + lha r29,12(r4) + insrwi r10,r7,8,0 + and r20,r22,r21 + + and r21,r21,r23 ;//mask low bits + and r22,r22,r23 ;//mask low bits + + srwi r21,r21,1 + srwi r22,r22,1 + + and r20,r20,r24 ;//save low bits + add r21,r21,r22 + +// or r20,r21,r20 ;//add in hot fudge + add r20,r21,r20 ;//add in hot fudge + + rlwinm r19,r20,8,24,31 + rlwinm r18,r20,16,24,31 + + + add r31,r31,r19 + +//xor r20,r20,r20 + + lwz r27,0(r3) ;//preload cache with dest + add r30,r30,r18 + andi. r0,r31,0xff00 + beq+ L5 + + srawi r0,r31,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r31,r0,0xff ;//now have 00 or ff + +L5: + rlwinm r19,r20,24,24,31 + rlwinm r18,r20,0,24,31 + + lha r28,14(r4) + add r29,r29,r19 + andi. r0,r30,0xff00 + beq+ L6 + + srawi r0,r30,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r30,r0,0xff ;//now have 00 or ff + +L6: + add r26,r26,r5 ;//add stride to ref pels + add r25,r25,r5 ;//add stride to src pels + + add r28,r28,r18 + andi. 
r0,r29,0xff00 + beq+ L7 + + srawi r0,r29,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r29,r0,0xff ;//now have 00 or ff + +L7: + insrwi r10,r8,8,8 + andi. r0,r28,0xff00 + beq+ L8 + + srawi r0,r28,15 ;//generate ff or 00 + + xori r0,r0,0xff ;//flip the bits + + andi. r28,r0,0xff ;//now have 00 or ff + +L8: + insrwi r10,r9,8,16 + insrwi r28,r31,8,0 + + stw r10,0(r3) + insrwi r28,r30,8,8 + addi r4,r4,16 + + cmpw r4,r12 + insrwi r28,r29,8,16 + + stw r28,4(r3) + add r3,r3,r5 ;//add in stride + bne doLoop1 + + } +} + +OIL_DEFINE_IMPL_FULL (recon8x8_inter2_ppc, recon8x8_inter2, OIL_IMPL_FLAG_ASM); diff --git a/liboil/simdpack/average2_u8.c b/liboil/simdpack/average2_u8.c index fe07aa8..3ac8c6a 100644 --- a/liboil/simdpack/average2_u8.c +++ b/liboil/simdpack/average2_u8.c @@ -57,35 +57,27 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, { unsigned int x, y, d; -#if 0 - if (sstr1 == 1 && sstr2 == 1 && dstr == 1) { - while (n > 0) { - x = *(unsigned int *) src1; - y = *(unsigned int *) src2; - *(unsigned int *) dest = (((x ^ y) & 0xfefefefe) >> 1) + (x & y); - src1 += 4; - src2 += 4; - dest += 4; - n -= 4; - } - } else -#endif - { - while (n > 0) { - x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 * - sstr1] << 8) | (src1[3 * sstr1]); - y = (src2[0] << 24) | (src2[sstr2] << 16) | (src2[2 * - sstr2] << 8) | (src2[3 * sstr2]); - d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y); - dest[0] = (d >> 24); - dest[1*dstr] = (d >> 16); - dest[2*dstr] = (d >> 8); - dest[3*dstr] = (d >> 0); - src1 += 4 * sstr1; - src2 += 4 * sstr2; - dest += 4 * dstr; - n -= 4; - } + while (n&3) { + *dest = (*src1 + *src2) >> 1; + src1 += sstr1; + src2 += sstr2; + dest += dstr; + n--; + } + while (n > 0) { + x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 * + sstr1] << 8) | (src1[3 * sstr1]); + y = (src2[0] << 24) | (src2[sstr2] << 16) | (src2[2 * + sstr2] << 8) | (src2[3 * sstr2]); + d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y); + dest[0] = (d >> 24); + dest[1*dstr] = (d >> 16); + dest[2*dstr] = (d >> 8); + dest[3*dstr] = (d >> 0); + src1 += 4 * sstr1; + src2 += 4 * sstr2; + dest += 4 * dstr; + n -= 4; } } @@ -112,6 +104,7 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, { while (n & 0x3) { *dest = (*src1 + *src2) >> 1; + dest += dstr; src1 += sstr1; src2 += sstr2; n--; @@ -139,32 +132,3 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, OIL_DEFINE_IMPL (average2_u8_unroll4, average2_u8); -#if 0 /* doesn't compile */ -#ifdef HAVE_CPU_I386 -/* This doesn't work with sstr!=2 or dstr!=2 */ -static void -average2_u8_i386asm (uint8_t * dest, int dstr, uint8_t *src1, int sstr1, - uint8_t *src2, int sstr2, int n) -{ - __asm__ __volatile__ ("\n" - " .p2align 4,,15 \n" - "1: movzbl (%%ebx), %%eax \n" - " addl $2, %%ebx \n" - " movzbl (%%ecx), %%edx \n" - " addl $2, %%ecx \n" - " leal 1(%%edx, %%eax), %%eax \n" - " sarl $1, %%eax \n" - " movb %%al, 0(%%esi) \n" - " incl %%esi \n" - " decl %%edi \n" - " jg 1b \n":"+b" - (src1), "+c" (src2), "+D" (n), "+S" (dest) - ::"eax", "edx"); - -} - -OIL_DEFINE_IMPL (average2_u8_i386asm, average2_u8); -#endif -#endif - - diff --git a/liboil/simdpack/scalarmult_i386.c b/liboil/simdpack/scalarmult_i386.c index 232dc83..bade779 100644 --- a/liboil/simdpack/scalarmult_i386.c +++ b/liboil/simdpack/scalarmult_i386.c @@ -48,7 +48,7 @@ scalarmult_f32_sse (float *dest, int dstr, float *src, int sstr, " movss (%0), %%xmm1 \n" : : "r" (t)); - for(i=0;i<n;i+=4) { + for(i=0;i<n-3;i+=4) { t[0] = 
OIL_GET(src,sstr*(i + 0), float); t[1] = OIL_GET(src,sstr*(i + 1), float); t[2] = OIL_GET(src,sstr*(i + 2), float); @@ -64,7 +64,9 @@ scalarmult_f32_sse (float *dest, int dstr, float *src, int sstr, OIL_GET(dest,dstr*(i + 2), float) = t[2]; OIL_GET(dest,dstr*(i + 3), float) = t[3]; } - + for(;i<n;i++){ + OIL_GET(dest,dstr*i, float) = *val * OIL_GET(src,sstr*i, float); + } } OIL_DEFINE_IMPL_FULL (scalarmult_f32_sse, scalarmult_f32, OIL_IMPL_FLAG_SSE); diff --git a/testsuite/stride.c b/testsuite/stride.c index 38a6681..e14b230 100644 --- a/testsuite/stride.c +++ b/testsuite/stride.c @@ -74,6 +74,8 @@ int main (int argc, char *argv[]) continue; } oil_test_set_iterations (test, 1); + test->n = 1 + oil_rand_u8(); + test->m = 1 + oil_rand_u8(); oil_test_check_impl (test, klass->reference_impl); for(j=0;j<OIL_ARG_LAST;j++){ |
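For context on the liboil/colorspace/composite.c change above: the ADD operator sums two 8-bit channel values, and the unclamped version could exceed 255 and silently wrap when the result was stored back into a uint8_t. The commit wraps the sum in oil_clamp_255 so it saturates instead. A minimal standalone sketch of the same behavior (clamp_255 and composite_add are illustrative stand-ins, not liboil's macros):

```c
#include <stdint.h>

/* Saturate an int to 0..255, mirroring what composite.c now does
 * with oil_clamp_255 (this helper is an illustrative stand-in). */
static uint8_t
clamp_255 (int x)
{
  return (x < 0) ? 0 : (x > 255) ? 255 : (uint8_t) x;
}

/* ADD compositing on one 8-bit channel. Unclamped, 200 + 100 = 300
 * wraps to 44 in a uint8_t; clamped, it saturates to 255, which is
 * the behavior this commit's COMPOSITE_ADD fix restores. */
static uint8_t
composite_add (uint8_t d, uint8_t s)
{
  return clamp_255 ((int) d + (int) s);
}
```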