author     David Schleef <ds@schleef.org>  2005-08-15 21:33:39 +0000
committer  David Schleef <ds@schleef.org>  2005-08-15 21:33:39 +0000
commit     ec572e49fb3423767ab7d562c5ef0aa2ad4ff38b (patch)
tree       2aa40503eca7623296b431e79d22e6e4720e2be8
parent     1ef601312c0634d55f1a098be769eae436dd0b92 (diff)
download   liboil-ec572e49fb3423767ab7d562c5ef0aa2ad4ff38b.tar.gz
* configure.ac: Add some altivec theora code
* liboil/Makefile.am:
* liboil/powerpc/Makefile.am:
* liboil/powerpc/fdct8x8theora_altivec.c: (fdct8x8theora_altivec):
* liboil/powerpc/recon8x8_altivec.c: (recon8x8_intra_altivec),
  (recon8x8_inter_altivec), (recon8x8_inter2_altivec):
* liboil/powerpc/recon8x8_ppc.c: (recon8x8_intra_ppc),
  (recon8x8_inter_ppc), (recon8x8_inter2_ppc):
* liboil/colorspace/composite.c: Fix bug in ADD operator.
* liboil/dct/fdct8x8theora_i386.c:
* liboil/simdpack/average2_u8.c: (average2_u8_trick),
  (average2_u8_unroll4): Fix n%4!=0 problems noticed by thomasvs.
* liboil/simdpack/scalarmult_i386.c: (scalarmult_f32_sse): Fix
  n%4!=0 problems.
* testsuite/stride.c: (main): use a random n to test possible
  endpoint problems.
-rw-r--r--  ChangeLog                                |  20
-rw-r--r--  configure.ac                             |   1
-rw-r--r--  liboil/Makefile.am                       |   3
-rw-r--r--  liboil/colorspace/composite.c            |   2
-rw-r--r--  liboil/dct/fdct8x8theora_i386.c          |   1
-rw-r--r--  liboil/powerpc/Makefile.am               |  18
-rw-r--r--  liboil/powerpc/fdct8x8theora_altivec.c   | 522
-rw-r--r--  liboil/powerpc/recon8x8_altivec.c        | 716
-rw-r--r--  liboil/powerpc/recon8x8_ppc.c            | 526
-rw-r--r--  liboil/simdpack/average2_u8.c            |  80
-rw-r--r--  liboil/simdpack/scalarmult_i386.c        |   6
-rw-r--r--  testsuite/stride.c                       |   2
12 files changed, 1835 insertions, 62 deletions
diff --git a/ChangeLog b/ChangeLog
index 05285dd..647988f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@
2005-08-15 David Schleef <ds@schleef.org>
+ * configure.ac: Add some altivec theora code
+ * liboil/Makefile.am:
+ * liboil/powerpc/Makefile.am:
+ * liboil/powerpc/fdct8x8theora_altivec.c: (fdct8x8theora_altivec):
+ * liboil/powerpc/recon8x8_altivec.c: (recon8x8_intra_altivec),
+ (recon8x8_inter_altivec), (recon8x8_inter2_altivec):
+ * liboil/powerpc/recon8x8_ppc.c: (recon8x8_intra_ppc),
+ (recon8x8_inter_ppc), (recon8x8_inter2_ppc):
+
+ * liboil/colorspace/composite.c: Fix bug in ADD operator.
+ * liboil/dct/fdct8x8theora_i386.c:
+ * liboil/simdpack/average2_u8.c: (average2_u8_trick),
+ (average2_u8_unroll4): Fix n%4!=0 problems noticed by thomasvs.
+ * liboil/simdpack/scalarmult_i386.c: (scalarmult_f32_sse): Fix
+ n%4!=0 problems.
+ * testsuite/stride.c: (main): use a random n to test possible
+ endpoint problems.
+
+2005-08-15 David Schleef <ds@schleef.org>
+
* liboil/liboilcpu.c: (oil_cpu_i386_getflags_cpuinfo): SSE2
implies MMXEXT in both codepaths.
diff --git a/configure.ac b/configure.ac
index c0e4a5e..9bb9331 100644
--- a/configure.ac
+++ b/configure.ac
@@ -184,6 +184,7 @@ liboil/jpeg/Makefile
liboil/math/Makefile
liboil/md5/Makefile
liboil/motovec/Makefile
+liboil/powerpc/Makefile
liboil/ref/Makefile
liboil/simdpack/Makefile
liboil/sse/Makefile
diff --git a/liboil/Makefile.am b/liboil/Makefile.am
index b9bf45a..530ca28 100644
--- a/liboil/Makefile.am
+++ b/liboil/Makefile.am
@@ -1,7 +1,7 @@
pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil
-SUBDIRS = colorspace conv copy dct fb i386 jpeg math md5 motovec ref simdpack sse utf8
+SUBDIRS = colorspace conv copy dct fb i386 jpeg math md5 motovec powerpc ref simdpack sse utf8
lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la
@@ -33,6 +33,7 @@ liboilfunctions_la_LIBADD = \
math/libmath.la \
md5/libmd5.la \
motovec/libmotovec.la \
+ powerpc/libpowerpc.la \
ref/libref.la \
simdpack/libsimdpack.la \
sse/libsse.la \
diff --git a/liboil/colorspace/composite.c b/liboil/colorspace/composite.c
index 6d9f4ea..5fbbae7 100644
--- a/liboil/colorspace/composite.c
+++ b/liboil/colorspace/composite.c
@@ -36,7 +36,7 @@
#include <liboil/liboildebug.h>
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
-#define COMPOSITE_ADD(d,s) ((d) + (s))
+#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
static void
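For context on the composite.c change above: the old ADD operator summed two 8-bit channels directly, so results above 255 wrapped when written back into a uint8_t; the fix routes the sum through oil_clamp_255 so it saturates instead. The sketch below only illustrates that difference; clamp_255() is a hypothetical stand-in for liboil's helper, whose actual definition is not shown in this diff.

/* Minimal sketch: without clamping, 200 + 100 stored into a uint8_t
 * wraps to 44 instead of saturating at 255. */
#include <stdint.h>
#include <stdio.h>

static uint8_t clamp_255 (int x)
{
  return (x > 255) ? 255 : (x < 0) ? 0 : x;
}

int main (void)
{
  uint8_t d = 200, s = 100;
  uint8_t wrapped = d + s;              /* old behaviour: wraps to 44 */
  uint8_t clamped = clamp_255 (d + s);  /* fixed behaviour: saturates at 255 */
  printf ("wrapped=%d clamped=%d\n", wrapped, clamped);
  return 0;
}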
diff --git a/liboil/dct/fdct8x8theora_i386.c b/liboil/dct/fdct8x8theora_i386.c
index 6126adb..7d8bce3 100644
--- a/liboil/dct/fdct8x8theora_i386.c
+++ b/liboil/dct/fdct8x8theora_i386.c
@@ -45,6 +45,7 @@
#include <liboil/dct/dct.h>
#include <math.h>
+/* FIXME this causes problems on old gcc */
static const __attribute__ ((aligned(8),used)) int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
static const __attribute__ ((aligned(8),used)) int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
static const __attribute__ ((aligned(8),used)) int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
diff --git a/liboil/powerpc/Makefile.am b/liboil/powerpc/Makefile.am
new file mode 100644
index 0000000..f49fcc5
--- /dev/null
+++ b/liboil/powerpc/Makefile.am
@@ -0,0 +1,18 @@
+
+noinst_LTLIBRARIES = libpowerpc.la
+
+sources = \
+ recon8x8_ppc.c \
+ fdct8x8theora_altivec.c \
+ recon8x8_altivec.c
+
+if HAVE_CPU_POWERPC
+powerpc_sources = $(sources)
+else
+powerpc_sources =
+endif
+
+libpowerpc_la_SOURCES = \
+ $(powerpc_sources)
+libpowerpc_la_CFLAGS = $(LIBOIL_CFLAGS) -fasm-blocks
+
diff --git a/liboil/powerpc/fdct8x8theora_altivec.c b/liboil/powerpc/fdct8x8theora_altivec.c
new file mode 100644
index 0000000..0bb2467
--- /dev/null
+++ b/liboil/powerpc/fdct8x8theora_altivec.c
@@ -0,0 +1,522 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
+ * by the Xiph.Org Foundation http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: fdct8x8theora_altivec.c,v 1.1 2005-08-15 21:33:39 ds Exp $
+
+ ********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/liboilfuncs.h>
+#include <liboil/dct/dct.h>
+#include <math.h>
+
+extern vector signed short idctConst;
+extern vector unsigned char vPerm1;
+extern vector unsigned char vPerm2;
+extern vector unsigned char vPerm3;
+
+OIL_DECLARE_CLASS(fdct8x8theora);
+
+static void
+fdct8x8theora_altivec(int16_t *src, int16_t *dest)
+{
+ (void) src;
+ (void) dest;
+
+ asm
+ {
+ lwz r10,vPerm1
+ xor r7,r7,r7
+
+ lwz r8,vPerm2
+
+ lwz r9,vPerm3
+
+ lvx v30,r10,r7
+
+ lvx v31,r8,r7
+
+ lvx v29,r9,r7
+
+
+ lwz r9,idctConst
+ xor r7,r7,r7
+ xor r8,r8,r8
+
+//trying cache hints
+// lis r8,0x1001 //Block Size = 16, Block Count = 1, Block Stride = 0
+// dstst r5,r8,0
+// dst r4,r8,1
+// dst r3,r8,2
+
+ lvx v8,r9,r7
+ xor r8,r8,r8
+
+ lvx v10,r3,r8 //row 0
+ vsplth v0,v8,0
+ addi r8,r8,16
+
+ lvx v11,r3,r8 //row 1
+ vsplth v1,v8,1
+ addi r8,r8,16
+
+ lvx v12,r3,r8 //row 2
+ vsplth v2,v8,2
+ addi r8,r8,16
+
+ lvx v13,r3,r8 //row 3
+ vsplth v3,v8,3
+ addi r8,r8,16
+
+ lvx v14,r3,r8 //row 4
+ vsplth v4,v8,4
+ addi r8,r8,16
+
+ lvx v15,r3,r8 //row 5
+ vsplth v5,v8,5
+ addi r8,r8,16
+
+ lvx v16,r3,r8 //row 6
+ vsplth v6,v8,6
+ addi r8,r8,16
+
+ lvx v17,r3,r8 //row 7
+ vsplth v7,v8,7
+
+ // on entry
+ //00 01 02 03 04 05 06 07
+ //10 11 12 13 14 15 16 17
+ //20 21 22 23 24 25 26 27
+ //30 31 32 33 34 35 36 37
+ //40 41 42 43 44 45 46 47
+ //50 51 52 53 54 55 56 57
+ //60 61 62 63 64 65 66 67
+ //70 71 72 73 74 75 76 77
+//start of transpose
+ vmrghh v18,v10,v11
+ vmrglh v19,v10,v11
+ vmrghh v20,v12,v13
+ vmrglh v21,v12,v13
+ vmrghh v22,v14,v15
+ vmrglh v23,v14,v15
+ vmrghh v24,v16,v17
+ vmrglh v25,v16,v17
+
+ vmrghw v8,v18,v20
+ vmrghw v9,v22,v24
+ vmrghw v26,v19,v21
+ vmrghw v27,v23,v25
+ vmrglw v18,v18,v20
+ vmrglw v22,v22,v24
+ vmrglw v19,v19,v21
+ vmrglw v23,v23,v25
+
+ vperm v10,v8,v9,v30 //00 10 20 30 40 50 60 70
+ vperm v11,v8,v9,v31 //01 11 21 31 41 51 61 71
+ vperm v12,v18,v22,v30 //02 12 22 32 42 52 62 72
+ vperm v13,v18,v22,v31 //03 13 23 33 43 53 63 73
+ vperm v20,v26,v27,v30 //04 14 24 34 44 54 64 74
+ vperm v21,v26,v27,v31 //05 15 25 35 45 55 65 75
+ vperm v22,v19,v23,v30 //06 16 26 36 46 56 66 76
+ vperm v23,v19,v23,v31 //07 17 27 37 47 57 67 77
+//end of transpose
+
+//~~~~~~~~~~ start cut here
+ vsubuhm v14,v10,v23 //id07
+ vsubuhm v15,v11,v12 //id12
+ vsubuhm v16,v13,v20 //id34
+ vsubuhm v17,v21,v22 //id56
+
+ vadduhm v10,v10,v23 //is07
+ vadduhm v11,v11,v12 //is12
+ vadduhm v12,v13,v20 //is34
+ vadduhm v13,v21,v22 //is56
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// some precalculations
+ vspltish v28,15
+ vadduhm v18,v10,v12 //is0734
+ vadduhm v19,v11,v13 //is1256
+
+ vsubuhm v20,v11,v13 //(is12 - is56)
+ vmulesh v22,v20,v4
+ vmulosh v23,v20,v4
+ vperm v8,v22,v23,v29 //(c4s4 * (is12 - is56)) - (is12 - is56)
+ vadduhm v8,v8,v20 //c4s4 * (is12 - is56)
+ vsrh v20,v20,v28 //get sign bit
+ vadduhm v8,v8,v20 //add in sign bit aka icommon_product1
+
+ vadduhm v20,v15,v17 //(id12 + id56)
+ vmulesh v22,v20,v4
+ vmulosh v23,v20,v4
+ vperm v9,v22,v23,v29 //(c4s4 * (is12 + is56)) - (is12 + is56)
+ vadduhm v9,v9,v20 //c4s4 * (is12 + is56)
+ vsrh v20,v20,v28 //get sign bit
+ vadduhm v9,v9,v20 //add in sign bit aka icommon_product2
+
+ vsubuhm v20,v15,v17 //irot_input_x = id12 - id56
+ vsubuhm v21,v10,v12 //irot_input_y = is07 - is34
+
+ vadduhm v22,v14,v8 //irot_input_x = icommon_product1 + id07
+ vadduhm v23,v16,v9 //irot_input_y = icommon_product2 + id34
+ vxor v24,v24,v24
+ vsubuhm v23,v24,v23 //irot_input_y = -(icommon_product2 + id34)
+
+ vsubuhm v24,v14,v8 //irot_input_x = id07 - icommon_product1
+ vsubuhm v25,v16,v9 //irot_input_y = id34 - icommon_product2
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[0], ip[4]
+ vadduhm v8,v18,v19 //(is0734 + is1256)
+ vsubuhm v9,v18,v19 //(is0734 - is1256)
+
+ vmulesh v18,v8,v4
+ vmulosh v19,v8,v4
+ vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256)
+ vadduhm v18,v18,v8 //(c4s4 * (is0734 + is1256))
+ vsrh v8,v8,v28
+ vadduhm v10,v18,v8 //add in sign bit aka ip[0]
+
+ vmulesh v18,v9,v4
+ vmulosh v19,v9,v4
+ vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256)
+ vadduhm v18,v18,v9 //(c4s4 * (is0734 + is1256))
+ vsrh v9,v9,v28
+ vadduhm v14,v18,v9 //add in sign bit aka ip[4]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[2], ip[6]
+ vmulesh v18,v20,v6
+ vmulosh v19,v20,v6
+ vperm v18,v18,v19,v29 //(c6s2 * (irot_input_x))
+ vsrh v8,v20,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v21,v2
+ vmulosh v19,v21,v2
+ vperm v18,v18,v19,v29 //(c2s6 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v21 //(c2s6 * (irot_input_y))
+ vsrh v9,v21,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v12,v8,v9 //ip[2]
+
+ vmulesh v18,v21,v6
+ vmulosh v19,v21,v6
+ vperm v18,v18,v19,v29 //(c6s2 * (irot_input_y))
+ vsrh v8,v21,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v20,v2
+ vmulosh v19,v20,v2
+ vperm v18,v18,v19,v29 //(c2s6 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v20 //(c2s6 * (irot_input_x))
+ vsrh v9,v20,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v16,v8,v9 //ip[6]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[1], ip[7]
+ vmulesh v18,v22,v1
+ vmulosh v19,v22,v1
+ vperm v18,v18,v19,v29 //(c1s7 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v22 //(c1s7 * (irot_input_x))
+ vsrh v8,v22,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v23,v7
+ vmulosh v19,v23,v7
+ vperm v18,v18,v19,v29 //(c7s1 * (irot_input_y))
+ vsrh v9,v23,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v11,v8,v9 //ip[1]
+
+ vmulesh v18,v22,v7
+ vmulosh v19,v22,v7
+ vperm v18,v18,v19,v29 //(c7s1 * (irot_input_x))
+ vsrh v8,v22,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v23,v1
+ vmulosh v19,v23,v1
+ vperm v18,v18,v19,v29 //(c1s7 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v23 //(c1s7 * (irot_input_y))
+ vsrh v9,v23,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v17,v8,v9 //ip[7]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[3], ip[5]
+ vmulesh v18,v24,v3
+ vmulosh v19,v24,v3
+ vperm v18,v18,v19,v29 //(c3s5 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v24 //(c3s5 * (irot_input_x))
+ vsrh v8,v24,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v25,v5
+ vmulosh v19,v25,v5
+ vperm v18,v18,v19,v29 //(c5s3 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v25 //(c5s3 * (irot_input_y))
+ vsrh v9,v25,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v13,v8,v9 //ip[3]
+
+ vmulesh v18,v24,v5
+ vmulosh v19,v24,v5
+ vperm v18,v18,v19,v29 //(c5s3 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v24 //(c5s3 * (irot_input_x))
+ vsrh v8,v24,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v25,v3
+ vmulosh v19,v25,v3
+ vperm v18,v18,v19,v29 //(c3s5 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v25 //(c3s5 * (irot_input_y))
+ vsrh v9,v25,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v15,v8,v9 //ip[5]
+//~~~~~~~~~~ end cut here
+
+//~~~~~~~~~~~~~~~ transpose back
+//start of transpose
+ vmrghh v18,v10,v11 //00 01 10 11 20 21 30 31
+ vmrglh v19,v10,v11 //40 41 50 51 60 61 70 71
+ vmrghh v20,v12,v13 //02 03 12 13 22 23 32 33
+ vmrglh v21,v12,v13 //42 43 52 53 62 63 72 73
+ vmrghh v22,v14,v15 //04 05 14 15 24 25 34 35
+ vmrglh v23,v14,v15 //44 45 54 55 64 65 74 75
+ vmrghh v24,v16,v17 //06 07 16 17 26 27 36 37
+ vmrglh v25,v16,v17 //46 47 56 57 66 67 76 77
+
+ vmrghw v8,v18,v20 //00 01 02 03 10 11 12 13
+ vmrghw v9,v22,v24 //04 05 06 07 14 15 16 17
+ vmrghw v26,v19,v21 //40 41 42 43 50 51 52 53
+ vmrghw v27,v23,v25 //44 45 46 47 54 55 56 57
+ vmrglw v18,v18,v20 //20 21 22 23 30 31 32 33
+ vmrglw v22,v22,v24 //24 25 26 27 34 35 36 37
+ vmrglw v19,v19,v21 //60 61 62 63 70 71 72 73
+ vmrglw v23,v23,v25 //64 65 66 67 74 75 76 77
+
+ vperm v10,v8,v9,v30 //00 01 02 03 04 05 06 07
+ vperm v11,v8,v9,v31 //10 11 12 13 14 15 16 17
+ vperm v12,v18,v22,v30 //20 21 22 23 24 25 26 27
+ vperm v13,v18,v22,v31 //30 31 32 33 34 35 36 37
+ vperm v20,v26,v27,v30 //40 41 42 43 44 45 46 47
+ vperm v21,v26,v27,v31 //50 51 52 53 54 55 56 57
+ vperm v22,v19,v23,v30 //60 61 62 63 64 65 66 67
+ vperm v23,v19,v23,v31 //70 71 72 73 74 75 76 77
+//end of transpose
+//~~~~~~~~~~ start cut here
+ vsubuhm v14,v10,v23 //id07
+ vsubuhm v15,v11,v12 //id12
+ vsubuhm v16,v13,v20 //id34
+ vsubuhm v17,v21,v22 //id56
+
+ vadduhm v10,v10,v23 //is07
+ vadduhm v11,v11,v12 //is12
+ vadduhm v12,v13,v20 //is34
+ vadduhm v13,v21,v22 //is56
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// some precalculations
+ vspltish v28,15
+ vadduhm v18,v10,v12 //is0734
+ vadduhm v19,v11,v13 //is1256
+
+ vsubuhm v20,v11,v13 //(is12 - is56)
+ vmulesh v22,v20,v4
+ vmulosh v23,v20,v4
+ vperm v8,v22,v23,v29 //(c4s4 * (is12 - is56)) - (is12 - is56)
+ vadduhm v8,v8,v20 //c4s4 * (is12 - is56)
+ vsrh v20,v20,v28 //get sign bit
+ vadduhm v8,v8,v20 //add in sign bit aka icommon_product1
+
+ vadduhm v20,v15,v17 //(id12 + id56)
+ vmulesh v22,v20,v4
+ vmulosh v23,v20,v4
+ vperm v9,v22,v23,v29 //(c4s4 * (is12 + is56)) - (is12 + is56)
+ vadduhm v9,v9,v20 //c4s4 * (is12 + is56)
+ vsrh v20,v20,v28 //get sign bit
+ vadduhm v9,v9,v20 //add in sign bit aka icommon_product2
+
+ vsubuhm v20,v15,v17 //irot_input_x = id12 - id56
+ vsubuhm v21,v10,v12 //irot_input_y = is07 - is34
+
+ vadduhm v22,v14,v8 //irot_input_x = icommon_product1 + id07
+ vadduhm v23,v16,v9 //irot_input_y = icommon_product2 + id34
+ vxor v24,v24,v24
+ vsubuhm v23,v24,v23 //irot_input_y = -(icommon_product2 + id34)
+
+ vsubuhm v24,v14,v8 //irot_input_x = id07 - icommon_product1
+ vsubuhm v25,v16,v9 //irot_input_y = id34 - icommon_product2
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[0], ip[4]
+ vadduhm v8,v18,v19 //(is0734 + is1256)
+ vsubuhm v9,v18,v19 //(is0734 - is1256)
+
+ vmulesh v18,v8,v4
+ vmulosh v19,v8,v4
+ vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256)
+ vadduhm v18,v18,v8 //(c4s4 * (is0734 + is1256))
+ vsrh v8,v8,v28
+ vadduhm v10,v18,v8 //add in sign bit aka ip[0]
+
+ vmulesh v18,v9,v4
+ vmulosh v19,v9,v4
+ vperm v18,v18,v19,v29 //(c4s4 * (is0734 + is1256)) - (is0734 + is1256)
+ vadduhm v18,v18,v9 //(c4s4 * (is0734 + is1256))
+ vsrh v9,v9,v28
+ vadduhm v14,v18,v9 //add in sign bit aka ip[4]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[2], ip[6]
+ vmulesh v18,v20,v6
+ vmulosh v19,v20,v6
+ vperm v18,v18,v19,v29 //(c6s2 * (irot_input_x))
+ vsrh v8,v20,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v21,v2
+ vmulosh v19,v21,v2
+ vperm v18,v18,v19,v29 //(c2s6 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v21 //(c2s6 * (irot_input_y))
+ vsrh v9,v21,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v12,v8,v9 //ip[2]
+
+ vmulesh v18,v21,v6
+ vmulosh v19,v21,v6
+ vperm v18,v18,v19,v29 //(c6s2 * (irot_input_y))
+ vsrh v8,v21,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v20,v2
+ vmulosh v19,v20,v2
+ vperm v18,v18,v19,v29 //(c2s6 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v20 //(c2s6 * (irot_input_x))
+ vsrh v9,v20,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v16,v8,v9 //ip[6]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[1], ip[7]
+ vmulesh v18,v22,v1
+ vmulosh v19,v22,v1
+ vperm v18,v18,v19,v29 //(c1s7 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v22 //(c1s7 * (irot_input_x))
+ vsrh v8,v22,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v23,v7
+ vmulosh v19,v23,v7
+ vperm v18,v18,v19,v29 //(c7s1 * (irot_input_y))
+ vsrh v9,v23,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v11,v8,v9 //ip[1]
+
+ vmulesh v18,v22,v7
+ vmulosh v19,v22,v7
+ vperm v18,v18,v19,v29 //(c7s1 * (irot_input_x))
+ vsrh v8,v22,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v23,v1
+ vmulosh v19,v23,v1
+ vperm v18,v18,v19,v29 //(c1s7 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v23 //(c1s7 * (irot_input_y))
+ vsrh v9,v23,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v17,v8,v9 //ip[7]
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// ip[3], ip[5]
+ vmulesh v18,v24,v3
+ vmulosh v19,v24,v3
+ vperm v18,v18,v19,v29 //(c3s5 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v24 //(c3s5 * (irot_input_x))
+ vsrh v8,v24,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v25,v5
+ vmulosh v19,v25,v5
+ vperm v18,v18,v19,v29 //(c5s3 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v25 //(c5s3 * (irot_input_y))
+ vsrh v9,v25,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vsubuhm v13,v8,v9 //ip[3]
+
+ vmulesh v18,v24,v5
+ vmulosh v19,v24,v5
+ vperm v18,v18,v19,v29 //(c5s3 * (irot_input_x)) - irot_input_x
+ vadduhm v18,v18,v24 //(c5s3 * (irot_input_x))
+ vsrh v8,v24,v28
+ vadduhm v8,v18,v8 //add in sign bit aka temp1
+ vmulesh v18,v25,v3
+ vmulosh v19,v25,v3
+ vperm v18,v18,v19,v29 //(c3s5 * (irot_input_y)) - irot_input_y
+ vadduhm v18,v18,v25 //(c3s5 * (irot_input_y))
+ vsrh v9,v25,v28
+ vadduhm v9,v18,v9 //add in sign bit aka temp2
+ vadduhm v15,v8,v9 //ip[5]
+//~~~~~~~~~~ end cut here
+
+//~~~~~~~~~~ write to destination
+ xor r8,r8,r8
+
+ stvx v10,r4,r8
+ addi r8,r8,16
+
+ stvx v11,r4,r8
+ addi r8,r8,16
+
+ stvx v12,r4,r8
+ addi r8,r8,16
+
+ stvx v13,r4,r8
+ addi r8,r8,16
+
+ stvx v14,r4,r8
+ addi r8,r8,16
+
+ stvx v15,r4,r8
+ addi r8,r8,16
+
+ stvx v16,r4,r8
+ addi r8,r8,16
+
+ stvx v17,r4,r8
+ }
+
+}
+OIL_DEFINE_IMPL_REF (fdct8x8theora_altivec, fdct8x8theora, OIL_IMPL_FLAG_ALTIVEC);
diff --git a/liboil/powerpc/recon8x8_altivec.c b/liboil/powerpc/recon8x8_altivec.c
new file mode 100644
index 0000000..1d85b2a
--- /dev/null
+++ b/liboil/powerpc/recon8x8_altivec.c
@@ -0,0 +1,716 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ these functions are copied from
+ http://svn.xiph.org/trunk/vp32/CoreLibs/CDXV/Vp31/Common/mac/OptFunctionsPPC.c
+*/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/simdpack/simdpack.h>
+
+OIL_DECLARE_CLASS (recon8x8_intra);
+OIL_DECLARE_CLASS (recon8x8_inter);
+OIL_DECLARE_CLASS (recon8x8_inter2);
+
+static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL;
+
+static void
+ /* r3, r4, r5 */
+recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds)
+{
+ asm {
+ //trying cache hints
+ lis r6,0x0108
+ or r6,r6,r5
+ dstst r3,r6,0
+
+ vspltish v1,7
+
+ vspltish v8,1
+ xor r6,r6,r6
+
+ lvx v0,r4,r6 //get 8 shorts
+ vslh v8,v8,v1 //now have 128
+ addi r6,r6,16
+
+ lvx v1,r4,r6 //get 8 shorts
+ vaddshs v0,v0,v8 //+=128
+ addi r6,r6,16
+
+ lvx v2,r4,r6 //get 8 shorts
+ vaddshs v1,v1,v8 //+=128
+ addi r6,r6,16
+ vpkshus v0,v0,v0 //convert to bytes
+
+ lvx v3,r4,r6 //get 8 shorts
+ vaddshs v2,v2,v8 //+=128
+ addi r6,r6,16
+ vpkshus v1,v1,v1 //convert to bytes
+
+ lvx v4,r4,r6 //get 8 shorts
+ vaddshs v3,v3,v8 //+=128
+ addi r6,r6,16
+ vpkshus v2,v2,v2 //convert to bytes
+
+ lvx v5,r4,r6 //get 8 shorts
+ vaddshs v4,v4,v8 //+=128
+ addi r6,r6,16
+ vpkshus v3,v3,v3 //convert to bytes
+
+ lvx v6,r4,r6 //get 8 shorts
+ vaddshs v5,v5,v8 //+=128
+ addi r6,r6,16
+ vpkshus v4,v4,v4 //convert to bytes
+
+ lvx v7,r4,r6 //get 8 shorts
+ xor r6,r6,r6
+ vaddshs v6,v6,v8 //+=128
+ vpkshus v5,v5,v5 //convert to bytes
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+ vaddshs v7,v7,v8 //+=128
+ vpkshus v6,v6,v6 //convert to bytes
+
+ vpkshus v7,v7,v7 //convert to bytes
+
+ li r7,4
+ vperm v0,v0,v0,v9
+
+ stvewx v0,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v0,r3,r7
+ add r7,r7,r5
+ vperm v1,v1,v1,v9
+
+ stvewx v1,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v1,r3,r7
+ add r7,r7,r5
+ vperm v2,v2,v2,v9
+
+ stvewx v2,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v2,r3,r7
+ add r7,r7,r5
+ vperm v3,v3,v3,v9
+
+ stvewx v3,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v3,r3,r7
+ add r7,r7,r5
+ vperm v4,v4,v4,v9
+
+ stvewx v4,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v4,r3,r7
+ add r7,r7,r5
+ vperm v5,v5,v5,v9
+
+ stvewx v5,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v5,r3,r7
+ add r7,r7,r5
+ vperm v6,v6,v6,v9
+
+ stvewx v6,r3,r6
+ add r6,r6,r5
+
+ lvsr v9,r3,r6 //load alignment vector for stores
+
+ stvewx v6,r3,r7
+ add r7,r7,r5
+ vperm v7,v7,v7,v9
+
+ stvewx v7,r3,r6
+
+ stvewx v7,r3,r7
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);
+
+static void /* r3, r4, r5, r6 */
+recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss)
+{
+ asm
+ {
+ //trying cache hints
+ lis r7,0x0108
+ or r7,r7,r6
+ dstst r3,r7,0
+
+ xor r7,r7,r7
+ li r8,16
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ vxor v9,v9,v9
+
+ lvx v10,r4,r7 //get 8 refs
+ add r7,r7,r6
+
+ lvx v0,r4,r8 //need another 16 bytes for misaligned data -- 0
+ add r8,r8,r6
+
+ lvx v11,r4,r7 //get 8 refs
+ vperm v10,v10,v0,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v1,r4,r8 //need another 16 bytes for misaligned data -- 1
+ add r8,r8,r6
+
+ lvx v12,r4,r7 //get 8 refs
+ vperm v11,v11,v1,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v2,r4,r8 //need another 16 bytes for misaligned data -- 2
+ add r8,r8,r6
+
+ lvx v13,r4,r7 //get 8 refs
+ vperm v12,v12,v2,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v3,r4,r8 //need another 16 bytes for misaligned data -- 3
+ add r8,r8,r6
+
+ lvx v14,r4,r7 //get 8 refs
+ vperm v13,v13,v3,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v4,r4,r8 //need another 16 bytes for misaligned data -- 4
+ add r8,r8,r6
+
+ lvx v15,r4,r7 //get 8 refs
+ vperm v14,v14,v4,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v5,r4,r8 //need another 16 bytes for misaligned data -- 5
+ add r8,r8,r6
+
+ lvx v16,r4,r7 //get 8 refs
+ vperm v15,v15,v5,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ add r7,r7,r6
+
+ lvx v6,r4,r8 //need another 16 bytes for misaligned data -- 6
+ add r8,r8,r6
+
+ lvx v17,r4,r7 //get 8 refs
+ vperm v16,v16,v6,v8
+
+ lvsl v8,r4,r7 //load alignment vector for refs
+ xor r7,r7,r7
+
+ lvx v7,r4,r8 //need another 16 bytes for misaligned data -- 7
+ add r8,r8,r6
+
+ lvx v0,r5,r7 //get 8 shorts
+ vperm v17,v17,v7,v8
+ addi r7,r7,16
+
+ lvx v1,r5,r7 //get 8 shorts
+ vmrghb v10,v9,v10 //unsigned byte -> unsigned half
+ addi r7,r7,16
+
+ lvx v2,r5,r7 //get 8 shorts
+ vmrghb v11,v9,v11 //unsigned byte -> unsigned half
+ vaddshs v0,v0,v10
+ addi r7,r7,16
+
+ lvx v3,r5,r7 //get 8 shorts
+ vmrghb v12,v9,v12 //unsigned byte -> unsigned half
+ vaddshs v1,v1,v11
+ addi r7,r7,16
+
+ lvx v4,r5,r7 //get 8 shorts
+ vmrghb v13,v9,v13 //unsigned byte -> unsigned half
+ vaddshs v2,v2,v12
+ addi r7,r7,16
+
+ lvx v5,r5,r7 //get 8 shorts
+ vmrghb v14,v9,v14 //unsigned byte -> unsigned half
+ vaddshs v3,v3,v13
+ addi r7,r7,16
+
+ lvx v6,r5,r7 //get 8 shorts
+ vmrghb v15,v9,v15 //unsigned byte -> unsigned half
+ vaddshs v4,v4,v14
+ addi r7,r7,16
+
+ lvx v7,r5,r7 //get 8 shorts
+ vmrghb v16,v9,v16 //unsigned byte -> unsigned half
+ vaddshs v5,v5,v15
+
+ vmrghb v17,v9,v17 //unsigned byte -> unsigned half
+ vaddshs v6,v6,v16
+
+ vpkshus v0,v0,v0
+ vaddshs v7,v7,v17
+
+ vpkshus v1,v1,v1
+ xor r7,r7,r7
+
+ vpkshus v2,v2,v2
+
+ vpkshus v3,v3,v3
+
+ vpkshus v4,v4,v4
+
+ vpkshus v5,v5,v5
+
+ vpkshus v6,v6,v6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+ vpkshus v7,v7,v7
+
+ li r8,4
+ vperm v0,v0,v0,v9 //adjust for writes
+
+ stvewx v0,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v0,r3,r8
+ add r8,r8,r6
+ vperm v1,v1,v1,v9
+
+ stvewx v1,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v1,r3,r8
+ add r8,r8,r6
+ vperm v2,v2,v2,v9
+
+ stvewx v2,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v2,r3,r8
+ add r8,r8,r6
+ vperm v3,v3,v3,v9
+
+ stvewx v3,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v3,r3,r8
+ add r8,r8,r6
+ vperm v4,v4,v4,v9
+
+ stvewx v4,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v4,r3,r8
+ add r8,r8,r6
+ vperm v5,v5,v5,v9
+
+ stvewx v5,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v5,r3,r8
+ add r8,r8,r6
+ vperm v6,v6,v6,v9
+
+ stvewx v6,r3,r7
+ add r7,r7,r6
+
+ lvsr v9,r3,r7 //load alignment vector for stores
+
+ stvewx v6,r3,r8
+ add r8,r8,r6
+ vperm v7,v7,v7,v9
+
+ stvewx v7,r3,r7
+
+ stvewx v7,r3,r8
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC);
+
+static void /* r3, r4, r5, r6, r7 */
+recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss)
+{
+ asm
+ {
+ //trying cache hints
+ lis r8,0x0108
+ or r8,r8,r7
+ dstst r3,r8,0
+
+ xor r8,r8,r8
+ li r9,16
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ vxor v9,v9,v9
+
+ lvx v10,r4,r8 //get 8 RefPtr1 -- 0
+ add r8,r8,r7
+
+ lvx v0,r4,r9 //need another 16 bytes for misaligned data -- 0
+ add r9,r9,r7
+
+ lvx v11,r4,r8 //get 8 RefPtr1 -- 1
+ vperm v10,v10,v0,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v1,r4,r9 //need another 16 bytes for misaligned data -- 1
+ vmrghb v10,v9,v10 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v12,r4,r8 //get 8 RefPtr1 -- 2
+ vperm v11,v11,v1,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v2,r4,r9 //need another 16 bytes for misaligned data -- 2
+ vmrghb v11,v9,v11 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v13,r4,r8 //get 8 RefPtr1 -- 3
+ vperm v12,v12,v2,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v3,r4,r9 //need another 16 bytes for misaligned data -- 3
+ vmrghb v12,v9,v12 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v14,r4,r8 //get 8 RefPtr1 -- 4
+ vperm v13,v13,v3,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v4,r4,r9 //need another 16 bytes for misaligned data -- 4
+ vmrghb v13,v9,v13 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v15,r4,r8 //get 8 RefPtr1 -- 5
+ vperm v14,v14,v4,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v5,r4,r9 //need another 16 bytes for misaligned data -- 5
+ vmrghb v14,v9,v14 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v16,r4,r8 //get 8 RefPtr1 -- 6
+ vperm v15,v15,v5,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v6,r4,r9 //need another 16 bytes for misaligned data -- 6
+ vmrghb v15,v9,v15 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v17,r4,r8 //get 8 RefPtr1 -- 7
+ vperm v16,v16,v6,v8
+
+ lvsl v8,r4,r8 //load alignment vector for RefPtr1
+ add r8,r8,r7
+
+ lvx v7,r4,r9 //need another 16 bytes for misaligned data -- 7
+ vmrghb v16,v9,v16 //unsigned byte -> unsigned half
+ add r9,r9,r7
+//--------
+ vperm v17,v17,v7,v8
+ xor r8,r8,r8
+ li r9,16
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ vmrghb v17,v9,v17 //unsigned byte -> unsigned half
+
+ lvx v20,r5,r8 //get 8 RefPtr2 -- 0
+ add r8,r8,r7
+
+ lvx v0,r5,r9 //need another 16 bytes for misaligned data -- 0
+ add r9,r9,r7
+
+ lvx v21,r5,r8 //get 8 RefPtr2 -- 1
+ vperm v20,v20,v0,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v1,r5,r9 //need another 16 bytes for misaligned data -- 1
+ vmrghb v20,v9,v20 //unsigned byte -> unsigned half
+ add r9,r9,r7
+
+ lvx v22,r5,r8 //get 8 RefPtr2 -- 2
+ vperm v21,v21,v1,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v2,r5,r9 //need another 16 bytes for misaligned data -- 2
+ vmrghb v21,v9,v21 //unsigned byte -> unsigned half
+ vadduhm v10,v10,v20
+ add r9,r9,r7
+
+ lvx v23,r5,r8 //get 8 RefPtr2 -- 3
+ vperm v22,v22,v2,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v3,r5,r9 //need another 16 bytes for misaligned data -- 3
+ vmrghb v22,v9,v22 //unsigned byte -> unsigned half
+ vadduhm v11,v11,v21
+ add r9,r9,r7
+
+ lvx v24,r5,r8 //get 8 RefPtr2 -- 4
+ vperm v23,v23,v3,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v4,r5,r9 //need another 16 bytes for misaligned data -- 4
+ vmrghb v23,v9,v23 //unsigned byte -> unsigned half
+ vadduhm v12,v12,v22
+ add r9,r9,r7
+
+ lvx v25,r5,r8 //get 8 RefPtr2 -- 5
+ vperm v24,v24,v4,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v5,r5,r9 //need another 16 bytes for misaligned data -- 5
+ vmrghb v24,v9,v24 //unsigned byte -> unsigned half
+ vadduhm v13,v13,v23
+ add r9,r9,r7
+
+ lvx v26,r5,r8 //get 8 RefPtr2 -- 6
+ vperm v25,v25,v5,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v6,r5,r9 //need another 16 bytes for misaligned data -- 6
+ vmrghb v25,v9,v25 //unsigned byte -> unsigned half
+ vadduhm v14,v14,v24
+ add r9,r9,r7
+
+ lvx v27,r5,r8 //get 8 RefPtr2 -- 7
+ vperm v26,v26,v6,v18
+
+ lvsl v18,r5,r8 //load alignment vector for RefPtr2
+ add r8,r8,r7
+
+ lvx v7,r5,r9 //need another 16 bytes for misaligned data -- 7
+ vmrghb v26,v9,v26 //unsigned byte -> unsigned half
+ vadduhm v15,v15,v25
+ add r9,r9,r7
+
+ vperm v27,v27,v7,v18
+ xor r8,r8,r8
+
+ vmrghb v27,v9,v27 //unsigned byte -> unsigned half
+ vadduhm v16,v16,v26
+
+ vadduhm v17,v17,v27
+ vspltish v8,1
+//--------
+ lvx v0,r6,r8 //get 8 shorts
+ vsrh v10,v10,v8
+ addi r8,r8,16
+
+ lvx v1,r6,r8 //get 8 shorts
+ vsrh v11,v11,v8
+ addi r8,r8,16
+
+ lvx v2,r6,r8 //get 8 shorts
+ vsrh v12,v12,v8
+ addi r8,r8,16
+
+ lvx v3,r6,r8 //get 8 shorts
+ vsrh v13,v13,v8
+ addi r8,r8,16
+
+ lvx v4,r6,r8 //get 8 shorts
+ vsrh v14,v14,v8
+ addi r8,r8,16
+
+ lvx v5,r6,r8 //get 8 shorts
+ vsrh v15,v15,v8
+ addi r8,r8,16
+
+ lvx v6,r6,r8 //get 8 shorts
+ vsrh v16,v16,v8
+ addi r8,r8,16
+
+ lvx v7,r6,r8 //get 8 shorts
+ vsrh v17,v17,v8
+ xor r8,r8,r8
+//--------
+ lvsr v9,r3,r8 //load alignment vector for stores
+ vaddshs v0,v0,v10
+
+ vaddshs v1,v1,v11
+ vpkshus v0,v0,v0
+
+ vaddshs v2,v2,v12
+ vpkshus v1,v1,v1
+
+ vaddshs v3,v3,v13
+ vpkshus v2,v2,v2
+
+ vaddshs v4,v4,v14
+ vpkshus v3,v3,v3
+
+ vaddshs v5,v5,v15
+ vpkshus v4,v4,v4
+
+ vaddshs v6,v6,v16
+ vpkshus v5,v5,v5
+
+ vaddshs v7,v7,v17
+ vpkshus v6,v6,v6
+
+ vpkshus v7,v7,v7
+
+ li r9,4
+ vperm v0,v0,v0,v9 //adjust for writes
+
+ stvewx v0,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v0,r3,r9
+ add r9,r9,r7
+ vperm v1,v1,v1,v9
+
+ stvewx v1,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v1,r3,r9
+ add r9,r9,r7
+ vperm v2,v2,v2,v9
+
+ stvewx v2,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v2,r3,r9
+ add r9,r9,r7
+ vperm v3,v3,v3,v9
+
+ stvewx v3,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v3,r3,r9
+ add r9,r9,r7
+ vperm v4,v4,v4,v9
+
+ stvewx v4,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v4,r3,r9
+ add r9,r9,r7
+ vperm v5,v5,v5,v9
+
+ stvewx v5,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v5,r3,r9
+ add r9,r9,r7
+ vperm v6,v6,v6,v9
+
+ stvewx v6,r3,r8
+ add r8,r8,r7
+
+ lvsr v9,r3,r8 //load alignment vector for stores
+
+ stvewx v6,r3,r9
+ add r9,r9,r7
+ vperm v7,v7,v7,v9
+
+ stvewx v7,r3,r8
+
+ stvewx v7,r3,r9
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);
diff --git a/liboil/powerpc/recon8x8_ppc.c b/liboil/powerpc/recon8x8_ppc.c
new file mode 100644
index 0000000..53c74b5
--- /dev/null
+++ b/liboil/powerpc/recon8x8_ppc.c
@@ -0,0 +1,526 @@
+/*
+ * LIBOIL - Library of Optimized Inner Loops
+ * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ these functions are copied from
+ http://svn.xiph.org/trunk/vp32/CoreLibs/CDXV/Vp31/Common/mac/OptFunctionsPPC.c
+*/
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <liboil/liboilfunction.h>
+#include <liboil/simdpack/simdpack.h>
+
+OIL_DECLARE_CLASS (recon8x8_intra);
+OIL_DECLARE_CLASS (recon8x8_inter);
+OIL_DECLARE_CLASS (recon8x8_inter2);
+
+static const __attribute__ ((aligned(8),used)) uint64_t V128 = 0x8080808080808080LL;
+
+static void
+ /* r3, r4, r5 */
+recon8x8_intra_ppc (uint8_t *dest, int16_t *change, int ds)
+{
+ asm
+ {
+ lwz r0,0(r3) ;//preload cache
+ mr r12,r4
+
+ addi r12,r12,128 ;//end ptr
+
+doLoop1:
+ lha r7,0(r4)
+
+ lha r8,2(r4)
+ addi r7,r7,128
+
+ lha r9,4(r4)
+ addi r8,r8,128
+ andi. r0,r7,0xff00
+ beq+ L1
+
+ srawi r0,r7,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r7,r0,0xff ;//now have 00 or ff
+
+L1:
+ lha r10,6(r4)
+ addi r9,r9,128
+ andi. r0,r8,0xff00
+ beq+ L2
+
+ srawi r0,r8,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r8,r0,0xff ;//now have 00 or ff
+
+L2:
+ lha r31,8(r4)
+ addi r10,r10,128
+ andi. r0,r9,0xff00
+ beq+ L3
+
+ srawi r0,r9,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r9,r0,0xff ;//now have 00 or ff
+
+L3:
+ lha r30,10(r4)
+ andi. r0,r10,0xff00
+ beq+ L4
+
+ srawi r0,r10,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r10,r0,0xff ;//now have 00 or ff
+
+L4:
+ lha r29,12(r4)
+ insrwi r10,r7,8,0
+ addi r31,r31,128
+
+ lwz r27,0(r3) ;//preload cache with dest
+ addi r30,r30,128
+ andi. r0,r31,0xff00
+ beq+ L5
+
+ srawi r0,r31,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r31,r0,0xff ;//now have 00 or ff
+
+L5:
+ lha r28,14(r4)
+ addi r29,r29,128
+ andi. r0,r30,0xff00
+ beq+ L6
+
+ srawi r0,r30,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r30,r0,0xff ;//now have 00 or ff
+
+L6:
+ addi r28,r28,128
+ andi. r0,r29,0xff00
+ beq+ L7
+
+ srawi r0,r29,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r29,r0,0xff ;//now have 00 or ff
+
+L7:
+ insrwi r10,r8,8,8
+ andi. r0,r28,0xff00
+ beq+ L8
+
+ srawi r0,r28,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r28,r0,0xff ;//now have 00 or ff
+
+L8:
+ insrwi r10,r9,8,16
+ insrwi r28,r31,8,0
+
+ stw r10,0(r3)
+ insrwi r28,r30,8,8
+ addi r4,r4,16
+
+ cmpw r4,r12
+ insrwi r28,r29,8,16
+
+ stw r28,4(r3)
+ add r3,r3,r5 ;//add in stride
+ bne doLoop1
+
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_intra_ppc, recon8x8_intra, OIL_IMPL_FLAG_ASM);
+
+static void /* r3, r4, r5, r6 */
+recon8x8_inter_ppc (uint8_t *dest, uint8_t *src, int16_t *change, int dss)
+{
+ asm
+ {
+ mr r26,r4
+ mr r4,r5 ;//same reg usage as intra
+
+ lwz r0,0(r3) ;//preload cache
+ mr r12,r4
+
+ addi r12,r12,128 ;//end ptr
+ mr r5,r6 ;//same reg usage as intra
+
+doLoop1:
+ lha r7,0(r4)
+
+ lbz r25,0(r26)
+
+ lha r8,2(r4)
+ add r7,r7,r25
+
+ lbz r25,1(r26)
+
+ lha r9,4(r4)
+ add r8,r8,r25
+ andi. r0,r7,0xff00
+ beq+ L1
+
+ srawi r0,r7,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r7,r0,0xff ;//now have 00 or ff
+
+L1:
+ lbz r25,2(r26)
+
+ lha r10,6(r4)
+ add r9,r9,r25
+ andi. r0,r8,0xff00
+ beq+ L2
+
+ srawi r0,r8,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r8,r0,0xff ;//now have 00 or ff
+
+L2:
+ lbz r25,3(r26)
+
+ lha r31,8(r4)
+ add r10,r10,r25
+ andi. r0,r9,0xff00
+ beq+ L3
+
+ srawi r0,r9,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r9,r0,0xff ;//now have 00 or ff
+
+L3:
+ lha r30,10(r4)
+ andi. r0,r10,0xff00
+ beq+ L4
+
+ srawi r0,r10,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r10,r0,0xff ;//now have 00 or ff
+
+L4:
+ lbz r25,4(r26)
+
+
+ lha r29,12(r4)
+ insrwi r10,r7,8,0
+ add r31,r31,r25
+
+ lbz r25,5(r26)
+
+ lwz r27,0(r3) ;//preload cache with dest
+ add r30,r30,r25
+ andi. r0,r31,0xff00
+ beq+ L5
+
+ srawi r0,r31,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r31,r0,0xff ;//now have 00 or ff
+
+L5:
+ lbz r25,6(r26)
+
+ lha r28,14(r4)
+ add r29,r29,r25
+ andi. r0,r30,0xff00
+ beq+ L6
+
+ srawi r0,r30,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r30,r0,0xff ;//now have 00 or ff
+
+L6:
+ lbz r25,7(r26)
+ add r26,r26,r5
+
+ add r28,r28,r25
+ andi. r0,r29,0xff00
+ beq+ L7
+
+ srawi r0,r29,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r29,r0,0xff ;//now have 00 or ff
+
+L7:
+ insrwi r10,r8,8,8
+ andi. r0,r28,0xff00
+ beq+ L8
+
+ srawi r0,r28,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r28,r0,0xff ;//now have 00 or ff
+
+L8:
+ insrwi r10,r9,8,16
+ insrwi r28,r31,8,0
+
+ stw r10,0(r3)
+ insrwi r28,r30,8,8
+ addi r4,r4,16
+
+ cmpw r4,r12
+ insrwi r28,r29,8,16
+
+ stw r28,4(r3)
+ add r3,r3,r5 ;//add in stride
+ bne doLoop1
+
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter_ppc, recon8x8_inter, OIL_IMPL_FLAG_ASM);
+
+static void /* r3, r4, r5, r6, r7 */
+recon8x8_inter2_ppc (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss)
+{
+ asm
+ {
+ mr r26,r4
+ mr r4,r6 ;//same reg usage as intra
+
+ lwz r0,0(r3) ;//preload cache
+ mr r25,r5
+ mr r12,r4
+
+ addi r12,r12,128 ;//end ptr
+ mr r5,r7 ;//same reg usage as intra
+
+ li r24,0x0101
+ li r23,0xfefe
+
+ insrwi r23,r23,16,0 ;//0xfefefefe
+ insrwi r24,r24,16,0 ;//0x01010101
+
+doLoop1:
+ lwz r22,0(r26) ;//get 4 ref pels
+
+ lwz r21,0(r25) ;//get 4 src pels
+
+ lha r7,0(r4)
+ and r20,r22,r21
+
+ lha r8,2(r4)
+ and r21,r21,r23 ;//mask low bits
+ and r22,r22,r23 ;//mask low bits
+
+ srwi r21,r21,1
+ srwi r22,r22,1
+
+ and r20,r20,r24 ;//save low bits
+ add r21,r21,r22
+
+ lwz r22,4(r26) ;//get 4 ref pels
+// or r20,r21,r20 ;//add in hot fudge
+ add r20,r21,r20 ;//add in hot fudge
+
+//xor r20,r20,r20
+
+ lwz r21,4(r25) ;//get 4 src pels
+ rlwinm r19,r20,8,24,31
+ rlwinm r18,r20,16,24,31
+
+ add r7,r7,r19
+
+ lha r9,4(r4)
+ add r8,r8,r18
+ andi. r0,r7,0xff00
+ beq+ L1
+
+ srawi r0,r7,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r7,r0,0xff ;//now have 00 or ff
+
+L1:
+ rlwinm r19,r20,24,24,31
+ rlwinm r18,r20,0,24,31
+
+ lha r10,6(r4)
+ add r9,r9,r19
+ andi. r0,r8,0xff00
+ beq+ L2
+
+ srawi r0,r8,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r8,r0,0xff ;//now have 00 or ff
+
+L2:
+ lha r31,8(r4)
+ add r10,r10,r18
+ andi. r0,r9,0xff00
+ beq+ L3
+
+ srawi r0,r9,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r9,r0,0xff ;//now have 00 or ff
+
+L3:
+ lha r30,10(r4)
+ andi. r0,r10,0xff00
+ beq+ L4
+
+ srawi r0,r10,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r10,r0,0xff ;//now have 00 or ff
+
+L4:
+ lha r29,12(r4)
+ insrwi r10,r7,8,0
+ and r20,r22,r21
+
+ and r21,r21,r23 ;//mask low bits
+ and r22,r22,r23 ;//mask low bits
+
+ srwi r21,r21,1
+ srwi r22,r22,1
+
+ and r20,r20,r24 ;//save low bits
+ add r21,r21,r22
+
+// or r20,r21,r20 ;//add in hot fudge
+ add r20,r21,r20 ;//add in hot fudge
+
+ rlwinm r19,r20,8,24,31
+ rlwinm r18,r20,16,24,31
+
+
+ add r31,r31,r19
+
+//xor r20,r20,r20
+
+ lwz r27,0(r3) ;//preload cache with dest
+ add r30,r30,r18
+ andi. r0,r31,0xff00
+ beq+ L5
+
+ srawi r0,r31,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r31,r0,0xff ;//now have 00 or ff
+
+L5:
+ rlwinm r19,r20,24,24,31
+ rlwinm r18,r20,0,24,31
+
+ lha r28,14(r4)
+ add r29,r29,r19
+ andi. r0,r30,0xff00
+ beq+ L6
+
+ srawi r0,r30,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r30,r0,0xff ;//now have 00 or ff
+
+L6:
+ add r26,r26,r5 ;//add stride to ref pels
+ add r25,r25,r5 ;//add stride to src pels
+
+ add r28,r28,r18
+ andi. r0,r29,0xff00
+ beq+ L7
+
+ srawi r0,r29,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r29,r0,0xff ;//now have 00 or ff
+
+L7:
+ insrwi r10,r8,8,8
+ andi. r0,r28,0xff00
+ beq+ L8
+
+ srawi r0,r28,15 ;//generate ff or 00
+
+ xori r0,r0,0xff ;//flip the bits
+
+ andi. r28,r0,0xff ;//now have 00 or ff
+
+L8:
+ insrwi r10,r9,8,16
+ insrwi r28,r31,8,0
+
+ stw r10,0(r3)
+ insrwi r28,r30,8,8
+ addi r4,r4,16
+
+ cmpw r4,r12
+ insrwi r28,r29,8,16
+
+ stw r28,4(r3)
+ add r3,r3,r5 ;//add in stride
+ bne doLoop1
+
+ }
+}
+
+OIL_DEFINE_IMPL_FULL (recon8x8_inter2_ppc, recon8x8_inter2, OIL_IMPL_FLAG_ASM);
diff --git a/liboil/simdpack/average2_u8.c b/liboil/simdpack/average2_u8.c
index fe07aa8..3ac8c6a 100644
--- a/liboil/simdpack/average2_u8.c
+++ b/liboil/simdpack/average2_u8.c
@@ -57,35 +57,27 @@ average2_u8_trick (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
{
unsigned int x, y, d;
-#if 0
- if (sstr1 == 1 && sstr2 == 1 && dstr == 1) {
- while (n > 0) {
- x = *(unsigned int *) src1;
- y = *(unsigned int *) src2;
- *(unsigned int *) dest = (((x ^ y) & 0xfefefefe) >> 1) + (x & y);
- src1 += 4;
- src2 += 4;
- dest += 4;
- n -= 4;
- }
- } else
-#endif
- {
- while (n > 0) {
- x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 *
- sstr1] << 8) | (src1[3 * sstr1]);
- y = (src2[0] << 24) | (src2[sstr2] << 16) | (src2[2 *
- sstr2] << 8) | (src2[3 * sstr2]);
- d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y);
- dest[0] = (d >> 24);
- dest[1*dstr] = (d >> 16);
- dest[2*dstr] = (d >> 8);
- dest[3*dstr] = (d >> 0);
- src1 += 4 * sstr1;
- src2 += 4 * sstr2;
- dest += 4 * dstr;
- n -= 4;
- }
+ while (n&3) {
+ *dest = (*src1 + *src2) >> 1;
+ src1 += sstr1;
+ src2 += sstr2;
+ dest += dstr;
+ n--;
+ }
+ while (n > 0) {
+ x = (src1[0] << 24) | (src1[sstr1] << 16) | (src1[2 *
+ sstr1] << 8) | (src1[3 * sstr1]);
+ y = (src2[0] << 24) | (src2[sstr2] << 16) | (src2[2 *
+ sstr2] << 8) | (src2[3 * sstr2]);
+ d = (((x ^ y) & 0xfefefefe) >> 1) + (x & y);
+ dest[0] = (d >> 24);
+ dest[1*dstr] = (d >> 16);
+ dest[2*dstr] = (d >> 8);
+ dest[3*dstr] = (d >> 0);
+ src1 += 4 * sstr1;
+ src2 += 4 * sstr2;
+ dest += 4 * dstr;
+ n -= 4;
}
}
@@ -112,6 +104,7 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
{
while (n & 0x3) {
*dest = (*src1 + *src2) >> 1;
+ dest += dstr;
src1 += sstr1;
src2 += sstr2;
n--;
@@ -139,32 +132,3 @@ average2_u8_unroll4 (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
OIL_DEFINE_IMPL (average2_u8_unroll4, average2_u8);
-#if 0 /* doesn't compile */
-#ifdef HAVE_CPU_I386
-/* This doesn't work with sstr!=2 or dstr!=2 */
-static void
-average2_u8_i386asm (uint8_t * dest, int dstr, uint8_t *src1, int sstr1,
- uint8_t *src2, int sstr2, int n)
-{
- __asm__ __volatile__ ("\n"
- " .p2align 4,,15 \n"
- "1: movzbl (%%ebx), %%eax \n"
- " addl $2, %%ebx \n"
- " movzbl (%%ecx), %%edx \n"
- " addl $2, %%ecx \n"
- " leal 1(%%edx, %%eax), %%eax \n"
- " sarl $1, %%eax \n"
- " movb %%al, 0(%%esi) \n"
- " incl %%esi \n"
- " decl %%edi \n"
- " jg 1b \n":"+b"
- (src1), "+c" (src2), "+D" (n), "+S" (dest)
- ::"eax", "edx");
-
-}
-
-OIL_DEFINE_IMPL (average2_u8_i386asm, average2_u8);
-#endif
-#endif
-
-
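The average2_u8 changes above follow a common pattern for fixing n%4!=0 endpoint bugs: peel off the leftover n mod 4 elements with a plain scalar loop first, then let the 4-at-a-time body run only on a multiple of four. A minimal standalone sketch of that pattern follows; it uses simplified unit-stride arguments, not liboil's actual average2_u8 signature.

/* Sketch of the remainder-first pattern: the scalar loop consumes
 * n % 4 elements up front, so the unrolled loop never runs past the
 * end of the arrays. */
#include <stdint.h>

static void
average2_u8_sketch (uint8_t *dest, const uint8_t *src1,
    const uint8_t *src2, int n)
{
  int i = 0;

  while (n & 3) {               /* scalar tail handled up front */
    dest[i] = (src1[i] + src2[i]) >> 1;
    i++;
    n--;
  }
  for (; n > 0; n -= 4) {       /* main loop: n is now a multiple of 4 */
    dest[i + 0] = (src1[i + 0] + src2[i + 0]) >> 1;
    dest[i + 1] = (src1[i + 1] + src2[i + 1]) >> 1;
    dest[i + 2] = (src1[i + 2] + src2[i + 2]) >> 1;
    dest[i + 3] = (src1[i + 3] + src2[i + 3]) >> 1;
    i += 4;
  }
}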
diff --git a/liboil/simdpack/scalarmult_i386.c b/liboil/simdpack/scalarmult_i386.c
index 232dc83..bade779 100644
--- a/liboil/simdpack/scalarmult_i386.c
+++ b/liboil/simdpack/scalarmult_i386.c
@@ -48,7 +48,7 @@ scalarmult_f32_sse (float *dest, int dstr, float *src, int sstr,
" movss (%0), %%xmm1 \n"
:
: "r" (t));
- for(i=0;i<n;i+=4) {
+ for(i=0;i<n-3;i+=4) {
t[0] = OIL_GET(src,sstr*(i + 0), float);
t[1] = OIL_GET(src,sstr*(i + 1), float);
t[2] = OIL_GET(src,sstr*(i + 2), float);
@@ -64,7 +64,9 @@ scalarmult_f32_sse (float *dest, int dstr, float *src, int sstr,
OIL_GET(dest,dstr*(i + 2), float) = t[2];
OIL_GET(dest,dstr*(i + 3), float) = t[3];
}
-
+ for(;i<n;i++){
+ OIL_GET(dest,dstr*i, float) = *val * OIL_GET(src,sstr*i, float);
+ }
}
OIL_DEFINE_IMPL_FULL (scalarmult_f32_sse, scalarmult_f32, OIL_IMPL_FLAG_SSE);
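The scalarmult change is the mirror image of the average2_u8 fix: the SSE body consumes four floats per iteration, so the vector loop must stop at i < n - 3 and a scalar loop finishes whatever is left. Below is a plain-C sketch of the same bound-plus-cleanup structure, ignoring the strides and the inline SSE, which stay exactly as in the diff.

/* Sketch of the loop-bound fix: the 4-wide body only runs while at
 * least 4 elements remain, and a scalar loop handles the tail. */
static void
scalarmult_f32_sketch (float *dest, const float *src, int n, float val)
{
  int i;

  for (i = 0; i < n - 3; i += 4) {   /* was i < n: overran for n % 4 != 0 */
    dest[i + 0] = src[i + 0] * val;
    dest[i + 1] = src[i + 1] * val;
    dest[i + 2] = src[i + 2] * val;
    dest[i + 3] = src[i + 3] * val;
  }
  for (; i < n; i++)                 /* scalar cleanup for the last n % 4 */
    dest[i] = src[i] * val;
}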
diff --git a/testsuite/stride.c b/testsuite/stride.c
index 38a6681..e14b230 100644
--- a/testsuite/stride.c
+++ b/testsuite/stride.c
@@ -74,6 +74,8 @@ int main (int argc, char *argv[])
continue;
}
oil_test_set_iterations (test, 1);
+ test->n = 1 + oil_rand_u8();
+ test->m = 1 + oil_rand_u8();
oil_test_check_impl (test, klass->reference_impl);
for(j=0;j<OIL_ARG_LAST;j++){
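The stride.c tweak replaces the test's fixed n and m with 1 + oil_rand_u8(), i.e. values in 1..256, so the reference-vs-implementation comparison also runs on lengths that are not multiples of 4 and exercises the new tail-handling paths. A hypothetical standalone illustration of the same idea, with standard C rand() standing in for liboil's test RNG:

/* Hypothetical illustration only: a fixed n such as 16 never hits the
 * n % 4 != 0 cleanup code, while a randomized n in 1..256 usually does. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main (void)
{
  srand ((unsigned) time (NULL));
  int n = 1 + rand () % 256;    /* mirrors 1 + oil_rand_u8() in the diff */
  printf ("testing with n = %d (n %% 4 = %d)\n", n, n % 4);
  return 0;
}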