author     David Schleef <ds@schleef.org>    2005-06-17 21:51:58 +0000
committer  David Schleef <ds@schleef.org>    2005-06-17 21:51:58 +0000
commit     d64fd56082933579566d4bf45d3f421d3eba8392 (patch)
tree       ad4cbc3d76daa914252999ddac96ffcaee17a8e2
parent     f811d988ddc37ca0592dee4024629dded03aef9f (diff)
download   liboil-d64fd56082933579566d4bf45d3f421d3eba8392.tar.gz
* configure.ac: snarf LIBMOTOVEC because it has a compatible
license.
* COPYING:
* liboil/Makefile.am:
* liboil/motovec/Makefile.am:
* liboil/motovec/README:
* liboil/motovec/checksum_vec.S:
* liboil/motovec/string_vec.S:
* liboil/motovec/vec_csum.S:
* liboil/motovec/vec_memcmp.S:
* liboil/motovec/vec_memcpy.S:
* liboil/motovec/vec_memset.S:
* liboil/motovec/vec_strcpy.S:
-rw-r--r--  COPYING                        |   77
-rw-r--r--  ChangeLog                      |   16
-rw-r--r--  configure.ac                   |    2
-rw-r--r--  liboil/Makefile.am             |    3
-rw-r--r--  liboil/motovec/Makefile.am     |   17
-rw-r--r--  liboil/motovec/README          |  345
-rw-r--r--  liboil/motovec/checksum_vec.S  |  627
-rw-r--r--  liboil/motovec/string_vec.S    | 1375
-rw-r--r--  liboil/motovec/vec_csum.S      |  724
-rw-r--r--  liboil/motovec/vec_memcmp.S    |  340
-rw-r--r--  liboil/motovec/vec_memcpy.S    |  876
-rw-r--r--  liboil/motovec/vec_memset.S    |  553
-rw-r--r--  liboil/motovec/vec_strcpy.S    |  273
13 files changed, 5206 insertions, 22 deletions
@@ -1,23 +1,58 @@ -Copyright (c) David A. Schleef <ds@schleef.org> -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +The majority of the source code and the collective work is subject +to the following license: + + Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org> + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + +The source code in the liboil/motovec directory is subject to the +following license: + + Copyright Motorola, Inc. 2003 + ALL RIGHTS RESERVED + + You are hereby granted a copyright license to use, modify, and + distribute the SOFTWARE so long as this entire notice is retained + without alteration in any modified and/or redistributed versions, + and that such modified versions are clearly identified as such. + No licenses are granted by implication, estoppel or otherwise under + any patents or trademarks of Motorola, Inc. + + The SOFTWARE is provided on an "AS IS" basis and without warranty. + To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS + ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR + PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH + REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS + THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. + + To the maximum extent permitted by applicable law, IN NO EVENT SHALL + MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER + (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF + BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS + INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR + INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility + for the maintenance and support of the SOFTWARE. 
-THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. @@ -1,5 +1,21 @@ 2005-06-17 David Schleef <ds@schleef.org> + * configure.ac: snarf LIBMOTOVEC because it has a compatible + license. + * COPYING: + * liboil/Makefile.am: + * liboil/motovec/Makefile.am: + * liboil/motovec/README: + * liboil/motovec/checksum_vec.S: + * liboil/motovec/string_vec.S: + * liboil/motovec/vec_csum.S: + * liboil/motovec/vec_memcmp.S: + * liboil/motovec/vec_memcpy.S: + * liboil/motovec/vec_memset.S: + * liboil/motovec/vec_strcpy.S: + +2005-06-17 David Schleef <ds@schleef.org> + * liboil/colorspace/Makefile.am: new files * liboil/colorspace/argb_paint.c: remove temporary classes * liboil/colorspace/composite.c: new diff --git a/configure.ac b/configure.ac index 5a28079..1f43945 100644 --- a/configure.ac +++ b/configure.ac @@ -20,6 +20,7 @@ dnl - interfaces removed -> AGE = 0 LIBOIL_LIBVERSION="1:0:1" AC_SUBST(LIBOIL_LIBVERSION) AM_PROG_LIBTOOL +AM_PROG_AS AC_CONFIG_SRCDIR([liboil/liboil.h]) @@ -204,6 +205,7 @@ liboil/conv/Makefile liboil/copy/Makefile liboil/dct/Makefile liboil/md5/Makefile +liboil/motovec/Makefile liboil/jpeg/Makefile liboil/simdpack/Makefile liboil/sse/Makefile diff --git a/liboil/Makefile.am b/liboil/Makefile.am index 5711500..e245e7c 100644 --- a/liboil/Makefile.am +++ b/liboil/Makefile.am @@ -1,7 +1,7 @@ pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil -SUBDIRS = colorspace conv copy dct jpeg simdpack md5 utf8 sse +SUBDIRS = colorspace conv copy dct jpeg md5 motovec simdpack sse utf8 lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la @@ -27,6 +27,7 @@ liboilfunctions_la_LIBADD = \ dct/libdct.la \ jpeg/libjpeg.la \ md5/libmd5.la \ + motovec/libmotovec.la \ simdpack/libsimdpack.la \ sse/libsse.la \ utf8/libutf8.la \ diff --git a/liboil/motovec/Makefile.am b/liboil/motovec/Makefile.am new file mode 100644 index 0000000..a56fb98 --- /dev/null +++ b/liboil/motovec/Makefile.am @@ -0,0 +1,17 @@ + +noinst_LTLIBRARIES = libmotovec.la + +c_sources = + +if HAVE_CPU_POWERPC +powerpc_sources = \ + vec_memcpy.S +else +powerpc_sources = +endif + +libmotovec_la_SOURCES = \ + $(powerpc_sources) +libmotovec_la_LIBADD = +libmotovec_la_CFLAGS = $(LIBOIL_CFLAGS) + diff --git a/liboil/motovec/README b/liboil/motovec/README new file mode 100644 index 0000000..a458db4 --- /dev/null +++ b/liboil/motovec/README @@ -0,0 +1,345 @@ +//------------------------------------------------------------------ +// file: readme.txt +// Readme to accompany libmotovec.a +//------------------------------------------------------------------ + +Rev 0.30 release - 5/28/2003 by Chuck Corley + +This release includes two new files, string_vec.S and checksum_vec.s, +which you could paste into the Linux kernel files: +/arch/ppc/lib/string.S and +/arch/ppc/lib/checksum.S +if you wanted to employ AltiVec in the Linux kernel. 
We used the
+memcpy_vec and csum_partial_copy_generic_vec functions from these
+files only in the modified versions of /net/core/skbuf.c and
+/net/core/iovec.c to give us the networking performance boost in
+Linux described in the SNDF presentation "Accelerating Networking Data
+Movement Using the AltiVec® Technology" at www.motorola.com/sndf under
+Dallas-2003/Host Processors (H1110). Also see the white paper
+"Enhanced TCP/IP Performance with AltiVec Technology" at
+e-www.motorola.com/brdata/PDFDB/docs/ALTIVECTCPIPWP.pdf
+
+These files contain the following functions
+string.S contains:            string_vec.S contains:
+memcpy                        memcpy_vec
+bcopy                         bcopy_vec
+memmove                       memmove_vec
+backwards_memcpy              backwards_memcpy_vec
+memset                        memset_vec
+memcmp                        memcmp_vec
+memchr                        (coming soon)
+cacheable_memcpy              cacheable_memcpy_vec
+cacheable_memzero             cacheable_memzero_vec
+strcpy                        strcpy_vec
+strncpy                       (coming soon)
+strcat                        (coming soon)
+strcmp                        strcmp_vec
+strlen                        strlen_vec
+__copy_tofrom_user*           __copy_tofrom_user_vec*
+__clear_user*                 __clear_user_vec*
+__strncpy_from_user*          (coming soon)
+__strnlen_user*               (coming soon)
+
+checksum.S contains:          checksum_vec.S contains:
+csum_partial                  csum_partial_vec
+csum_partial_copy_generic*    csum_partial_copy_generic_vec
+ip_fast_csum                  (unlikely to benefit)
+csum_tcpudp_magic             (unlikely to benefit)
+
+*these functions have ex_table entries for handling memory access
+exceptions in the kernel. The AltiVec versions were functionally
+tested by hand.
+
+csum_partial_copy_generic_vec and csum_partial_vec previously
+assembled into libmotovec.a have been removed since they are in the file
+above. We are finding that selective use of the *_vec functions in
+the OS kernel is much "safer" than wholescale replacement of the libc
+library. libmotovec.a returns to being exclusively a performance-enhancing
+library of libc functions that can be safely linked with user application
+code to test the performance of AltiVec.
+
+My presentation for SNDF-Europe includes performance comparisons
+of the scalar versus vector versions of the above functions. It should
+be available on the SNDF website soon. It also includes an updated
+explanation of memcpy without the potential incoherency problem discussed
+below.
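To make "linking user application code against libmotovec.a" concrete, here is a
minimal sketch of such a test program. The buffer size, the deliberately misaligned
offsets, and the timing method are illustrative assumptions, not anything this README
prescribes; link it with libmotovec.a ahead of the compiler's libc (as in the command
lines shown later in this file) so the memcpy call resolves to the AltiVec version.

    /* test_memcpy.c - illustrative only */
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define BUF_SIZE   1460        /* roughly an Ethernet payload */
    #define ITERATIONS 100000

    static unsigned char src[BUF_SIZE + 16];
    static unsigned char dst[BUF_SIZE + 16];

    int main(void)
    {
        clock_t start, stop;
        int i;

        for (i = 0; i < BUF_SIZE; i++)
            src[i + 3] = (unsigned char)i;

        start = clock();
        for (i = 0; i < ITERATIONS; i++)
            memcpy(dst + 1, src + 3, BUF_SIZE);   /* misaligned on purpose */
        stop = clock();

        printf("%d copies of %d bytes: %.3f seconds\n", ITERATIONS, BUF_SIZE,
               (double)(stop - start) / CLOCKS_PER_SEC);
        return memcmp(dst + 1, src + 3, BUF_SIZE) != 0;
    }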
+ +So this release contains in libmotovec.a: +memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memset.o from vec_memset.S Rev 0.10 dated 5/01/2003 +bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003 +strcmp.o from vec_strcmp.S Rev 0.00 dated 3/03/2002 +strlen.o from vec_strlen.S Rev 0.00 dated 12/26/2002 + +And in string.s: +memcpy_vec derived from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy_vec derived from vec_memcpy.S Rev 0.30 +memmove_vec derived from vec_memcpy.S Rev 0.30 +backwards_memcpy_vec derived from vec_memcpy.S Rev 0.30 +memset_vec derived from vec_memset.S Rev 0.10 dated 5/01/2003 +memcmp_vec derived from vec_memcmp.S Rev 0.00 +memchr (coming soon) +cacheable_memcpy_vec derived from vec_memcpy.S Rev 0.30 +cacheable_memzero_vec derived from vec_memset.S Rev 0.10 +strcpy_vec derived from vec_strcpy.S Rev 0.10 +strncpy_vec (coming soon) +strcat_vec (coming soon) +strcmp_vec derived from vec_strcmp.S Rev 0.00 (not released) +strlen_vec derived from vec_strlen.S Rev 0.00 (not released) +__copy_tofrom_user_vec* derived from vec_memcpy.S Rev 0.30 +__clear_user_vec* derived from vec_memcpy.S Rev 0.30 +__strncpy_from_user_vec* (coming soon) +__strnlen_user_vec* (coming soon) +*with ex_table and exception code + +And in checksum.s: +csum_partial_vec derived from vec_csum.S Rev 0.0 dated 4/19/03 +csum_partial_copy_generic_vec from vec_csum.S Rev 0.0 + +string_vec.S and checksum_vec.S are only known to assemble with gcc 2.95 +and gcc 3.3+. Should work with other gcc compilers but may need +editing to be compatible with non-gcc compilers. + +Rev 0.20 release - 5/12/2003 by Chuck Corley + +Thanks to all of you who attended SNDF. My presentation "Implementing +and Using the Motorola AltiVec Libraries" is available for downloading +at www.motorola.com/sndf under Dallas-2003/Host Processors (H1109). + +During the presentation DS from Lucent pointed out that the way I was +bringing the beginning and ending destination Quad Words (vectors) into +the registers for merging with the permuted source made the +"uninvolved" destination bytes vulnerable to potential incoherency if +some interrupting process changed those bytes while I was holding them +in a register. While the possibility seemed small, I have rewritten the +code to avoid this potential problem. The result actually is slightly +faster than the original for small buffers. + +So this release contains: +memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memset.o from vec_memset.S Rev 0.10 dated 5/01/2003 +bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003 +csum_partial_copy_generic_vec from vec_csum.S Rev 0.0 dated 4/19/03 +csum_partial_vec from vec_csum.S Rev 0.0 dated 4/19/03 + +The latter two additions were assembled into libmotovec.a despite the +fact they are not standard libc functions. Rather they are the Altivec +enabled equivalents of functions by the same name from the linux +source tree (Linux 2.4.17). While we are pursuing how to get these +functions incorporated into Linux, here they are assembled and in +source form if you are building your own version of linux. 
The use +of an earlier version of csum_partial_copy_generic_vec and memcpy_vec is +documented to speed up TCP/IP and UDP transfers in Jacob Pan's SNDF +presentation "Accelerating Networking Data Movement Using AltiVec +Technology" (H1110) available at the website above. csum_partial +does not appear to be called with large enough buffer sizes in linux +to warrant using the vectorized version. + +I am also releasing the source for memset and bzero in this release. +strcpy, strlen, strncpy, strcmp, memcmp, strcat, and memchr are still +on my list to do - soon. + +Rev 0.10 release - 3/13/2003 by Chuck Corley + +The presence of dcbz in the 32 byte loop of memcpy (or memmove) +causes an alignment exception to non-cacheable memory (MPC7410 User's +Manual p. 4-20 and MPC7450 User's Manual p. 4-25) so it was +removed in this release. dcbz instructions were not present in +memset in any of these releases. That fixed the alignment problem +but hurt the performance some; then it was "rediscovered" that +dcba would have been a better choice anyway as it does not cause +an exception; it would just be noop'ed. So this release substitutes +dcba for dcbz. + +This release contains improvements in memcpy that should be +documented in an application note which is still not finished but +are being pretty nicely documented for SNDF presentation H1109. + +The memcpy was further loop unrolled to provide a 128B loop for +large buffers (>256 bytes) and the data stream touch instruction +was added. It may still be possible to improve the tuning of +the dst instruction, particularly in memmove, but this release +is worthy of reving the number to the next significant revision. + +I've developed a new metric which will be explained at SNDF in +Dallas, TX, March 23-26, 2003. As the number of bytes in a +buffer gets larger, the memcpy routine settles into repetitions +of the inner loop. 32 bytes were moved in the inner loop of +Rev 0.0x and 128 bytes are moved in the inner loop of Rev 0.10. +And the number of processor clocks per inner loop can be shown +to approach the minimum possible. Therefore the new metric +measures the incremental transfer rate for the inner loop after +a reasonable number (>512) of bytes have been moved. This will +not be the bytes transferred per second because there were some +less efficient transfers at start-up but this is the transfer +rate that the routine is asymptotically approaching as the buffer +gets big (regularly testing to 1460 bytes). + +Here is that metric for several cases: + +Case 1: For gcc's lib c memcpy when buffers are not word aligned +Case 2: For gcc's lib c memcpy when buffers are word aligned +Case 3: For Rev 0.01 of memcpy with Altivec irrespective of alignment +Case 4: For Rev 0.10 of memcpy with Altivec irrespective of alignment + +Numbers are provided for the cold DCache and warm DCache. Code is +assumed to always be resident in the ICache as would be expected here +where the inner loop has run multiple times. 
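The metric in the tables below appears to work out to roughly (bytes moved per
inner-loop iteration) / (clocks per iteration) x core clock; for example, the
warm-DCache Rev 0.10 row for the MPC7410 at 400 MHz gives 128/41 x 400e6, or
about 1250 MB/sec. A quick sanity check, assuming that reading of the table is
correct (the figures plugged in here are taken from that row, nothing more):

    /* inner-loop transfer-rate check (illustrative assumptions only) */
    #include <stdio.h>

    int main(void)
    {
        double bytes_per_loop  = 128.0;   /* Rev 0.10 inner loop */
        double clocks_per_loop = 41.0;    /* warm DCache, MPC7410 */
        double core_hz         = 400e6;   /* 400 MHz part */

        printf("asymptotic rate ~ %.0f MB/sec\n",
               bytes_per_loop / clocks_per_loop * core_hz / 1e6);
        return 0;
    }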
+
+                                  COLD DCACHE           WARM DCACHE
+ FOR THE MPC7410@400/100        Insts Clks MB/Sec    Insts Clks MB/Sec
+Case 1: gcc_NWA (1 byte/loop)      6    6     71        6    3    133
+Case 2: gcc_WA (16 B/loop)        12   62    103       12    8    800
+Case 3: vec_memcpy Rev 0.01       12   60    213       12    7   1961
+Case 4: vec_memcpy Rev 0.10       46  125    410       46   41   1250
+
+
+                                  COLD DCACHE           WARM DCACHE
+ FOR THE MPC7445@1GHz/133       Insts Clks MB/Sec    Insts Clks MB/Sec
+Case 1: gcc_NWA                    6    8    122        6    3    350
+Case 2: gcc_WA                    12  104    153       12   12   1333
+Case 3: vec_memcpy Rev 0.01       12  110    292       12    7   4413
+Case 4: vec_memcpy Rev 0.10       46  247    518       46   35   3666
+
+Perhaps you notice that we are trading off Warm DCache performance to
+improve the Cold DCache case. There are other interesting tradeoffs
+in going from 32 byte inner loop to 128 bytes. And in using the dcba
+instruction - or not. In other words, the numbers for vec_memcpy above
+are not the highest possible in the Warm DCache case but they look like
+a good compromise which most benefits the Cold DCache case. More at SNDF
+(or eventually in the app note) ...
+
+I am releasing the source code to vec_memcpy.S with this release so
+if you don't like the tradeoff above you can make your own selection. It
+successfully assembles for me with Codewarrior, Diab, Green Hills, gcc,
+and Metaware. It is nicely commented but could use more documentation.
+I will specifically be explaining it in SNDF presentation H1109.
+
+*************************************************************************
+
+Rev 0.01 release - 2/17/2003 by Chuck Corley
+
+Fixed a problem at Last_ld_fwd: that caused a load beyond a page
+boundary and resulting segment fault in Linux. Last source load
+of SRC+BK in vec_memcpy could be > SRC+BC-1. Also found and fixed
+an error where the Quick and Dirty (QND) code that was in there for
+dst wasn't completely commented out. Plan to enable dst soon.
+Probably loop unroll to 128 bytes first though.
+
+**********************************************************************
+
+Initial Release - 2/10/2003 by Chuck Corley
+
+Contains the libc functions:
+memcpy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+bcopy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memmove.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memset.o from vec_memset.S Rev 0.0 dated 2/09/2003
+bzero.o from vec_memset.S Rev 0.0 dated 2/09/2003
+
+These functions are implemented in AltiVec but are still not as fast
+as we know how to make them. Watch this site for frequent revisions
+over the next several months.
+
+We are in the process of creating application notes to explain the
+source code and the performance associated with these library functions;
+watch this site for those application notes to be added. A logical
+deadline for completion of this work is the Smart Network Developers
+Forum in Dallas, TX, March 23-26, 2003, where we will be discussing this
+library, its performance, and application.
+
+We will also be adding the following libc functions in the very near future:
+strcpy
+strcmp
+strlen
+memcmp
+memchr
+strncpy
+
+We also have preliminary work completed on the following functions
+found in Linux and have to figure out how to distribute them:
+csum_partial
+csum_partial_generic
+__copy_tofrom_user
+page_copy
+
+We believe that these libraries will improve performance on Motorola G4
+processors for applications that make heavy use of the included functions.
+On non-G4 microprocessors they will cause illegal operation exceptions
+because those processors do not support AltiVec.
+
+To use this library, you must:
+1.
Include it on the linker command line prior to the compiler's libc +library. + +Examples: +For gcc: +powerpc-eabisim-ld -T../../spprt/gcc_dink.script -Qy -dn -Bstatic ../../spprt/gcc_obj/gcc_crt0.o ../../spprt/gcc_obj/dtime.o ../../spprt/gcc_obj/cache.o ../../spprt/gcc_obj/Support.o ../../spprt/gcc_obj/dinkusr.o ../../spprt/gcc_obj/perfmon.o gcc_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a c:/cygwin/Altivec/powerpc-eabisim\lib\libm.a --start-group -lsim -lc --end-group -o gccBM.elf + +For Diab: +dld ../../spprt/diab_dink.dld ../../spprt/diab_obj/diab_crt0.o ../../spprt/diab_obj/dtime.o ../../spprt/diab_obj/cache.o ../../spprt/diab_obj/Support.o ../../spprt/diab_obj/dinkusr.o ../../spprt/diab_obj/perfmon.o diab_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/diab/5.0.3/PPCEH:c:/diab/5.0.3/PPCE/simple:c:/diab/5.0.3/PPCE:c:/diab/5.0.3/PPCEN -lc -lm -o diabBM.elf + +For Green Hills: +elxr -T../../spprt/ghs_dink.lnk ../../spprt/ghs_obj/ghs_crt0.o ../../spprt/ghs_obj/dtime.o ../../spprt/ghs_obj/cache.o ../../spprt/ghs_obj/Support.o ../../spprt/ghs_obj/dinkusr.o ../../spprt/ghs_obj/perfmon.o ghs_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:\GHS\ppc36\ppc -lansi -lsys -larch -lind -o ghsBM.elf + +For CodeWarrior: +mwldeppc -lcf ../../spprt/cw_dink.lcf -nostdlib -fp fmadd -proc 7450 ../../spprt/cw_obj/cw_crt0.o ../../spprt/cw_obj/dtime.o ../../spprt/cw_obj/cache.o ../../spprt/cw_obj/Support.o ../../spprt/cw_obj/dinkusr.o ../../spprt/cw_obj/perfmon.o cw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Runtime/Lib/ -lRuntime.PPCEABI.H.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Msl/MSL_C/Ppc_eabi/Lib/ -lMSL_C.PPCEABI.bare.H.a -o cwBM.elf + +For Metaware: +ldppc ../../spprt/mw_link.txt -Bnoheader -Bhardalign -dn -q -Qn ../../spprt/mw_obj/mw_crt0.o ../../spprt/mw_obj/dtime.o ../../spprt/mw_obj/cache.o ../../spprt/mw_obj/Support.o ../../spprt/mw_obj/dinkusr.o ../../spprt/mw_obj/perfmon.o mw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/hcppc/lib/be/fp -lct -lmwt -o mwBM.elf + + +2. Enable AltiVec in the Machine State Processor (MSR) register of the +target machine. + +Example: +AltiVec_enable: + mfmsr r4 // Get current MSR contents + oris r4,r4,0x0200 // Set the AltiVec enable bit MSR[6] + mtmsr r4 // Write to MSR + isync // Context synchronizing instr after mtmsr + + +3. If the AltiVec vector register set is used in more than one context, +the AltiVec registers must be saved and restored on context switches. The +AltiVec EABI extensions define a register (SPR 256 - the VRSAVE register) +which can be used to reduce the number of vector registers which have to +be saved to only those in use. This library is currently compiled +without that VRSAVE feature enabled, so all 32 vector registers will have +to be saved and restored. We are currently thinking that this is a more +efficient practice anyway and note that Linux and several RTOSes are taking +that approach in saving and restoring the vector registers. We have observed +very little performance difference in Linux for saving all of the AltiVec +registers on a context switch versus saving only 8. And saving all of the +registers is a less than 1% total impact on performance. + +4. There is one worrisome problem with this library when run on the MPC745X +microprocessors in the 60x bus mode. 
The MPC7450 Family User's Manual +(Section 7.3) states that "The 60x bus protocol does not support a 16-byte +bus transaction. Therefore, cache-inhibited AltiVec loads, stores, and +write-through stores take an alignment exception. This requires a re-write +of the alignment exception routines in software that supports AltiVec quad +word access in 60x bus mode on the MPC745X." + +This says that if the user is attempting to use these routines in a +cache-inhibited area of memory on a MPC745X in 60x bus mode, it will require +special alignment exception handling software. We are currently implementing +that software for the Linux OS. Alternatively, the user can restrict this +library's use to areas of memory known to be cacheable. + +This library was built using gcc, but as shown in the examples of step 1 above, +links and executes with Diab5.0, Green Hills 3.6, Codewarrior EPPC 6.1, and +Metaware 4.5. The gcc archiver was used to create it in the following +command lines: + +powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memcpy.o -D__GNUC__ -DLIBMOTOVEC ../vec_memcpy/Source/vec_memcpy.S -o gcc_obj/vec_memcpy.o + +powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memset.o -D__GNUC__ -DLIBMOTOVEC ../vec_memset/source/vec_memset.S -o gcc_obj/vec_memset.o + +powerpc-eabisim-ar -ru libmotovec.a gcc_obj/vec_memcpy.o gcc_obj/vec_memset.o + +Email questions or suggestions to risc10@email.sps.mot.com diff --git a/liboil/motovec/checksum_vec.S b/liboil/motovec/checksum_vec.S new file mode 100644 index 0000000..c5efe25 --- /dev/null +++ b/liboil/motovec/checksum_vec.S @@ -0,0 +1,627 @@ +/* + * AltiVec versions (*_vec) of equivalent Linux library functions + * found in /arch/ppc/lib/checksum.S from Linux 2.4.17. Suggest this + * file be appended to that one when building a Linux kernel that + * will employ these functions. + * + * Copyright (C) Motorola, Inc. 2003 + * + * Revision history: + * Rev 0.0 Original Chuck Corley 5/28/03 + * Contact at risc10@motorola.com + * Commented source code for Altivec version available at + * www.motorola.com/altivec + */ + +#ifndef TEST_OUTSIDE_LINUX +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include "../kernel/ppc_asm.tmpl" +#if 0 +#define v0 vr0 +#define v1 vr1 +#define v2 vr2 +#define v3 vr3 +#define v4 vr4 +#define v5 vr5 +#define v6 vr6 +#define v7 vr7 +#define v8 vr8 +#define v9 vr9 +#define v10 vr10 +#define v11 vr11 +#define v12 vr12 +#define v13 vr13 +#define v14 vr14 +#define v15 vr15 +#endif +#else +#define EFAULT 0 +#endif + + .text + +/* + * AltiVec versions of selected functions for use on AltiVec + * enabled G4 and later microprocessors. + */ +#if defined(__GNUC__) || defined(__MWERKS__) // gcc and codewarrior don't assemble dcba +#define DCBAR4R12 .long 0x7c0465ec +#else +#define DCBAR4R12 dcba r4,r12 +#endif + + .text + .align 4 +#ifndef TEST_OUTSIDE_LINUX +_GLOBAL(csum_partial_copy_generic_vec) +#else +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif + .global csum_partial_copy_generic_vec +csum_partial_copy_generic_vec: +#endif + li r12,32 + rlwinm r0,r5,31,1,31 + cmpi cr7,0,r5,48 + dcbt r3,r12 + cmpi cr6,0,r0,0 + addic r6,r6,0 + addi r11,r3,-2 + add r10,r4 ,r5 + bgt cr7,4f + andi. 
r12,r5,1 + addi r9,r4,-2 + add r12,r3,r5 + beq cr6,2f + mtctr r0 +1: lhzu r0,2(r11) +204: sthu r0,2(r9) + addc r6,r6,r0 + bdnz 1b +2: beq 3f +201: lbz r0,-1(r12 ) +202: stb r0,-1(r10) + rlwinm r0,r0,8,16,23 + addc r6,r6,r0 +3: addze r3,r6 + blr +4: lvsr v5,0,r4 + rlwinm r9,r4,0,28,31 + rlwinm r12,r3,0,28,31 + lvsr v7,r4,r5 + subf. r12,r12,r9 + subf r12,r3,r4 + lvsr v6,0,r12 + li r12,64 + vxor v0,v0,v0 + dcbt r3,r12 + cmpi cr1,0,r9,0 + vnor v1,v0,v0 + addi r9,r4,16 + addi r10,r10,-1 + vperm v5,v1,v0,v5 + bge 5f +401: lvx v2,0,r3 + addi r3,r3,16 +5: lvx v3,0,r3 + rlwinm r9,r9,0,0,27 + vperm v1,v0,v1,v7 + subf r11,r9,r10 + vxor v7,v7,v7 + vxor v11,v11,v11 + rlwinm r11,r11,28,4,31 + rlwinm r0,r10,0,28,31 + li r12,96 + cmpi cr5,0,r0,0xF + subf r0,r4,r9 + mtctr r11 + cmpi cr6,0,r11,4 + mtcrf 0x01,r0 + vperm v4,v2,v3,v6 + vor v2,v3,v3 + dcbt r3,r12 + beq cr1,9f + li r12,0 + vsel v4,v4,v0,v5 + bns cr7,6f +502: stvebx v4,r4,r12 + addi r12,r12,1 +6: bne cr7,7f +602: stvehx v4,r4,r12 + addi r12,r12,2 +7: bng cr7,8f +702: stvewx v4,r4,r12 + addi r12,r12,4 +8: bnl cr7,10f +802: stvewx v4,r4,r12 + addi r12,r12,4 +804: stvewx v4,r4,r12 + b 10f +9: stvx v4,0,r4 +10: vxor v8,v8,v8 + li r12,16 +11: lvx v3,r3,r12 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +112: stvx v4,r4,r12 + vadduwm v11,v9,v11 + addi r12,r12,16 + bdnzf 25,11b + add r9,r4,r12 + addi r11,r11,-1 + bgt cr6,19f +12: add r10,r4,r5 + add r11,r3,r5 + bge 13f + addi r11,r11,-16 +13: mtcrf 0x01,r10 + addi r0,r11,-1 +131: lvx v3,0,r0 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vperm v4,v2,v3,v6 + beq cr5,17f + vsel v4,v4,v0,v1 + rlwinm r10,r10,0,0,27 + li r9,0 + bnl cr7,14f +132: stvewx v4,r10,r9 + addi r9,r9,4 +134: stvewx v4,r10,r9 + addi r9,r9,4 +14: bng cr7,15f +142: stvewx v4,r10,r9 + addi r9,r9,4 +15: bne cr7,16f +152: stvehx v4,r10,r9 + addi r9,r9,2 +16: bns cr7,18f +162: stvebx v4,r10,r9 + b 18f +17: stvx v4,r4,r12 +18: vaddcuw v9,v4,v7 + vadduwm v12,v4,v7 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 +500: vmrglh v2,v0,v8 + vadduwm v11,v9,v11 + vmrghh v3,v0,v8 + rlwinm r10,r1,0,0,27 + vsumsws v0,v11,v0 + vadduwm v8,v2,v3 + li r12,-16 + vsumsws v8,v8,v0 +182: stvx v8,r10,r12 +183: lwz r3,-4(r10) + addc r3,r3,r6 + addze r3,r3 + blr +19: lvx v3,r3,r12 + addi r11,r11,-1 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r11,-2 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +192: stvx v4,r4,r12 + addi r12,r12,16 + vadduwm v11,v9,v11 + bdnzf 27,19b + mtcrf 0x02,r10 + addi r11,r3,96 + addi r9,r12,16 + bns cr6,20f + bdnz 20f +20: lvx v3,r3,r12 + addi r11,r11,32 + vaddcuw v9,v4,v7 +201: lvx v5,r3,r9 + vadduwm v12,v4,v7 + dcbt 0,r11 + vaddcuw v10,v12,v8 + DCBAR4R12 + vadduwm v8,v12,v8 + vperm v7,v2,v3,v6 +202: stvx v7,r4,r12 + vperm v4,v3,v5,v6 + vadduwm v9,v9,v10 + bdz 21f +21: stvx v4,r4,r9 + vor v2,v5,v5 + vadduwm v11,v9,v11 + addi r12,r9,16 + addi r9,r12,16 + bdnz 20b + bso cr6,22f + b 12b +22: lvx v3,r3,r12 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +222: stvx v4,r4,r12 + addi r12,r12,16 + b 12b + +/* Intent of this exception table is to store -EFAULT to *src_err or + * or *dst_err respectively, and (for an error on src) zero the rest + * of dst. Return checksum for only those bytes stored before error. + * (Can't quite figure out how this return value is used since there + * is no way to restart from the point of error. So I'll only return + * the checksum for actual buffer as stored in memory. 
Doesn't look + * like scalar version adds in bytes loaded but not stored.) + * + * Register useage here: + * r3 = src, return checksum + * r4 = dst + * r5 = (preserve as total byte count til near end) + * r6 = entering partial sum; accumulator for scalar result + * r7 = src_err + * r8 = dst_err + * r9 = bytes not copied + * r10= dst + byte count + * r11= number of quad words (vectors) + * r12= Byte Kount index + */ + +/* read fault, initial half-word copy */ +100: li r0,0 + sthu r0,2(r9) /* Zero rest of buffer */ + cmpi 0,r7,0 + beq 104f /* Go return checksum */ + li r0,-EFAULT + stw r0,0(r7) + b 104f + +/* write fault, initial half-word copy */ +101: cmpi 0,r8,0 + beq 104f + li r0,-EFAULT + stw r0,0(r8) + b 104f + +/* read fault, final single-byte copy */ +102: li r0,0 + stb r0,-1(r10) /* Zero remaining byte */ + cmpi 0,r7,0 + beq 104f + li r0,-EFAULT + stw r0,0(r7) + b 104f + +/* write fault, final single-byte copy */ +103: cmpi 0,r8,0 + beq 104f + li r0,-EFAULT + stw r0,0(r8) +104: addze r3,r6 + blr + +/* read fault, 1st and 2nd vector load */ +105: cmpi 0,r7,0 + beq 155f + li r0,-EFAULT + stw r0,0(r7) +155: rlwinm r0,r5,31,1,31 + andi. r12,r5,1 + mtctr r0 + addi r9,r4,-2 + li r0,0 +106: sthu r0,2(r9) + bdnz 106b + beq 107f + stb r0,2(r9) +107: addze r3,r6 + blr + +/* write fault, initial vector store(s) (Nothing stored yet) */ +108: cmpi 0,r8,0 + beq 109f + li r0,-EFAULT + stw r0,0(r8) +109: addze r3,r6 + blr + +/* read fault, load in 16B loop or final load */ +110: cmpi 0,r7,0 + beq 156f + li r0,-EFAULT + stw r0,0(r7) +156: add r11,r4,r5 /* Last dst byte + 1 */ + add r4,r4,r12 /* Current dst byte */ + rlwinm r4,r4,0,0,27 /* Rounded down */ + subf r5,r4,r11 + rlwinm. r0,r5,31,1,31 + addi r9,r4,-2 + cmpi 1,r0,0 + beq cr1,157f + mtctr r0 + li r0,0 +111: sthu r0,2(r9) + bdnz 111b +157: andi. r12,r5,1 + beq 18b + li r0,0 + stb r0,2(r9) + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vxor v11,v11,v11 + b 500b /* Go sum across vector checksum */ + +/* write fault, store in 16B loop */ +1120: cmpi 0,r8,0 + beq 113f + li r0,-EFAULT + stw r0,0(r8) +113: b 500b + +/* write fault, final partial store(s) */ + +114: cmpi 0,r8,0 + vxor v11,v11,v11 + beq 115f + li r0,-EFAULT + stw r0,0(r8) +115: b 500b + +/* write fault, 1st store in 32B loop */ +116: cmpi 0,r8,0 + vadduwm v9,v9,v10 + beq 117f + li r0,-EFAULT + stw r0,0(r8) +117: b 500b + +/* write fault, 2nd store in 32B loop */ +118: cmpi 0,r8,0 + vxor v4,v4,v4 + vadduwm v11,v9,v11 + beq 119f + li r0,-EFAULT + stw r0,0(r8) +119: b 18b + +/* read fault, next to final load */ +120: cmpi 0,r7,0 + beq 121f + li r0,-EFAULT + stw r0,0(r7) +121: add r11,r4,r5 + add r4,r4,r12 + rlwinm r4,r4,0,0,27 + subf r5,r4,r11 + rlwinm. r0,r5,31,1,31 + addi r9,r4,-2 + cmpi 1,r0,0 + beq cr1,123f + mtctr r0 + li r0,0 +122: sthu r0,2(r9) + bdnz 122b +123: andi. r12,r5,1 + beq 124f + li r0,0 + stb r0,2(r9) +124: vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vxor v4,v4,v4 + b 18b + +/* write fault, 1st store in 32B loop */ +125: cmpi 0,r8,0 + vxor v4,v4,v4 + beq 126f + li r0,-EFAULT + stw r0,0(r8) +126: b 18b + +/* write or read fault in push/pop from stack. csumcpy complete. */ + +127: vxor v0,v0,v0 + vspltisw v2,1 + lis r5,0x8000 + vnor v1,v0,v0 + vmrglh v8,v0,v8 + li r10,17 + vsldoi v3,v0,v1,4 + li r3,0 + mtctr r10 + vsumsws v8,v8,v0 + vand v4,v2,v3 +128: vand v5,v8,v4 + rlwinm r5,r5,1,0,31 + vcmpequw. 
v6,v5,v4 + vsl v4,v4,v2 + bnl cr6,129f + or r3,r3,r5 +129: bdnz 128b + addc r3,r3,r6 + addze r3,r3 + blr + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 1b,100b + .long 204b,101b + .long 201b,102b + .long 202b,103b + .long 401b,105b + .long 5b,105b + .long 502b,108b + .long 602b,108b + .long 702b,108b + .long 802b,108b + .long 804b,108b + .long 9b,108b + .long 11b,110b + .long 112b,1120b + .long 131b,110b + .long 132b,114b + .long 134b,114b + .long 142b,114b + .long 152b,114b + .long 162b,114b + .long 17b,114b + .long 182b,127b + .long 183b,127b + .long 19b,110b + .long 192b,112b + .long 20b,110b + .long 201b,110b + .long 202b,116b + .long 21b,118b + .long 22b,120b + .long 222b,125b +#endif + + .text +#ifndef TEST_OUTSIDE_LINUX +_GLOBAL(csum_partial_vec) +#else +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif + .global csum_partial_vec +csum_partial_vec: +#endif + + li r12,32 + rlwinm r0,r4,31,1,31 + cmpi cr7,0,r4,48 + dcbt r3,r12 + cmpi cr6,0,r0,0 + addic r5,r5,0 + addi r11,r3,-2 + add r10,r3,r4 + bgt cr7,4f + andi. r12,r4,1 + beq cr6,2f + mtctr r0 +1: lhzu r0,2(r11) + addc r5,r5,r0 + bdnz 1b +2: beq 3f + lbz r0,-1(r10) + rlwinm r0,r0,8,16,23 + addc r5,r5,r0 +3: addze r3,r5 + blr +4: lvsr v5,0,r3 + addi r9,r3,16 + li r12,64 + lvsr v7,r3,r4 + rlwinm r9,r9,0,0,27 + addi r10,r10,-1 + lvx v2,0,r3 + subf r11,r9,r10 + vxor v0,v0,v0 + dcbt r3,r12 + rlwinm r11,r11,28,4,31 + vnor v1,v0,v0 + mtctr r11 + vxor v11,v11,v11 + vperm v5,v1,v0,v5 + cmpi cr6,0,r11,4 + vxor v8,v8,v8 + vperm v1,v0,v1,v7 + li r12,16 + vsel v2,v2,v0,v5 +5: lvx v3,r3,r12 + vaddcuw v9,v2,v8 + vadduwm v8,v2,v8 + vadduwm v11,v9,v11 + addi r12,r12,16 + vor v2,v3,v3 + bdnzf 25,5b + add r9,r3,r12 + addi r11,r11,-1 + bgt cr6,8f + vxor v3,v3,v3 +6: lvx v5,0,r10 + vaddcuw v9,v2,v3 + rlwinm r10,r10,0,28,31 + vadduwm v12,v2,v3 + cmpi cr7,0,r10,0xF + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + vadduwm v11,v9,v11 + beq cr7, 7f + vsel v5,v5,v0,v1 +7: vaddcuw v9,v5,v8 + vadduwm v8,v5,v8 + vadduwm v11,v9,v11 + vmrglh v2,v0,v8 + vmrghh v3,v0,v8 + rlwinm r10,r1,0,0,27 + vsumsws v0,v11,v0 + vadduwm v8,v2,v3 + li r12,-16 + vsumsws v8,v8,v0 + stvx v8,r10,r12 + lwz r3,-4(r10 ) + addc r3,r3,r5 + addze r3,r3 + blr + .align 4 +8: lvx v3,r3,r12 + addi r11,r11,-1 + vaddcuw v9,v2,v8 + vadduwm v8,v2,v8 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r11,-2 + vor v2,v3,v3 + addi r12,r12,16 + vadduwm v11,v9,v11 + bdnzf 27,8b + mtcrf 0x02,r10 + addi r11,r3,96 + vxor v3,v3,v3 + bns cr6,9f + bdnz 9f +9: lvx v5,r3,r12 + addi r12,r12,16 + vaddcuw v9,v2,v3 + lvx v6,r3,r12 + addi r11,r11,32 + vadduwm v12,v2,v3 + dcbt 0,r11 + addi r12,r12,16 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + bdz 10f +10: vadduwm v11,v9,v11 + vor v2,v5,v5 + vor v3,v6,v6 + bdnz 9b + bso cr6,11f + b 6b +11: lvx v5,r3,r12 + addi r12,r12,16 + vaddcuw v9,v2,v3 + vadduwm v12,v2,v3 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + vadduwm v11,v9,v11 + vxor v3,v3,v3 + vor v2,v5,v5 + b 6b diff --git a/liboil/motovec/string_vec.S b/liboil/motovec/string_vec.S new file mode 100644 index 0000000..4da4a3e --- /dev/null +++ b/liboil/motovec/string_vec.S @@ -0,0 +1,1375 @@ +/* + * AltiVec versions (*_vec) of equivalent Linux library functions + * found in /arch/ppc/lib/string.S from Linux 2.4.17. Suggest this + * file be appended to that one when building a Linux kernel that + * will employ these functions. + * + * Copyright (C) Motorola, Inc. 
2003 + * + * Revision history: + * Rev 0.0 Original Chuck Corley 5/28/03 + * Contact at risc10@motorola.com + * Commented source code for Altivec version available at + * www.motorola.com/altivec + * + * AltiVec versions will only deal with L1_CACHE_LINE_SIZE=32 + */ + + +#ifndef TEST_OUTSIDE_LINUX +#include "../kernel/ppc_asm.tmpl" +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#if 0 +#define v0 vr0 +#define v1 vr1 +#define v2 vr2 +#define v3 vr3 +#define v4 vr4 +#define v5 vr5 +#define v6 vr6 +#define v7 vr7 +#define v8 vr8 +#define v9 vr9 +#define v10 vr10 +#define v11 vr11 +#define v12 vr12 +#define v13 vr13 +#define v14 vr14 +#define v15 vr15 +#endif +#else +#define EFAULT 0 +#define L1_CACHE_LINE_SIZE 32 +#define LG_L1_CACHE_LINE_SIZE 5 +#define MAX_L1_COPY_PREFETCH 1 +#endif + +/* AltiVec versions of selected functions for use on AltiVec + * enabled G4 and later microprocessors. + */ +#if defined(__GNUC__) || defined(__MWERKS__) /* gcc and codewarrior don't assemble dcba */ +#define DCBA_R3R7 .long 0x7c033dec +#define DCBA_R3R9 .long 0x7c034dec +#define DCBA_R0R8 .long 0x7c0045ec +#else +#define DCBA_R3R7 dcba r4,r7 +#define DCBA_R3R9 dcba r4,r9 +#define DCBA_R0R8 dcba 0,r8 +#endif + + .text + .align 5 + .global backwards_memcpy_vec +backwards_memcpy_vec: + nop + .global memmove_vec +memmove_vec: + nop + .global cacheable_memcpy_vec +cacheable_memcpy_vec: + nop + .global memcpy_vec +memcpy_vec: + subf. r7,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + addi r8,r4,-1 + addi r9,r3,-1 + add r10,r4,r5 + beqlr + add r11,r3,r5 + subf r0,r3,r4 + beqlr cr1 + bgt 2f + cmpi cr5,0,r0,128 + bgt cr7,23f + mtctr r5 +1: lbzu r0,1(r8) + stbu r0,1(r9) + bdnz 1b + blr +2: cmpi cr5,0,r7,128 + cmp cr6,0,r7,r5 + bgt cr7,4f + mtctr r5 +3: lbzu r0,-1(r10) + stbu r0,-1(r11) + bdnz 3b + blr + +4: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 + bge cr6,24f + lis r11,0x010c + subf. r8,r9,r8 + lvsr v2,0,r7 + ori r11,r11,0xffe0 + addi r11,r10,-1 + bgt 5f + addi r8,r8,16 +5: rlwinm r11,r11,0,0,27 + addi r7,r5,-1 + subf r0,r11,r10 + add r11,r3,r7 + addi r10,r3,16 + subf. 
r8,r0,r8 + rlwinm r0,r11,0,28,31 + rlwinm r10,r10,0,0,27 + blt 6f + lvx v1,r4,r7 + addi r4,r4,-16 +6: lvx v0,r4,r7 + subf r10,r10,r11 + cmpi cr7,0,r0,0xF + cmpi cr1,0,r9,0 + rlwinm r10,r10,28,4,31 + add r0,r3,r5 + cmpi cr6,0,r10,0 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + beq cr7,10f + mtcrf 0x01,r0 + rlwinm r11,r11 ,0,0,27 + li r9,0 + bnl cr7,7f + stvewx v3,r11,r9 + addi r9,r9,4 + stvewx v3,r11,r9 + addi r9,r9,4 +7: bng cr7,8f + stvewx v3,r11,r9 + addi r9,r9,4 +8: bne cr7,9f + stvehx v3,r11,r9 + addi r9,r9,2 +9: bns cr7,11f + stvebx v3,r11,r9 + b 11f +10: stvx v3,r3,r7 +11: addi r7,r7,-16 + ble cr6,13f + mtctr r10 + cmpi cr6,0,r10,4 +12: lvx v0,r4,r7 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + stvx v3,r3,r7 + addi r7,r7,-16 + bdnzf 25,12b + add r9,r3,r7 + bgt cr6,19f +13: blt 14f + addi r4,r4,16 +14: lvx v0,0,r4 + vperm v3,v0,v1,v2 + subfic r9,r3,16 + beq cr1,18f + mtcrf 0x01,r9 + li r9,0 + bns cr7,15f + stvebx v3,r3,r9 + addi r9,r9,1 +15: bne cr7,16f + stvehx v3,r3,r9 + addi r9,r9,2 +16: bng cr7,17f + stvewx v3,r3,r9 + addi r9,r9,4 +17: bnllr cr7 + stvewx v3,r3,r9 + addi r9,r9,4 + stvewx v3,r3,r9 + blr +18: stvx v3,0,r3 + blr +19: lvx v0,r4,r7 + mtcrf 0x02,r9 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + addi r9,r9,-16 + stvx v3,r3,r7 + vor v7,v0,v0 + addi r7,r7,-16 + bdnzt 27,19b + lis r8,0x102 + mtcrf 0x02,r3 + addi r9,r7,-16 + ori r8,r8,0xffe0 + addi r11,r4,-64 + bso cr6,20f + bdnz 20f +20: lvx v6,r4,r7 + addi r11,r11,-32 + lvx v1,r4,r9 + vperm v3,v6,v7,v2 + DCBA_R3R9 + vperm v4,v1,v6,v2 + vor v7,v1,v1 + bdz 21f +21: stvx v3,r3,r7 + addi r7,r9,-16 + stvx v4,r3,r9 + addi r9,r7,-16 + bdnz 20b + bns cr6,22f + b 13b +22: lvx v1,r4,r7 + vperm v4,v1,v7,v2 + stvx v4,r3,r7 + b 13b + +23: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 +24: lis r10,0x010c + subf. r8,r8,r9 + lvsr v2,0,r7 + ori r10,r10,32 + dst r4,r10,0 + addi r10,r3,16 + addi r11,r11,-1 + bge 25f + lvx v0,0,r4 + addi r4,r4,16 +25: lvx v1,0,r4 + rlwinm r10,r10,0,0,27 + cmpi cr1,0,r9,0 + subf r0,r3,r10 + subf r10,r10,r11 + li r7,0 + mtcrf 0x01,r0 + rlwinm r10,r10,28,4,31 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + beq cr1,29f + bns cr7,26f + stvebx v3,r3,r7 + addi r7,r7,1 +26: bne cr7,27f + stvehx v3,r3,r7 + addi r7,r7,2 +27: bng cr7,28f + stvewx v3,r3,r7 + addi r7,r7,4 +28: bnl cr7,30f + stvewx v3,r3,r7 + addi r7,r7,4 + stvewx v3,r3,r7 + b 30f +29: stvx v3,0,r3 +30: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + cmpi cr7,0,r10,14 + ble cr6,32f + mtctr r10 + cmpi cr6,0,r10,4 +31: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 25,31b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,38f +32: add r11,r3,r5 + add r10,r4,r5 + bge 33f + addi r10,r10,-16 +33: mtcrf 0x01,r11 + addi r11,r11,-1 + addi r0,r10,-1 + lvx v1,0,r0 + dss 0 + dss 1 + vperm v3,v0,v1,v2 + beq cr1,37f + rlwinm r11,r11,0,0,27 + li r9,0 + bnl cr7,34f + stvewx v3,r11,r9 + addi r9,r9,4 + stvewx v3,r11,r9 + addi r9,r9,4 +34: bng cr7,35f + stvewx v3,r11,r9 + addi r9,r9,4 +35: bne cr7,36f + stvehx v3,r11,r9 + addi r9,r9,2 +36: bnslr cr7 + stvebx v3,r11,r9 + blr +37: stvx v3,r3,r7 + blr + +38: lvx v1,r4,r7 + addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r10,-2 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 27,38b + mtcrf 0x02,r11 + lis r8,0x104 + addi r9,r7,16 + ori r8,r8,32 + rlwinm r11,r0,29,3,31 + rlwinm r0,r0,0,0,28 + bgt cr7,43f +39: addi r11,r4,256 + xoris r8,r8,0x6 + bns cr6,40f + bdnz 40f +40: lvx v1,r4,r7 + addi r11,r11,32 + lvx v6,r4,r9 + vperm v4,v0,v1,v2 + dst r11,r8,1 + DCBA_R3R7 + vperm 
v3,v1,v6,v2 + vor v0,v6,v6 + bdz 41f +41: stvx v4,r3,r7 + addi r7,r9,16 + stvx v3,r3,r9 + addi r9,r7,16 + bdnz 40b + bso cr6,42f + b 32b +42: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + b 32b + +43: subf r10,r0,r10 + blt cr5,39b + mtctr r11 + addi r11,r4,256 +44: lvx v1,r4,r7 + addi r9,r7,32 + addi r11,r11,128 + lvx v7,r4,r9 + addi r9,r9,32 + lvx v9,r4,r9 + addi r9,r9,32 + lvx v11,r4,r9 + addi r9,r7,16 + lvx v6,r4,r9 + addi r9,r9,32 + lvx v8,r4,r9 + addi r9,r9,32 + lvx v10,r4,r9 + addi r9,r9,32 + vperm v3,v0,v1,v2 + lvx v0,r4,r9 + vperm v4,v1,v6,v2 + dst r11,r8,1 + DCBA_R3R7 + stvx v3,r3,r7 + addi r7,r7,16 + vperm v5,v6,v7,v2 + stvx v4,r3,r7 + addi r7,r7,16 + vperm v6,v7,v8,v2 + DCBA_R3R7 + stvx v5,r3,r7 + addi r7,r7,16 + vperm v7,v8,v9,v2 + stvx v6,r3,r7 + addi r7,r7,16 + vperm v8,v9,v10,v2 + DCBA_R3R7 + stvx v7,r3,r7 + addi r7,r7,16 + vperm v9,v10,v11,v2 + stvx v8,r3,r7 + addi r7,r7,16 + vperm v10,v11,v0,v2 + DCBA_R3R7 + stvx v9,r3,r7 + addi r7,r7,16 + stvx v10,r3,r7 + addi r7,r7,16 + bdnz 44b + mtctr r10 + addi r9,r7,16 + bns cr6,40b + bdnz 40b + + .global bcopy_vec +bcopy_vec: + mr r0,r3 + mr r3,r4 + mr r4,r0 + b memcpy_vec + + .text + .align 4 + .globl __clear_user_vec +__clear_user_vec: + mr r5,r4 + li r4,0 + .globl memset_vec +memset_vec: + cmpi cr7,0,r5,16 + cmpi cr1,0,r5,0 + rlwinm. r8,r4,28,28,3 + addi r9,r3,-1 + addi r10,r3,16 + add r6,r3,r5 + bgt cr7,2f + mtctr r5 + beqlr cr1 +1: stbu r4,1(r9) + bdnz 1b + blr +2: rlwinm r10,r10,0,0,27 + addi r11,r6,-1 + subf r9,r3,r10 + li r7,0 + vxor v0,v0,v0 + subf r10,r10 ,r11 + cmpi cr1,0,r9,16 + beq 3f + lvsl v0,0,r8 + vspltisb v1,4 + lvsl v2,0,r4 + vslb v0,v0,v1 + vor v0,v0,v2 + vspltb v0,v0,0 +3: mtcrf 0x01,r9 + rlwinm r10,r10,28,4,31 + beq cr1,7f + bns cr7,4f +32: stvebx v0,r3,r7 + addi r7,r7,1 +4: bne cr7,5f +42: stvehx v0,r3,r7 + addi r7,r7,2 +5: bng cr7,6f +52: stvewx v0,r3,r7 + addi r7,r7,4 +6: bnl cr7,8f +62: stvewx v0,r3,r7 + addi r7,r7,4 +64: stvewx v0,r3,r7 + b 8f +7: stvx v0,0,r3 +8: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + ble cr6,10f + mtctr r10 + cmpi cr6,0,r10,4 +9: stvx v0,r3,r7 + addi r7,r7,16 + bdnzf 25,9b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,16f +10: mtcrf 0x01,r6 + beq cr1,14f + rlwinm r11,r11,0,0,27 + li r9,0 + bnl cr7,11f +102: stvewx v0,r11,r9 + addi r9,r9,4 +104: stvewx v0,r11,r9 + addi r9,r9,4 +11: bng cr7,12f +112: stvewx v0,r11,r9 + addi r9,r9,4 +12: bne cr7,13f +122: stvehx v0,r11,r9 + addi r9 ,r9 ,2 +13: bnslr cr7 +132: stvebx v0,r11,r9 + blr +14: stvx v0,r3,r7 + blr + +16: addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 +162: stvx v0,r3,r7 + addi r7,r7,16 + bdnzf 27,16b + mtcrf 0x02,r11 + bns cr6,17f + bdnz 17f +17: stvx v0,r3,r7 + addi r7,r7,16 + bdz 18f +18: stvx v0,r3,r7 + addi r7,r7,16 + bdnz 17b + bso cr6,19f + b 10b +19: stvx v0,r3,r7 + addi r7,r7,16 + b 10b + +/* Intent of this exception table appears to be to return the byte count */ +/* remaining to be cleared when the current store error occurred. Chuck */ +/* Memset doesn't require it but the code is identical to __clear_user */ +/* FIRST FAILURE CHECKED BY RECOMPILATION WITH BRANCHES SUBSTITUTED + * FOR STORES. 
chuckc 030515 +*/ + +91: mfctr r3 /* Return byte count remaining */ + blr +92: subf r3,r7,r5 /* BC minus bytes already stored */ + blr +93: mr r3,r5 /* Nothing stored yet */ + blr +94: add r11,r3,r5 + rlwinm r6,r11,0,28,31 /* Bytes in last vector */ + b 99f +95: add r11,r3,r5 + rlwinm r6,r11,0,28,31 + subf r3,r9,r6 + blr +96: li r3,16 /* 16 bytes in last vector to be stored. */ + blr +97: add r11,r3,r5 + rlwinm r6,r11,0,27,31 +99: mfctr r3 + rlwinm r3,r3,4,0,27 + add r3,r3,r6 + blr +98: add r11,r3,r5 + rlwinm r3,r11,0,27,31 + blr + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 1b,91b + .long 32b,92b + .long 42b,92b + .long 52b,92b + .long 62b,92b + .long 64b,92b + .long 7b,93b + .long 9b,94b + .long 102b,95b + .long 104b,95b + .long 112b,95b + .long 122b,95b + .long 132b,95b + .long 14b,96b + .long 162b,94b + .long 17b,97b + .long 18b,97b + .long 19b,98b +#endif + .text +/* Scalar __copy_tofrom_user always copies forward and never checks + * for overlap, __copy_tofrom_user_vec will do the same except it will + * check that overlap is > 128B before entering 128B loop when copying + * forward. + * The scalar version always assumes the destination and source + * are word aligned. This routine will assume the same to simplify handling + * exceptions. chuckc + */ + + .globl __copy_tofrom_user_vec +__copy_tofrom_user_vec: + subf. r7,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + addi r8,r4,-1 + addi r9,r3,-1 + add r10,r4,r5 + beqlr + add r11,r3,r5 + subf r0,r3,r4 + beqlr cr1 + bgt 1f + cmpi cr5,0,r0,128 /* Overlap |(DST-SRC)|> 128B? */ + bgt cr7,23f /* b to v_memcpy */ +1: cmpi cr5,0,r7,128 /* Overlap |(DST-SRC)|> 128B? */ + bgt cr7,23f /* b to v_memcpy */ + mtctr r5 +2: lbzu r0,1(r8) +202: stbu r0,1(r9) + bdnz 2b + li r3,0 + blr + +23: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 +24: lis r10,0x010c + subf. 
r8,r8,r9 + lvsr v2,0,r7 + ori r10,r10,32 + dst r4,r10,0 + addi r10,r3,16 + addi r11,r11,-1 + bge 25f +241: lvx v0,0,r4 + addi r4,r4,16 +25: lvx v1,0,r4 + rlwinm r10,r10,0,0,27 + cmpi cr1,0,r9,0 + subf r0,r3,r10 + subf r10,r10,r11 + li r7,0 + mtcrf 0x01,r0 + rlwinm r10,r10,28,4,31 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + beq cr1,29f + bns cr7,26f +252: stvebx v3,r3,r7 + addi r7,r7,1 +26: bne cr7,27f +262: stvehx v3,r3,r7 + addi r7,r7,2 +27: bng cr7,28f +272: stvewx v3,r3,r7 + addi r7,r7,4 +28: bnl cr7,30f +282: stvewx v3,r3,r7 + addi r7,r7,4 +284: stvewx v3,r3,r7 + b 30f +29: stvx v3,0,r3 +30: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + cmpi cr7,0,r10,14 + ble cr6,32f + mtctr r10 + cmpi cr6,0,r10,4 +31: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +312: stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 25,31b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,38f +32: add r11,r3,r5 + add r10,r4,r5 + bge 33f + addi r10,r10,-16 +33: mtcrf 0x01,r11 + addi r11,r11,-1 + addi r0,r10,-1 +331: lvx v1,0,r0 + dss 0 + dss 1 + vperm v3,v0,v1,v2 + beq cr1,37f + rlwinm r11,r11,0,0,27 + li r9,0 + li r3,0 + bnl cr7,34f +332: stvewx v3,r11,r9 + addi r9,r9,4 +334: stvewx v3,r11,r9 + addi r9,r9,4 +34: bng cr7,35f +342: stvewx v3,r11,r9 + addi r9,r9,4 +35: bne cr7,36f +352: stvehx v3,r11,r9 + addi r9,r9,2 +36: bnslr cr7 +362: stvebx v3,r11,r9 + blr +37: stvx v3,r3,r7 + li r3,0 + blr + + .align 4 +38: lvx v1,r4,r7 + addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r10,-2 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +382: stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 27,38b + mtcrf 0x02,r11 + lis r8,0x104 + addi r9,r7,16 + ori r8,r8,32 + rlwinm r11,r0,29,3,31 + rlwinm r0,r0,0,0,28 + bgt cr7,43f +39: addi r11,r4,256 + xoris r8,r8,0x6 + bns cr6,40f + bdnz 40f +40: lvx v1,r4,r7 + addi r11,r11,32 +401: lvx v6,r4,r9 + vperm v4,v0,v1,v2 + dst r11,r8,1 + DCBA_R3R7 + vperm v3,v1,v6,v2 + vor v0,v6,v6 +402: stvx v4,r3,r7 + addi r7,r9,16 + bdz 41f +41: stvx v3,r3,r9 + addi r9,r7,16 + bdnz 40b + bso cr6,42f + b 32b +42: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +422: stvx v3,r3,r7 + addi r7,r7,16 + b 32b + +43: subf r10,r0,r10 + blt cr5,39b + mtctr r11 + addi r11,r4,256 +44: lvx v1,r4,r7 + addi r9,r7,32 + addi r11,r11,128 +443: lvx v7,r4,r9 + addi r9,r9,32 +447: lvx v9,r4,r9 + addi r9,r9,32 +451: lvx v11,r4,r9 + addi r9,r7,16 +441: lvx v6,r4,r9 + addi r9,r9,32 +445: lvx v8,r4,r9 + addi r9,r9,32 +449: lvx v10,r4,r9 + addi r9,r9,32 + vperm v3,v0,v1,v2 +453: lvx v0,r4,r9 + vperm v4,v1,v6,v2 + dst r11,r8,1 + DCBA_R3R7 +440: stvx v3,r3,r7 + addi r7,r7,16 + vperm v5,v6,v7,v2 +442: stvx v4,r3,r7 + addi r7,r7,16 + vperm v6,v7,v8,v2 + DCBA_R3R7 +444: stvx v5,r3,r7 + addi r7,r7,16 + vperm v7,v8,v9,v2 +446: stvx v6,r3,r7 + addi r7,r7,16 + vperm v8,v9,v10,v2 + DCBA_R3R7 +448: stvx v7,r3,r7 + addi r7,r7,16 + vperm v9,v10,v11,v2 +450: stvx v8,r3,r7 + addi r7,r7,16 + vperm v10,v11,v0,v2 + DCBA_R3R7 +452: stvx v9,r3,r7 + addi r7,r7,16 +454: stvx v10,r3,r7 + addi r7,r7,16 + bdnz 44b + mtctr r10 + addi r9,r7,16 + bns cr6,40b + bdnz 40b + +/* Intent of this exception table is to return: + * r3 = bytes not copied (but preserve dst address in r3 til end) + * r4 = 0 on read fault; 1 on write fault + * Register useage here: + * r5 = (preserve as total byte count til near end) + * r6 = bytes not copied (move to r3 at end) + * r7 = byte count index from memcpy_vec + * r9 = alternate byte count index in 128B loop + * r10= vectors (QWs remaining) after 128B loop + * r11= next destination address (assume word-aligned) + * For read fault, clear 
out the destination for bytes remaining + * starting at r3(dst) + r5(byte count) - r6 (bytes remaining). + */ + + +/* read fault, initial single-byte copy */ +100: li r4,0 + mfctr r3 +101: stbu r4,1(r9) + bdnz 101b + blr + +/* write fault, initial single-byte copy */ +102: li r4,1 + mfctr r3 + blr + +/* read fault, initial vector(s) load */ +103: li r4,0 + b 91f + +/* write fault, initial partial vector store */ +104: li r4,1 + subf r5,r7,r5 /* BC minus bytes in 1st vector already stored */ + add r3,r3,r7 /* dst plus bytes in 1st vector already stored. */ + b 91f + +/* write fault, initial full vector store */ +105: li r4,1 +91: mr r6,r5 + b 98f + +/* read fault in 16B loop(s) and 32B loop (treat as both loads fail)*/ +106: li r4,0 + b 94f + +/* write fault in 16B loop(s), 128B, and first write fault in 32B loop */ +107: li r4,1 + b 94f + +/* second write fault in 32B loop */ +108: li r4,1 + add r11,r3,r5 /* Last dst byte + 1 */ + add r3,r3,r9 /* Current dst byte */ + b 95f + +/* read fault in 128B loop (treat as all loads fail)*/ +112: li r4,0 + mfctr r0 + slwi r0,r0,7 /* Convert 128B loop ctr to bytes */ + add r11,r3,r5 + slwi r10,r10,4 /* convert QW vectors remaining to bytes */ + add r3,r3,r7 + rlwinm r6,r11,0,28,31 /* Bytes in last vector(s) */ + rlwinm r3,r3,0,0,27 + add r6,r6,r10 + add r6,r6,r0 + b 98f + +/* read fault, final vector(s) load */ +114: li r4,0 +94: add r11,r3,r5 + add r3,r3,r7 +95: rlwinm r3,r3,0,0,27 + subf r6,r3,r11 + b 98f + +/* write fault, final partial vector store */ +115: li r4,1 + add r11,r3,r5 + add r3,r3,r7 + rlwinm r3,r3,0,0,27 + subf r6,r3,r11 + subf r6,r9,r6 /* minus bytes already stored */ + b 98f + +/* write fault, final full vector store */ +116: li r4,1 + add r3,r3,r7 + rlwinm r3,r3,0,0,27 + li r6,16 + b 98f + +/* + * At this stage the number of bytes not copied is in r6 + * and r4 is 0 for read or 1 for write. + * (Like the scalar version, assume dst is word-aligned.) + */ +98: cmpwi 0,r4,0 + bne 120f +/* for read fault, clear out the destination: r6 bytes remaining + */ + srwi. r0,r6,2 + addi r3,r3,-4 + subf r10,r6,r5 + mtctr r0 + beq 118f +117: stwu r4,4(r3) + bdnz 117b +118: andi. r0,r6,3 + mtctr r0 + beq 120f +119: stb r4,4(r3) + addi r3,r3,1 + bdnz 119b +120: mr r3,r6 + blr + +121: li r4,1 + mfctr r3 + rlwinm r3,r3,2,0,29 + andi. r0,r6,3 + add r3,r3,r0 + blr + + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 2b,100b + .long 202b,102b + .long 241b,103b + .long 25b,103b + .long 252b,104b + .long 262b,104b + .long 272b,104b + .long 282b,104b + .long 284b,104b + .long 29b,105b + .long 31b,106b + .long 312b,107b + .long 331b,114b + .long 332b,115b + .long 334b,115b + .long 342b,115b + .long 352b,115b + .long 362b,115b + .long 37b,116b + .long 38b,106b + .long 382b,107b + .long 40b,106b + .long 401b,106b + .long 402b,107b + .long 41b,108b + .long 42b,106b + .long 422b,107b + .long 44b,112b + .long 443b,112b + .long 447b,112b + .long 451b,112b + .long 441b,112b + .long 445b,112b + .long 449b,112b + .long 453b,112b + .long 440b,107b + .long 442b,107b + .long 444b,107b + .long 446b,107b + .long 448b,107b + .long 450b,107b + .long 452b,107b + .long 454b,107b + .long 101b,102b + .long 117b,121b + .long 119b,102b +#endif + + .text + .align 5 + + .global strlen_vec +strlen_vec: + + lvxl v2,0,r3 + vxor v0,v0,v0 + lvsl v5,0,r3 + vnor v1,v0,v0 + rlwinm r5,r3,0,28,31 + vperm v2,v2,v1,v5 + mr r4,r3 + li r3,16 + vcmpequb. v4,v0,v2 + vsldoi v5,v0,v1,8 + bne cr6,2f + subf r3,r5,r3 +1: lvxl v2,r4,r3 + addi r3,r3,16 + vcmpequb. 
v4,v0,v2 + beq cr6,1b +2: vandc v3,v2,v5 + vsldoi v7,v0,v1,4 + vcmpequb. v4,v3,v5 + vsldoi v8,v0,v1,12 + beq cr6,10f + vandc v3,v2,v8 + vsldoi v5,v0,v1,10 + vcmpequb. v4,v3,v8 + vsldoi v9,v0,v1,14 + beq cr6,6f + vandc v3,v2,v9 + vsldoi v8,v0,v1,13 + vcmpequb. v4,v3,v9 + vsldoi v10,v0,v1,15 + beq cr6,4f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,3f + addi r3,r3,-16 + blr +3: addi r3,r3,-15 + blr + +4: vandc v3,v2,v8 + vcmpequb. v4,v3,v8 + beq cr6,5f + addi r3,r3,-14 + blr +5: addi r3,r3,-13 + blr + +6: vandc v3,v2,v5 + vsldoi v9,v0,v1,9 + vcmpequb. v4,v3,v5 + vsldoi v10,v0,v1,11 + beq cr6,8f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,7f + addi r3,r3,-12 + blr +7: addi r3,r3,-11 + blr + +8: vandc v3,v2,v9 + vcmpequb. v4,v3,v9 + beq cr6,9f + addi r3,r3,-10 + blr +9: addi r3,r3,-9 + blr + +10: vandc v3,v2,v7 + vsldoi v5,v0,v1,2 + vcmpequb. v4,v3,v7 + vsldoi v10,v0,v1,6 + beq cr6,14f + vandc v3,v2,v10 + vsldoi v9,v0,v1,5 + vcmpequb. v4,v3,v10 + vsldoi v7,v0,v1,7 + beq cr6,12f + vandc v3,v2,v7 + vcmpequb. v4,v3,v7 + beq cr6,11f + addi r3,r3,-8 + blr +11: addi r3,r3,-7 + blr + +12: vandc v3,v2,v9 + vcmpequb. v4,v3,v9 + beq cr6,13f + addi r3,r3,-6 + blr +13: addi r3,r3,-5 + blr + +14: vandc v3,v2,v5 + vsldoi v8,v0,v1,1 + vcmpequb. v4,v3,v5 + vsldoi v10,v0,v1,3 + beq cr6,16f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,15f + addi r3,r3,-4 + blr +15: addi r3,r3,-3 + blr + +16: vandc v3,v2,v8 + vcmpequb. v4,v3,v8 + beq cr6,17f + addi r3,r3,-2 + blr +17: addi r3,r3,-1 + blr + + .text + .align 5 + + .global strcmp_vec +strcmp_vec: + lvxl v2,0,r3 + vxor v0,v0,v0 + addi r7,r4,16 + lvxl v3,0,r4 + vnor v1,v0,v0 + xor r8,r7,r4 + lvsl v6,0,r3 + vspltisb v4,8 + cmpi 2,0,r8,0x1000 + lvsl v10,0,r4 + vspltisb v12,1 + beq 2,8f +1: andi. r8,r3,0xF + lvxl v8,0,r7 + vslb v13,v4,v12 + andi. r9,r4,0xF + vperm v2,v2,v1,v6 + subf. r0,r8,r9 + addi r5,r3,16 + vperm v9,v0,v1,v6 + lvsl v6,0,r0 + vor v7,v3,v3 + vperm v3,v3,v8,v10 + addi r4,r7,16 + vslb v11,v13,v12 + vor v3,v3,v9 + xor r3,r3,r3 + vcmpequb. v10,v2,v3 + vslb v14,v11,v12 + vnor v9,v10,v10 + bc 4,6*4+0,3f + vcmpequb. v5,v0,v2 + bc 4,6*4+2,7f + blt 6f +2: lvxl v7,0,r4 + addi r4,r4,16 + lvxl v2,0,r5 + addi r5,r5,16 + vperm v3,v8,v7,v6 + vcmpequb. v10,v2,v3 + vnor v9,v10,v10 + bc 12,6*4+0,5f +3: vcmpequb v5,v0,v2 + vsum4ubs v7,v4,v14 + vor v9,v9,v5 + vsro v12,v9,v11 + vsrw v11,v9,v4 + vsro v6,v9,v14 + vsrw v14,v9,v13 + vsro v13,v9,v7 + vor v9,v12,v6 + vsro v7,v14,v4 + vor v9,v9,v13 + vcmpgtuw v9,v9,v0 + vor v9,v9,v11 + vor v9,v9,v14 + vor v9,v9,v7 + vandc v11,v10,v9 + vcmpequb. v14,v11,v9 + vcmpgtub v7,v3,v2 + bc 12,6*4+2,4f + vandc v11,v7,v9 + li r3,-1 + vcmpequb. v14,v11,v1 + bc 4,6*4+2,4f + li r3,1 +4: blr + +5: vcmpequb. v5,v0,v2 + bc 4,6*4+2,7f + lvxl v8,0,r4 + addi r4,r4,16 +6: lvxl v2,0,r5 + addi r5,r5,16 + vperm v3,v7,v8,v6 + vcmpequb. v10,v2,v3 + vnor v9,v10,v10 + bc 4,6*4+0,3b + vcmpequb. v5,v0,v2 + bc 12,6*4+2,2b +7: blr + +8: vcmpequb. v5,v0,v2 + bc 13,6*4+2,1b + vcmpequb. v10,v2,v3 + bc 4,6*4+0,3b + blr + + + .text + .align 5 + .global memcmp_vec +memcmp_vec: + subf. r6,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + add r9,r3,r5 + addi r7,r4,-1 + addi r11,r3,16 + beq 2f + addi r10,r9,-1 + addi r8,r3,-1 + rlwinm r11,r11,0,0,27 + beq cr1,2f + subf r11,r11,r10 + rlwinm r9,r9,0,28,31 + bgt cr7,3f + mtctr r5 +1: lbzu r6,1(r7) + lbzu r10,1(r8) + subf. r3,r6,r10 + bdnzt 2,1b + blr + +2: xor r3,r3,r3 + blr +3: rlwinm r11,r11,28,4,31 + rlwinm r7,r4,0,28,31 + rlwinm r8,r3,0,28,31 + cmpi cr1,0,r11,0 + lvxl v0,0,r3 + subf. 
r7,r7,r8 + li r7,16 + lvxl v1,0,r4 + vor v2,v1,v1 + addi r5,r5,-1 + bge 4f + lvxl v2,r4,r7 + addi r4,r4,16 + addi r5,r5,-16 +4: lvsl v3,0,r3 + vspltisb v4,8 + vxor v5,v5,v5 + lvsl v6,0,r4 + vspltisb v7,1 + vnor v8,v5,v5 + lvsr v10,0,r6 + cmpi cr5,0,r9,0 + vperm v11,v5,v8,v3 + lvsr v12,0,r9 + vperm v0,v0,v8,v3 + vperm v1,v1,v2,v6 + vslb v3,v4,v7 + vor v1,v1,v11 + vslb v6,v3,v7 + vcmpequb. v8,v0,v1 + vslb v7,v6,v7 + vnor v13,v8,v8 + bc 4,6*4+0,8f + ble cr1,6f + mtctr r11 +5: lvxl v9,r4,r7 + lvxl v0,r3,r7 + addi r7,r7,16 + vperm v1,v2,v9,v10 + vor v2,v9,v9 + vcmpequb. v8,v0,v1 + vnor v13,v8,v8 + bdnzt 24,5b + bc 4,6*4+0,8f +6: lvxl v9,r4,r5 + vperm v12,v5,v8,v12 + lvxl v0,r3,r7 + vperm v1,v2,v9,v10 + beq cr5,7f + vor v1,v1,v12 + vor v0,v0,v12 +7: vcmpequb. v8,v0,v1 + vnor v13,v8,v8 + bc 4,6*4+0,8f + xor r3,r3,r3 + blr +8: vsum4ubs v2,v4,v7 + vsro v9,v13,v6 + vsrw v6,v13,v4 + vsro v10,v13,v7 + vsrw v7,v13,v3 + vsro v3,v13,v2 + vor v11,v9,v10 + vsro v2,v7,v4 + vor v11,v11,v3 + vcmpgtuw v11,v11,v5 + vor v11,v11,v6 + vor v11,v11,v7 + vor v11,v11,v2 + vor v1,v1,v11 + vor v0,v0,v11 + li r3,-1 + vcmpgtub. v8,v1,v0 + bclr 4,6*4+2 + li r3,1 + blr + + .text + .align 5 + .global strcpy_vec +strcpy_vec: + addi r5,r3,32 + subf. r6,r4,r3 + subf r7,r3,r4 + rlwinm r5,r5,0,0,26 + mr r8,r3 + beqlr + bgt 1f + mr r6,r7 +1: subf. r9,r3,r5 + addi r5,r8,4096 + cmpi cr7,0,r6,16 + mtctr r9 +2: lbzx r0,0,r4 + addi r4,r4,1 + cmpi cr1,0,r0,0 + stbx r0,0,r8 + addi r8,r8,1 + bdnzf 6,2b + beqlr cr1 + li r11,4096 + rlwinm r5,r5,0,0,19 + mr r10,r4 + ble cr7,2b + subf. r5,r8,r5 + rlwinm r5,r5,28,4,31 + lvsl v4,0,r4 + vxor v0,v0,v0 + ble 9f + mtctr r5 +3: lvx v1,0,r10 + addi r10,r10,16 + bdz 10f +4: lvx v2,0,r10 + addi r10,r10,16 + bdz 11f +5: lvx v3,0,r10 + addi r10,r10,16 + bdz 12f +6: vperm v5,v1,v2,v4 + vperm v6,v2,v3,v4 + vor v1,v3,v3 + vcmpequb. v7,v0,v5 + bne cr6,8f + addi r4,r4,16 + vcmpequb. v7,v0,v6 + bne cr6,7f + DCBA_R0R8 + addi r4,r4,16 + stvx v5,0,r8 + addi r8,r8,16 + stvx v6,0,r8 + addi r8,r8,16 + b 4b +7: stvx v5,0,r8 + addi r8,r8,16 +8: lbzx r0,0,r4 + addi r4,r4,1 + cmpi cr1,0,r0,0 + stbx r0,0,r8 + addi r8,r8,1 + bne cr1,8b + blr + +9: mtctr r11 + b 3b +10: vcmpequb. v7,v0,v1 + bnl cr6,8b + mtctr r11 + b 4b +11: vcmpequb. v7,v0,v2 + bnl cr6,8b + mtctr r11 + b 5b +12: vcmpequb. v7,v0,v3 + bnl cr6,8b + mtctr r11 + b 6b diff --git a/liboil/motovec/vec_csum.S b/liboil/motovec/vec_csum.S new file mode 100644 index 0000000..29ddd11 --- /dev/null +++ b/liboil/motovec/vec_csum.S @@ -0,0 +1,724 @@ +//------------------------------------------------------------------ +// file: vec_csum.S +// AltiVec enabled version of linux' checksum routines +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. 
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern unsigned long csum_partial_copy_generic(src, dst, len, sum, +// src_err, dst_err); +// Computes the checksum of a memory block at src, length len, +// and adds in "sum" (32-bit), while copying the block to dst. +// Returns: +// unsigned long sum +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern unsigned long csum_partial(buff, len, sum); +// +// computes the checksum of a memory block at buff, length len, +// and adds in "sum" (32-bit unsigned long) +// Returns: +// unsigned long sum +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Assumptions from studying the original linux code: +// Copying forward is always safe +// src and dst are always half-word aligned +// len may be odd or even 0-n; +// there is no test to see if src and dst are equal. +// returns unsigned int checksum +// +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 04/19/03 +// +// This is alpha quality code; users are encouraged to make it faster. +// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) + +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 32 bytes. 
+#define MIN_VEC 48 // Experimentally chosen on 7455@1GHz/133 to beat scalar + + // Register useage +#define Rt r0 // r0 when used as a temporary register + +#define SRC r3 // entering: src ptr; exiting: unsigned long checksum + +#define DST r4 // entering: dst pointer; exiting: + +#define BC r5 // entering: Byte_Count + +#define SUM r6 // entering: Partial checksum + +#define SER r7 // entering: src_err address + +#define DER r8 // entering: dst_err address + +#define DM2 r9// dst -2 for hw-by-hw forwards initially +#define D r9 // dst[28:31] +#define DR r9 // dst[0:27] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define DBC r10// dst + byte count initially +#define DBK r10// (dst+byte_count-1) then (dst+byte_count-1)[28:31] + +#define SM2 r11// src -2 for hw-by-hw forwards initially +#define QW r11 // number of quad words (vectors) +#define SP8 r11 // data stream touch block & stride info for Big_loop +#define SBC r11// src + byte count initially then src[28:31] + +#define BK r12 // Byte Kount index +#define BLK r12 // temporary data stream touch block & stride info +#define S r12// src[28:31] +#define DMS r12 // dst - src initially + +#define V0 v0 // all zeros +#define VCARS v0 // sum of carries + +#define V1 v1 // all ones +#define VMM v1 // mask for final dst right + +#define VS0 v2 // src vector for permuting +#define VL v2 // low data + +#define VS1 v3 // src vector for permuting +#define VH v3 // high data + +#define VPS0 v4 // permuted source vector to store + +#define VP2 v5 // dst permute register +#define VM v5 // mask for first dst left +#define VS2 v5 // src vector for permuting + +#define VP3 v6 // d - s permute register +#define VS3 v6 // 4th src vector in csum_partial + +#define VP4 v7 // Byte_Count permute register +#define VPS1 v7 // 2nd permuted source vector to store + +#define VSUM v8 // Updated sum +#define VFIN v8 // final sum + +#define VCAR1 v9 // temp register for carries +#define VCAR3 v9 // temp register for carries + +#define VCAR2 v10 // temp register for carries + +#define VCARF v11 // temp register for carries + +#define VTEMP v12 // Temp register + + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcba is a performance boost. +#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c0465ec +// dcba r4,r12 or dcba DST,BK +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c0465ec +.endm +#else +#define DCBK dcba DST,BK +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#endif // NO_DCBA + +// Conditionalize the use of dst (data stream touch). It will help +// if the data is not in cache and hurt if it is (though not as badly +// as dcbz). Generally, except for small benchmarks repeated many times, +// we assume data is not in cache (data streaming) and using dst is a +// performance boost. 
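For reference, a rough scalar model in C of the checksum csum_partial is documented to compute — an editorial sketch with an invented name, not part of the Motorola source. It assumes big-endian halfwords (as on PowerPC) and stands in for the lhzu/addc loop and the rlwinm/addze epilogue that appear further below; it is a statement of intent, not a line-by-line equivalent of the vector code.

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch only: 32-bit accumulation of 16-bit big-endian halfwords,
     * odd trailing byte contributing as the high byte of a final halfword. */
    static uint32_t csum_partial_model(const uint8_t *buf, size_t len, uint32_t sum)
    {
        uint64_t acc = sum;                    /* wide accumulator stands in for the CA bit */

        while (len >= 2) {                     /* halfword-by-halfword, like the lhzu/addc loop */
            acc += (uint32_t)((buf[0] << 8) | buf[1]);
            buf += 2;
            len -= 2;
        }
        if (len)                               /* odd trailing byte, like rlwinm Rt,Rt,8,16,23 */
            acc += (uint32_t)buf[0] << 8;

        acc = (acc & 0xffffffffu) + (acc >> 32);   /* fold carries back in, like addze */
        acc = (acc & 0xffffffffu) + (acc >> 32);
        return (uint32_t)acc;
    }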
+#ifndef NO_DST +#define STRM_F dst SRC,BLK,0 +#define STRM_1 dst SP8,Rt,1 + +#else +#define STRM_F nop +#define STRM_1 nop +#endif + .text +#if __MWERKS__ + .align 16 +#define SP r1 +#else + .align 4 +#endif + +#ifdef LIBMOTOVEC + .global csum_partial_copy_generic_vec +csum_partial_copy_generic: +#else + .global vec_csum_partial_copy_generic +vec_csum_partial_copy_generic: +#endif + + li BK,32 // IU1 + rlwinm Rt,BC,31,1,31 // IU1 BC/2 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + dcbt SRC,BK // LSU prefetch next cacheline + cmpi cr6,0,Rt,0 // IU1 BC/2 == 0? + addic SUM,SUM,0 // IU1 Zero carry bit + + addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,v_csumcpy // b if BC>MIN_VEC (will copy vectors fwd) + andi. BK,BC,1 // IU1 BC[31]==0? + + addi DM2,DST,-2 // IU1 Pre-bias and duplicate destination + add S,SRC,BC // IU1 Last src byte + 1 (temp use of S) + beq cr6,No_HWs // b if BC/2==0 + mtctr Rt // i=BC/2; do ...;i--; while (i>0) +HW_cpy: + lhzu Rt,2(SM2) // LSU + sthu Rt,2(DM2) // LSU + addc SUM,SUM,Rt // IU1 + bdnz HW_cpy +No_HWs: + beq BC_even // b if BC[31]==0 (or DBC[31]==0 when aligned) + lbz Rt,-1(S) // LSU Get last src address byte + + stb Rt,-1(DBC) // LSU Store to last dst address byte + rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left + + addc SUM,SUM,Rt // IU1 +BC_even: + addze SRC,SUM + blr + +v_csumcpy: + lvsr VP2,0,DST // LSU Permute vector for initial byte mask + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + + lvsr VP4,DST,BC // LSU Permute vector for final byte mask + subf. S,S,D // IU1 if D-S<0 essentially shifting left + subf DMS,SRC,DST // IU1 Compute dst-src difference + + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + li BK,64 // IU1 Index of next cache line + vxor V0,V0,V0 // VIU Clear v0 + + dcbt SRC,BK // LSU Prefetch next cache line at src+64 + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + addi DR,DST,16 // IU1 Address of second dst vector + addi DBK,DBC,-1 // IU1 Address of last dst byte + vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right + bge Ld_bytes_rt // b if shifting right (D-S>=0) + + lvx VS0,0,SRC // LSU Get S0 load started + addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful) + +Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is + lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + + vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + vxor VPS1,VPS1,VPS1 // VIU Clear VPS1 + + vxor VCARF,VCARF,VCARF //VIU1 clear VCARF + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + + li BK,96 // IU1 Index of next cache line + cmpi cr5,0,Rt,0xF // IU1 Is DN right justified? + subf Rt,DST,DR // IU1 How many bytes in first destination? 
+ + mtctr QW // IU2 + cmpi cr6,0,QW,4 // IU1 Check QW>4 + mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7 + + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + dcbt SRC,BK // LSU Prefetch next cache line at src+96 + beq cr1,Left_just // b if D0 is left justified + + li BK,0 // IU1 Initialize byte kount index + vsel VPS0,VPS0,V0,VM // VIU1 Select zeroes left | S0 bytes right + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx VPS0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx VPS0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx VPS0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx VPS0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM + li BK,16 // IU1 Re-initialize byte kount index + +QW_fwd_loop: + lvx VS1,SRC,BK // LSU Get S2 (or S1) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4) + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_ld_fwd: // Next 16 bytes is the last; we're done. 
+ add DBC,DST,BC // IU1 Recompute address of last dst byte + 1 + add SBC,SRC,BC // IU1 Recompute address of last src byte + 1 + bge No_ld_fwd // b if shifting right (D-S>=0) + + addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src +No_ld_fwd: + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + addi Rt,SBC,-1 // IU1 Recompute address of last src byte + + lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14 + beq cr5,Rt_just_fwd // b if last destination is right justified + vsel VPS0,VPS0,V0,VMM // VIU1 Select src bytes left | zeroes right + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li D,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + stvx VPS0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: + vaddcuw VCAR1,VPS0,VPS1 //VIU1 add data and store carries + + vadduwm VTEMP,VPS0,VPS1 //VIU1 add data (no carries) + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + vmrglh VL,V0,VSUM // VPU separate low shorts of sum + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + vmrghh VH,V0,VSUM //VPU separate high shorts of sum + rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW + + vsumsws VCARS,VCARF,V0 //VIU2 sum all carries + vadduwm VSUM,VL,VH //VIU1 add low and high data + li BK,-16 // IU1 Index 0x10 less than SP + + vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries + + stvx VFIN,DBK,BK // LSU Store partial checksum from VR + + lwz SRC,-4(DBK) // LSU Load partial checksum to GPR + + addc SRC,SRC,SUM + + addze SRC,SRC + + blr // Return destination address from entry + + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of 
CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + addi SP8,SRC,96 // IU1 Starting address for dcbt + addi BL,BK,16 // IU1 Create an alternate byte kount + 32 + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS1,SRC,BK // LSU Get S4 + addi SP8,SP8,32 // IU1 Next starting address for dcbt + vaddcuw VCAR1,VPS0,VPS1 // VIU1 add data and store carries + + lvx VS2,SRC,BL // LSU Get S5 + vadduwm VTEMP,VPS0,VPS1 // VIU1 add data (no carries) + + dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + DCBK // LSU Kill instead of RWITM + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11 + + stvx VPS1,DST,BK // LSU Store 16 bytes at D11 + vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12 + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx VPS0,DST,BL // LSU Store 16 bytes at D12 + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + addi BK,BL,16 // IU1 Increment byte count + addi BL,BK,16 // IU1 Increment alternate byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + + b Last_ld_fwd // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + lvx VS1,SRC,BK // LSU Get S6 (or S5 if if D-S>=0) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + b Last_ld_fwd + +// End of vec_csum_partial_copy_generic in AltiVec + +// Modified from above Register useage +// Don't use vectors for BC <= MIN_VEC_CS. Works only if MIN_VEC >= 32 bytes. +#define MIN_VEC_CS 48 // Chosen experimentally on MPC7455@1GHz/133MHz bus +#undef DST // will not be using here +#undef BC +#define BC r4 // entering: Byte_Count + +#undef SUM +#define SUM r5 // entering: Partial checksum + +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif +#ifdef LIBMOTOVEC + .global csum_partial_vec +csum_partial: +#else + .global vec_csum_partial +vec_csum_partial: +#endif + li BK,32 // IU1 + rlwinm Rt,BC,31,1,31 // IU1 BC/2 + cmpi cr7,0,BC,MIN_VEC_CS // IU1 Check for minimum byte count + + dcbt SRC,BK // LSU prefetch next cacheline + cmpi cr6,0,Rt,0 // IU1 BC/2 == 0? + addic SUM,SUM,0 // IU1 Zero carry bit + + addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src + add DBC,SRC,BC // IU1 Compute address of last src byte + 1 + bgt cr7,v_csum // b if BC>MIN_VEC_CS + andi. BK,BC,1 // IU1 BC[31]==0? 
+ + beq cr6,No_HWs_cs // b if BC/2==0 + mtctr Rt // i=BC/2; do ...;i--; while (i>0) +HW_cs: + lhzu Rt,2(SM2) // LSU + + addc SUM,SUM,Rt // IU1 + bdnz HW_cs +No_HWs_cs: + beq BC_even_cs // b if BC[31]==0 (or DBC[31]==0 when aligned) + lbz Rt,-1(DBC) // LSU Get last src address byte + + rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left + + addc SUM,SUM,Rt // IU1 +BC_even_cs: + addze SRC,SUM + blr + +v_csum: + lvsr VP2,0,SRC // LSU Permute vector for initial byte mask + addi DR,SRC,16 // IU1 Address of second src vector + li BK,64 // IU1 Index of next cache line + + lvsr VP4,SRC,BC // LSU Permute vector for final byte mask + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last src byte + + lvx VS0,0,SRC // LSU Get S0 load started + subf QW,DR,DBK // IU1 Bytes of full vectors to test (-16) + vxor V0,V0,V0 // VIU Clear v0 + + dcbt SRC,BK // LSU Prefetch next cache line at src+64 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + mtctr QW // IU2 + vxor VCARF,VCARF,VCARF //VIU1 clear VCARF + vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right + + cmpi cr6,0,QW,4 // IU1 Check QW>4 + vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM + vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right + + li BK,16 // IU1 Initialize byte kount index + vsel VS0,VS0,V0,VM // VIU1 Select zeroes left | S0 bytes right +vp_fwd_loop: + lvx VS1,SRC,BK // LSU Get S1 + vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + addi BK,BK,16 // IU1 Increment byte kount index + + vor VS0,VS1,VS1 // VIU1 Swap vectors for next loop + bdnzf 25,vp_fwd_loop // b if 4 or less quad words to do + + add DNX,SRC,BK // IU1 address of next load (SRC+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW summed by now + bgt cr6,GT_4QW_cs // b if >4 quad words left + vxor VS1,VS1,VS1 // VIU1 Zero before adding below + +// Next 16 bytes is the last; we're done. +Last_ld_cs: + lvx VS2,0,DBK // LSU Get last source (guaranteed SN) + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + rlwinm DBK,DBK,0,28,31 // IU1 (dst + BC -1)[28:31] + + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + cmpi cr7,0,DBK,0xF // IU1 Is last byte right justified? + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + beq cr7, Rt_just // b if right justified. 
+ vsel VS2,VS2,V0,VMM // VIU1 Select src bytes left | zeroes right + +Rt_just: + vaddcuw VCAR1,VS2,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS2,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + vmrglh VL,V0,VSUM // VPU separate low shorts of sum + + vmrghh VH,V0,VSUM //VPU separate high shorts of sum + rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW + + vsumsws VCARS,VCARF,V0 //VIU2 sum all carries + vadduwm VSUM,VL,VH //VIU1 add low and high data + li BK,-16 // IU1 Index 0x10 less than SP + + vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries + + stvx VFIN,DBK,BK // LSU Store partial checksum from VR + + lwz SRC,-4(DBK) // LSU Load partial checksum to GPR + + addc SRC,SRC,SUM + + addze SRC,SRC + + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_cs: // Do once if nxt ld is from odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + addi BK,BK,16 // IU1 Increment byte count by 16 + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + bdnzf 27,GT_4QW_cs // b if next store is to lower (even) half of CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last load)[27]==1)?1:0; (odd?) + addi SP8,SRC,96 // IU1 Starting address for dcbt + vxor VS1,VS1,VS1 // VIU1 Zero before adding below + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. 
+ + bns cr6,B32_cs // b if DST[27] == 0; i.e, final load is even + + bdnz B32_cs // decrement counter for last QW load odd + +B32_cs: // Should be at least 2 loads remaining and next 2 are cache aligned + lvx VS2,SRC,BK // LSU Get S4 + addi BK,BK,16 // IU1 Increment byte count by 16 + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + + lvx VS3,SRC,BK // LSU Get S5 + addi SP8,SP8,32 // IU1 Next starting address for dcbt + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + + dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead + addi BK,BK,16 // IU1 Increment byte count + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + bdz Nxt_loc_cs // always decrement and branch to next instr + +Nxt_loc_cs: + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + + vor VS1,VS3,VS3 // VIU1 Move upper vector to lower + bdnz B32_cs // b if there are at least two more QWs to do + + bso cr6,One_even_QW_cs // b if there is one even and one odd QW to store + + b Last_ld_cs // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW_cs: + lvx VS2,SRC,BK // LSU Get S6 (or S5 if if D-S>=0) + addi BK,BK,16 // IU1 Increment byte count + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + vxor VS1,VS1,VS1 // VIU1 Zero before next add + + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + b Last_ld_cs + +// End of vec_csum_partial in AltiVec
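Both entry points in this file return a 32-bit partial sum, as their header comments state ("Returns: unsigned long sum"). Producing the final 16-bit Internet checksum is left to the caller; a minimal csum_fold-style sketch of that last step, with an invented name and not part of the Motorola source, is:

    #include <stdint.h>

    static uint16_t fold_partial_sum(uint32_t sum)
    {
        sum = (sum & 0xffffu) + (sum >> 16);   /* add high halfword into low halfword */
        sum = (sum & 0xffffu) + (sum >> 16);   /* second pass absorbs any carry */
        return (uint16_t)~sum;                 /* ones' complement is the checksum */
    }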
\ No newline at end of file diff --git a/liboil/motovec/vec_memcmp.S b/liboil/motovec/vec_memcmp.S new file mode 100644 index 0000000..d0117fa --- /dev/null +++ b/liboil/motovec/vec_memcmp.S @@ -0,0 +1,340 @@ +//#define __MWERKS__ +//------------------------------------------------------------------ +// file: vec_memcmp.S +// AltiVec enabled version of memcmp +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern int vec_memcmp(const void *ptr1, const void *ptr2, size_t len); +// Returns: +// value < 0 if ptr1[0:len] < ptr2[0:len] +// value = 0 if ptr1[0:len] == ptr2[0:len] +// value > 0 if ptr1[0:len] > ptr2[0:len] +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 05/27/03 + + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 + + // Macros for bits in CR6 +#define _all 6*4+0 +#define _none 6*4+2 + // Macros for condition to be true/false and unlikely/likely to be taken +#define _F_u 4 +#define _T_u 12 +#define _T_l 13 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define PT1 r3 // entering: ptr1; exiting: return value + +#define SRC r4 // entering: ptr2; then ptr2+16 if ptr1[28:31]<ptr2[28:31] + +#define BC r5 // entering: Byte_Count +#define BCM1 r5 // then Byte_Count -1 + +#define DMS r6 // ptr1 - ptr2 initially +#define S2 r6 // ptr2 bytes initially + +// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )" +// if you don't put the comment right after the r7. 
CJC 030314 +#define SM1 r7// ptr2 -1 for byte-by-byte forwards initially +#define S r7 // ptr2[28:31] +#define BK r7 // byte index + +#define DM1 r8// ptr1 -1 for byte-by-byte forwards initially +#define D r8 // ptr1[28:31] + +#define PBC r9 // ptr1 + byte count initially + +#define S1 r10 // ptr1 bytes initially +#define PBK r10 // (ptr1+byte_count-1) + +#define DR r11 // (ptr1+16)[0:27] +#define QW r11 // number of quad words (vectors) + +#define RSV r12 // storage for VRSAVE register if used + +#define VS1 v0 // source 1 as a vector of 16 bytes + +#define VS2 v1 // source 2 as a vector of 16 bytes + +#define VS2b v2 // second source 2 vector for permuting +#define VS12B v2 // octet shift count of 12 +#define VMB3 v2 // mismatch shifted right 3 bytes + +#define VP1 v3 // source 1 permute register +#define VSH16 v3 // octet shift count of 16 bits/2 octets +#define VMW3 v3 // mismatch shifted right 3 words + +#define VS1B v4 // octet shift count of 1 + +#define V0 v5 // all zeros + +#define VP2 v6 // source 2 permute register +#define VS4B v6 // octet shift count of 4 +#define VMB1 v6 // mismatch shifted right one byte + +#define VSH1 v7 // shift count of 1 bit +#define VS8B v7 // octet shift count of 8 octets +#define VMB2 v7 // mismatch shifted right 2 bytes + +#define V1 v8 // all ones +#define VCE v8 // equality compare destination register + +#define VS2a v9 // first source 2 vector for permuting +#define VMW1 v9 // mismatch shifted right one word + +#define VP3 v10 // ptr1-ptr2 permute register +#define VMW2 v10 // mismatch shifted right 2 words + +#define VM v11 // mask for right end of 1st S1 vector + +#define VP4 v12 // last mask permute vector +#define VLM v12 // last mask register + +#define VMM v13 // vector of zeroes with ones at mismatch(es) and DN + +// Condition register use +// cr0[0:2] = (ptr1-ptr2==0)? return +// then cr0[0:2] = (ptr1[28:31]-ptr2[28:31]<0)? "Need more S2?"; +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (QW == 0)? 1 : 0; (Any full vectors to move?) +// cr5[2] = ((PBK = PT1+BC)[28:31] = 0)? 1 : 0; (S1N right justified) +// cr6[0] = (S1 == S2)?1:0; (By vector) +// then cr6[2] = (S2 > S1)? 1 : 0; (At mismatched byte) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memcmp +memcmp: +#else + .global vec_memcmp +vec_memcmp: +#endif + subf. DMS,SRC,PT1 // IU1 Compute ptr1-ptr2 difference + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + add PBC,PT1,BC // IU1 Address of last byte + 1 + addi SM1,SRC,-1 // IU1 Pre-bias and duplicate ptr2 + addi DR,PT1,16 // IU1 Duplicate s1 pointer + beq Dumb_exit // return if PT1 = SRC + + addi PBK,PBC,-1 // IU1 Address of last ptr1 byte + addi DM1,PT1,-1 // IU1 Pre-bias and duplicate ptr1 + rlwinm DR,DR,0,0,27 // IU1 (PT1+16)[0:27] + beq cr1,Dumb_exit // return if BC = 0 + + subf QW,DR,PBK // IU1 Bytes of full vectors to move (-16) + rlwinm PBC,PBC,0,28,31 + bgt cr7,v_memcmp // do as vectors if BC>MIN_VEC + +// Compare byte-by-byte if BC<=MIN_VEC + mtctr BC // i=BC; do if...;i--; while (i>0) +Cmp_nxt_byte: + lbzu S2,1(SM1) // LSU + lbzu S1,1(DM1) // LSU + subf. 
PT1,S2,S1 // IU1 if (*s1++ == *s2++) + bdnzt 2,Cmp_nxt_byte // b while equal and bytes left + blr +Dumb_exit: + xor PT1,PT1,PT1 // IU1 return zero + blr + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +v_memcmp: +// Byte count < MIN_VEC bytes will have been compared by scalar code above, +// so this will not deal with small block compares < MIN_VEC. + +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + rlwinm S,SRC,0,28,31 // IU1 Save ptr2 address bits s[28:31] + +#ifdef VRSAVE + oris Rt,RSV,0xfff8 // IU1 Or in registers used by this routine +#endif + rlwinm D,PT1,0,28,31 // IU1 D = ptr1[28:31] + cmpi cr1,0,QW,0 // IU1 Any full vectors to move? + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvxl VS1,0,PT1 // LSU Get source1 load started (load as LRU) + subf. S,S,D // IU1 Is s2 longer than s1? (28:31 greater?) + li BK,16 // IU1 Byte index pointer + + lvxl VS2,0,SRC // LSU Get source2 load started (load as LRU) + vor VS2b,VS2,VS2 // VIU1 Preset second s2 vector if not loaded + addi BCM1,BC,-1 // IU1 Index to last s2 byte +// Decide if second vector of S2 is needed to compare to first vector of S1 + bge Around // b if initial S1 is shorter than or equal S2 + + lvxl VS2b,SRC,BK // LSU Otherwise, we need more of s2 + addi SRC,SRC,16 // IU1 Increment s2 pointer + addi BCM1,BCM1,-16 // IU1 Correction for last byte +Around: + + lvsl VP1,0,PT1 // LSU Set permute vector for s1 shift left + vspltisb VS1B,8 // VPU Create a shift count for 1 octet/8 bits + vxor V0,V0,V0 // VIU1 Create a vector of all zeroes + + lvsl VP2,0,SRC // LSU Set permute vector for s2 shift left + vspltisb VSH1,1 // VPU Create a shift count of 1 bit + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + lvsr VP3,0,DMS // LSU Set permute vector for S2-S1 difference + cmpi cr5,0,PBC,0 // IU1 Will last byte of S2 be rt justified? + vperm VM,V0,V1,VP1 // VPU Mask as long as our subset of 1. + + + lvsr VP4,0,PBC // VIU1 Permute vector for bytes rt of end +// Dealing with first S1 Vector - Permute S1 and S2 (possibly + S2b) to left edge + vperm VS1,VS1,V1,VP1 // VPU Left align s1 with ones as pad + + vperm VS2,VS2,VS2b,VP2 // VPU Left align s2 and s2+ + + vslb VSH16,VS1B,VSH1 // VPU Shift count for 16 bits/2 octets + vor VS2,VS2,VM // VIU1 s2 now has identical ones padding to s1 + + vslb VS4B,VSH16,VSH1 // VPU Create a shift count for 4 octets + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 = s2? + + vslb VS8B,VS4B,VSH1 // VPU Create a shift count for 8 octets + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2 + + ble cr1,Last_ld // b if there are no QW to do + mtctr QW // IU2 i=QW; do ...; while (i-- > 0) + +// Dealing with middle vectors +memcmp_NA_next_v: + lvxl VS2a,SRC,BK // LSU Get next 16 bytes of s2 + + lvxl VS1,PT1,BK // LSU Get next 16 bytes of s1 + addi BK,BK,16 // IU1 Increment byte index + + vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2 + vor VS2b,VS2a,VS2a // VIU1 Move upper vector to lower + + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ? 
+ + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bdnzt 24,memcmp_NA_next_v // b if more whole QWs to do and s1==s2 + + bc _F_u,_all,memcmp_final_v_NE // b if s1 != s2 + +// Dealing with last vector +Last_ld: + lvxl VS2a,SRC,BCM1 // LSU Last load of s2 (perhaps redundant) + vperm VLM,V0,V1,VP4 // VPU Ones mask for bytes rt of end + + lvxl VS1,PT1,BK // LSU Last load of s1 + + vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2 + beq cr5,Rt_just // b if final S1 byte is rt justified + + vor VS2,VS2,VLM // VIU1 Set uninvolved bytes at end + + vor VS1,VS1,VLM // VIU1 Set bytes at end of s1 +Rt_just: + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ? + + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2 + + xor PT1,PT1,PT1 // IU1 Will return zero if strings are equal +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return 0 if s1 == s2 + +memcmp_final_v_NE: + // s1 != s2, We're going to create a mask to mask off everything to + // the right of the first mismatching byte so we know we are just + // looking at the string up to the mismatch. + + vsum4ubs VS12B,VS1B,VS8B // VIU2 Create a shift count for 12 octets + + vsro VMW1,VMM,VS4B // VPU Shift the compare result one word right + vsrw VMB1,VMM,VS1B // VIU1 Shift compare result 8 bits right + + vsro VMW2,VMM,VS8B // VPU Shift the compare result 2 words right + vsrw VMB2,VMM,VSH16 // VIU1 Shift compare result 16 bits right + + vsro VMW3,VMM,VS12B // VPU Shift the compare result 3 words right + vor VM,VMW1,VMW2 // VIU1 Mask of words one and 2 to the right + + vsro VMB3,VMB2,VS1B // VPU Shift compare result 3 bytes right + vor VM,VM,VMW3 // VIU1 Mask of MM 1,2,&3 words to the right + + vcmpgtuw VM,VM,V0 // VIU1 Mask of all ones in words to the right + + vor VM,VM,VMB1 // VIU1 Or in first byte to right + + vor VM,VM,VMB2 // VIU1 Or in second byte to right + + vor VM,VM,VMB3 // VIU1 Or in third byte to right + + vor VS2,VS2,VM // VIU1 Set bytes right of mismatch + + vor VS1,VS1,VM // VIU1 Set bytes right of mismatch + li r3,-1 // IU1 Return -1 if s1 < s2 + + vcmpgtub. VCE,VS2,VS1 // VIU1 Compute s2 > s1 for all bytes +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + bclr _F_u,_none // s1 < s2 in first byte with a mismatch + +S2_lt_S1: li r3,1 // IU1 Return +1 if s1 > s2 + blr // s1 > s2 in first byte with a mismatch + +// End of memcmp in AltiVec + diff --git a/liboil/motovec/vec_memcpy.S b/liboil/motovec/vec_memcpy.S new file mode 100644 index 0000000..f280393 --- /dev/null +++ b/liboil/motovec/vec_memcpy.S @@ -0,0 +1,876 @@ +//------------------------------------------------------------------ +// file: vec_memcpy.S +// AltiVec enabled version of memcpy and bcopy +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. 
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * memcpy(void *dst, const void *src, size_t len); +// Returns: +// void *dst +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * memmove( void *dst, const void *src, size_t len ); +// Copies len characters from src to dst and returns the value of +// dst. Works correctly for overlapping memory regions. +// - Harbison&Steele 4th ed (corrected as to return) +// Returns: +// void *dst +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * bcopy(const void *src, void *dst, size_t len); +// Returns: +// void *dst +//------------------------------------------------------------------ + +// memcpy and memmove are combined into one entry point here because of +// the similarity of operation and need to create fool-proof code. +// The following conditions determine what is "fool proof": +// +// if: then single entry: +// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy +// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy +// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte +// (DST-SRC)==0 || BC==0 will just return +// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte +// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove +// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memmove + +// If you call memmove (or vec_memmove) and |DST-SRC|>=BC, +// this code will branch to v_memcpy anyway for maximum performance. + +// Revision History: +// Rev 0.0 Original Chuck Corley 02/03/03 +// Can still add dst, 128B loop, and aligned option +// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03 +// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03 +// (Creating separate path for QW aligned didn't help much) +// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03 +// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03 +// Rev 0.21 Inproved loop branch targets for v_mempcy CJC 03/01/03 +// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03 +// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03 +// Rev 0.24 Use two dst streams CJC 03/12/03 +// Rev 0.25 Fix for all compilers, cleanup, and release with +// libmotovec.a rev 0.10 CJC 03/14/03 +// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03 +// +// Between Rev 0.25 and 0.30 the code was revised to store elements of +// source at destination when first and/or last vector are less than 16 +// bytes. 
Areviewer at SNDF observed that loading the destination vector +// for merging exposed the "uninvolved" destination bytes to incoherency +// if an interrupt pre-empted this routine and modified the "uninvolved" +// destination vector(s) while held in register for merging. It seems +// like a low possibility but this revision is no longer subject to that +// possibility. (It is also slightly faster than Rev 0.25.) +// This is beta quality code; users are encouraged to make it faster. +// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 +// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap. +#define MIN_OVL 128 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dst pointer; exiting: same dst pointer + +#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove + +#define BC r5 // entering: Byte_Count + +#define PCS r6 // save for partial checksum entering + +#define DMS r7 // dst - src initially +#define BK r7 // BC - 1 +/- (n*16) + +// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )" +// if you don't put the comment right after the r7. CJC 030314 +#define SM1 r8// src -1 for byte-by-byte forwards initially +#define S r8 // src[28:31] +#define SMD r8 // src[0:27]-dst[0:27] +#define STR r8 // data stream touch block & stride info for Big_loop + +#define DM1 r9// dst -1 for byte-by-byte forwards initially +#define D r9 // dst[28:31] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define SBC r10// src + byte count initially then src[28:31] +#define BLK r10 // temporary data stream touch block & stride info +#define DR r10 // (dst+16)[0:27] +#define QW r10 // number of quad words (vectors) + +#define DBC r11// dst + byte count initially +#define BLL r11 // temporary data stream touch block & stride info +#define SBK r11 // (src+byte_count-1) +#define SBR r11 // (src+byte_count-1)[0:27] +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31] +#define BIG r11 // QW/8 or 128 byte loop count +#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call + +#define RSV r12 // storage for VRSAVE register if used + +#define VS0 v0 // src vector for permuting + +#define VS1 v1 // src vector for permuting + +#define VP3 v2 // d - s permute register + +#define VPS0 v3 // permuted source vector to store + +#define VPS1 v4 // 2nd permuted source vector to store + +#define VPS2 v5 // additional permuted src in Big loop + +#define VS2 v6 // src vector for permuting +#define VPS3 v6 // additional permuted src in Big loop + +#define VS3 v7 // additional src load in Big loop +#define VPS4 v7 // additional permuted src in Big loop + +#define VS4 v8 // additional src load in Big loop +#define VPS5 v8 // additional permuted src in Big loop + +#define VS5 v9 // additional src load in Big loop +#define VPS6 v9 // additional permuted src in Big loop + +#define VS6 v10 // additional src load in Big loop +#define VPS7 v10 // additional permuted src in Big loop + +#define VS7 v11 // additional src load in Big loop + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcbz is a performance boost. 
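Because several toolchains will not assemble dcba directly, the macros that follow splice the instruction in as a raw .long. Those opcode words can be cross-checked by rebuilding them from the X-form fields (primary opcode 31, extended opcode 758, rA/rB = the two address registers). The helper below is a self-contained editorial sketch with an invented name, not part of the original source:

    #include <stdio.h>
    #include <stdint.h>

    /* Assemble a PowerPC X-form instruction word from its fields. */
    static uint32_t ppc_xform(uint32_t opcd, uint32_t rt, uint32_t ra,
                              uint32_t rb, uint32_t xo)
    {
        return (opcd << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (xo << 1);
    }

    int main(void)
    {
        /* dcba DST,BK = dcba r3,r7 -> prints 0x7c033dec, the DCBK word below */
        printf("0x%08x\n", ppc_xform(31, 0, 3, 7, 758));
        /* dcba DST,BL = dcba r3,r9 -> prints 0x7c034dec, the DCBL word below */
        printf("0x%08x\n", ppc_xform(31, 0, 3, 9, 758));
        return 0;
    }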
+#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c033dec +// dcba r3,r7 or dcba DST,BK +#define DCBL .long 0x7c034dec +// dcba r3,r9 or dcba DST,BL +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c033dec +.endm +.macro DCBL +.long 0x7c034dec +.endm +#else +#define DCBK dcba DST,BK +#define DCBL dcba DST,BL +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#define DCBL nop +#endif // NO_DCBA + +// Conditionalize the use of dst (data stream touch). It will help +// if the data is not in cache and hurt if it is (though not as badly +// as dcbz). Generally, except for small benchmarks repeated many times, +// we assume data is not in cache (data streaming) and using dst is a +// performance boost. +#ifndef NO_DST +#define STRM_B dst SBC,BLL,0 +#define STRM_F dst SRC,BLK,0 +#define STRM_1 dst SP8,STR,1 + +#else +#define STRM_B nop +#define STRM_F nop +#define STRM_1 nop +#endif + +// Condition register use +// cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;); +// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right"; +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified) +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified) +// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?) +// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?) +// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?) +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?) +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment) +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?) +// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?) +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?) + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memmove +memmove: + nop // IU1 Compilers forget first label + .global memcpy +memcpy: +#else + .global vec_memmove +vec_memmove: + nop // IU1 Only way I know to preserve both labels + .global vec_memcpy +vec_memcpy: +#endif + subf. 
DMS,SRC,DST // IU1 Compute dst-src difference + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd + beqlr // return if DST = SRC + + add DBC,DST,BC // IU1 Pre-bias and duplicate destination + subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0 + beqlr cr1 // return if BC = 0 + + bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward) + cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy + bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd) + +// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC + mtctr BC // i=BC; do ...;i--; while (i>0) +Byte_cpy_fwd: + lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1) + stbu Rt,1(DM1) // LSU + bdnz Byte_cpy_fwd + + blr + nop // IU1 Improve next label as branch target +Cpy_bkwd: + cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy + cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0; + bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd) +// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC + mtctr BC // i=BC; do ...;i--; while (i>0) +Byte_cpy_bwd: + lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC) + stbu Rt,-1(DBC) // LSU Store it + bdnz Byte_cpy_bwd + blr + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif + +v_memmove: +// Byte count < MIN_VEC bytes will have been copied by scalar code above, +// so this will not deal with small block moves < MIN_VEC. + +// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + bge cr6,MC_entry // b to v_memcpy if DST-SRC>=BC (fwd copy OK) + +#ifdef VRSAVE + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine +#endif + lis BLL,0x010c // IU1 Stream 12 blocks of 16 bytes + subf. SMD,D,S // IU1 if S-D<0 essentially shifting right + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + ori BLL,BLL,0xffe0 // IU1 Stream stride -32B + + STRM_B // LSU Start data stream at SRC+BC + addi SBK,SBC,-1 // IU1 Address of last src byte + bgt Rt_shft // Bytes from upper vector = (s-d>0)?s-d:16+s-d; + addi SMD,SMD,16 // IU1 Save 16-(d-s) +Rt_shft: + + rlwinm SBR,SBK,0,0,27 // IU1 (SRC+BC-1)[0:27] + addi BK,BC,-1 // IU1 Initialize byte index + + subf Rt,SBR,SBC // IU1 How many bytes in first source? + add DBK,DST,BK // IU1 Address of last dst byte + addi DR,DST,16 // IU1 Address of second dst vector + + subf. SMD,Rt,SMD // IU1 if bytes in 1st src>Bytes in 1st permute + rlwinm Rt,DBK,0,28,31 // IU1 (DST+BC-1)[28:31] + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + +// If there are more useful bytes in the upper vector of a permute pair than we +// will get in the first permute, the first loaded vector needs to be in the +// lower half of the permute pair. The upper half is a don't care then. + blt Get_bytes_rt // b if shifting left (D-S>=0) + + lvx VS1,SRC,BK // LSU Get SN load started +// Comments numbering source and destination assume single path through the +// code executing each instruction once. For vec_memmove, an example would +// be the call memmove(BASE+0x0F, BASE+0x2F, 82). 
N = 6 in that case. + addi SRC,SRC,-16 // IU1 Decrement src base (to keep BK useful) + +Get_bytes_rt: // Come here to get VS0 & Don't care what VS1 is + lvx VS0,SRC,BK // LSU Get SN-1 (SN if D-S<0) in lower vector + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr7,0,Rt,0xF // IU1 Is Dn right justified? + + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + add Rt,DST,BC // IU1 Refresh the value of DST+BC + + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-1 and SN to DN + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + beq cr7,Rt_just // b if DN is right justified + + mtcrf 0x01,Rt // IU2 Put final vector byte count in cr7 + rlwinm DBK,DBK,0,0,27 // IU1 Address of first byte of final vector + li D,0 // IU1 Initialize an index pointer + bnl cr7,Only_1W_bkwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_bkwd: + bng cr7,Only_2W_bkwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_bkwd: + bne cr7,Only_B_bkwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_bkwd: + bns cr7,All_done_bkwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_bkwd + +Rt_just: + stvx VPS0,DST,BK // LSU Store 16 bytes at DN +All_done_bkwd: + addi BK,BK,-16 // IU1 Decrement destination byte count + + ble cr6,Last_load // b if no Quad words to do + mtctr QW // IU2 for (i=0;i<=QW;i++)-execution serializng + cmpi cr6,0,QW,4 // IU1 Check QW>4 +QW_loop: + lvx VS0,SRC,BK // LSU Get SN-2 (or SN-1 if ADJ==0) + + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-2 and SN-1 to DN-1 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-1 + addi BK,BK,-16 // IU1 Decrement byte kount + bdnzf 25,QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+BC-1-16) + bgt cr6,GT_4QW // b if >4 quad words left + +Last_load: // if D-S>=0, next load will be from same address as last + blt No_ld_bkwd // b if shifting right (S-D>=0) + addi SRC,SRC,16 // IU1 recorrect source if it was decremented +No_ld_bkwd: + lvx VS0,0,SRC // LSU Get last source SN-6 (guaranteed S0) +// Current 16 bytes is the last; we're done. + dss 0 // Data stream stop + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-6 and SN-5 to DN-6 + subfic D,DST,16 // IU1 How many bytes in first destination? 
+ beq cr1,Lt_just // b if last destination is left justified + + mtcrf 0x01,D // IU2 Put byte count remaining in cr7 + li D,0 // IU1 Initialize index pointer + bns cr7,No_B_bkwd // b if only even number of bytes to store + + stvebx VPS0,DST,D // LSU store first byte at DST+0 + addi D,D,1 // IU1 increment index +No_B_bkwd: + bne cr7,No_H_bkwd // b if only words to store + stvehx VPS0,DST,D // LSU store halfword at DST+0/1 + addi D,D,2 // IU1 increment index + +No_H_bkwd: + bng cr7,No_W1_bkwd // b if exactly zero or two words to store + stvewx VPS0,DST,D // LSU store word 1 of one or three + addi D,D,4 // IU1 increment index + +No_W1_bkwd: + bnl cr7,No_W2_bkwd // b if there was only one word to store + stvewx VPS0,DST,D // LSU store word 1 of two or 2 of three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DST,D // LSU store word 2 of two or 3 of three + b No_W2_bkwd + +Lt_just: + stvx VPS0,0,DST // LSU Store 16 bytes at final dst addr D0 +No_W2_bkwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +GT_4QW: // Do once if next store is to even half of cache line, else twice + + lvx VS0,SRC,BK // LSU Get SN-3 (or SN-2) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0; + + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-3 and SN-2 to Dn-2 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + addi DNX,DNX,-16 // IU1 Prepare to update cr6 next loop + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-2 + vor VS3,VS0,VS0 // VIU Make a copy of lower vector + addi BK,BK,-16 // IU1 Decrement byte count by 16 + bdnzt 27,GT_4QW // b if next store is to upper (odd) half of CL +// At this point next store will be to even address. + + lis STR,0x102 // IU1 Stream 2 blocks of 16 bytes + mtcrf 0x02,DST // IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?) + addi BL,BK,-16 // IU1 Create an alternate byte count - 16 + + ori STR,STR,0xffe0 // IU1 Stream stride -32B + addi SP8,SRC,-64 // IU1 Starting address for data stream touch + bso cr6,B32_bkwd // b if DST[27] == 1; i.e, final store is odd + + bdnz B32_bkwd // decrement counter for last odd QW store +B32_bkwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS2,SRC,BK // LSU Get SN-4 (or SN-3) + addi SP8,SP8,-32 // IU1 Next starting address for data stream touch + + lvx VS1,SRC,BL // LSU Get SN-5 (or SN-4) + vperm VPS0,VS2,VS3,VP3 // VPU Align SN-4 and SN-3 to DN-3 + + STRM_1 // LSU Stream 64 byte blocks ahead of loads + + DCBL // LSU allocate next cache line + + vperm VPS1,VS1,VS2,VP3 // VPU Align SN-5 and SN-4 to DN-4 + vor VS3,VS1,VS1 // VIU1 Move SN-5 to SN-3 + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-3 + addi BK,BL,-16 // IU1 Decrement byte count + bdz Nxt_loc_bkwd // always decrement and branch to next instr + +Nxt_loc_bkwd: + stvx VPS1,DST,BL // LSU Store 16 bytes at DN-4 + addi BL,BK,-16 // IU1 Decrement alternate byte count + bdnz B32_bkwd // b if there are at least two more QWs to do + + bns cr6,One_odd_QW // b if there was one more odd QW to store + b Last_load + +// Come here with two more loads and two stores to do +One_odd_QW: + lvx VS1,SRC,BK // LSU Get SN-6 (or SN-5) + + vperm VPS1,VS1,VS3,VP3 // VPU Align SN-6 and SN-5 to DN-5 + + stvx VPS1,DST,BK // LSU Store 16 bytes at DN-5 + + b Last_load + +// End of memmove in AltiVec + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +v_memcpy: +// Byte count < MIN_VEC bytes will have been copied by scalar code above, +// so this will not deal with small block moves < MIN_VEC. 
+ +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + +MC_entry: // enter here from memmove if DST-SRC>=BC; this should be faster +#ifdef VRSAVE + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine +#endif + lis BLK,0x010c // IU1 Stream 12 blocks of 16 bytes + + subf. S,S,D // IU1 if D-S<0 essentially shifting left + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + ori BLK,BLK,32 // IU1 Stream stride 32B + + STRM_F // LSU Start data stream 0 at SRC + addi DR,DST,16 // IU1 Address of second dst vector + addi DBK,DBC,-1 // IU1 Address of last dst byte + +// If D-S<0 we are "kinda" shifting left with the right shift permute vector +// loaded to VP3 and we need both S0 and S1 to permute. If D-S>=0 then the +// first loaded vector needs to be in the upper half of the permute pair and +// the lower half is a don't care then. + bge Ld_bytes_rt // b if shifting right (D-S>=0) + + lvx VS0,0,SRC // LSU Get S0 load started +// Comments numbering source and destination assume single path through the +// code executing each instruction once. For vec_memcpy, an example would +// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case. + addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful) + +Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is + lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + + subf Rt,DST,DR // IU1 How many bytes in first destination? + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + li BK,0 // IU1 Initialize byte kount index + + mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + beq cr1,Left_just // b if D0 is left justified + + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx VPS0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx VPS0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx VPS0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx VPS0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? 
+ cmpi cr7,0,QW,14 // IU1 Check QW>14 + ble cr6,Last_ld_fwd // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 +QW_fwd_loop: + lvx VS1,SRC,BK // LSU Get S2 (or S1) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4) + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_ld_fwd: // Next 16 bytes is the last; we're done. + add DBC,DST,BC // IU1 Recompute address of last dst byte + 1 + add SBC,SRC,BC // IU1 Recompute address of last src byte + 1 + bge No_ld_fwd // b if shifting right (D-S>=0) + + addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src +No_ld_fwd: + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + addi DBK,DBC,-1 // IU1 Recompute address of last dst byte + addi Rt,SBC,-1 // IU1 Recompute address of last src byte + +// If D-S<0 we have already loaded all the source vectors. +// If D-S>=0 then the first loaded vector went to the upper half of the permute +// pair and we need one more vector. (This may be a duplicate.) + + lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN) + +#ifndef NO_DST + dss 0 // Data stream 0 stop + + dss 1 // Data stream 1 stop +#endif + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14 + beq cr1,Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li D,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + + stvx VPS0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + + addi DNX,DNX,16 // IU1 Update cr6 for next loop + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + + vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) 
+ lis STR,0x104 // IU1 Stream 4 blocks of 16 bytes + addi BL,BK,16 // IU1 Create an alternate byte kount + 32 + + ori STR,STR,32 // IU1 Stream stride 32B +#ifndef NO_BIG_LOOP + rlwinm BIG,Rt,29,3,31 // IU1 QW/8 big loops to do + + rlwinm Rt,Rt,0,0,28 // IU1 How many QWs will be done in big loop + bgt cr7,Big_loop // b if QW > 14 +#endif +No_big_loop: +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + + addi SP8,SRC,256 // IU1 Starting address for data stream touch + xoris STR,STR,0x6 // IU1 Reset stream to 2 blocks of 16 bytes + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS1,SRC,BK // LSU Get S12 + addi SP8,SP8,32 // IU1 Next starting address for data stream touch + + lvx VS2,SRC,BL // LSU Get S13 + vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11 + + STRM_1 // LSU Stream 64 byte blocks ahead of loads + + DCBK // LSU then Kill instead of RWITM + + vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12 + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + + stvx VPS1,DST,BK // LSU Store 16 bytes at D11 + addi BK,BL,16 // IU1 Increment byte count + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx VPS0,DST,BL // LSU Store 16 bytes at D12 + addi BL,BK,16 // IU1 Increment alternate byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + b Last_ld_fwd // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + lvx VS1,SRC,BK // LSU Get S14 (or S13 if if D-S>=0) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b Last_ld_fwd + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +Big_loop: + subf QW,Rt,QW // IU1 Should be 2-7 QWs left after big loop + blt cr5,No_big_loop // b back if |DST-SRC|<128; Big_loop won't work. 
+ mtctr BIG // IU2 loop for as many 128B loops as possible + addi SP8,SRC,256 // IU1 Starting address for data stream touch + +Loop_of_128B: // Come here with QW>=10 and next store even; VS0 last load + lvx VS1,SRC,BK // LSU Get S4 (or S3 if D-S>=0) + addi BL,BK,32 // IU1 Increment Byte_Kount+16 by 32 + addi SP8,SP8,128 // IU1 increment address for data stream touch + + lvx VS3,SRC,BL // LSU Get S6 (or S5) + addi BL,BL,32 // IU1 Increment Byte_Kount+48 by 32 + + lvx VS5,SRC,BL // LSU Get S8 (or S7) + addi BL,BL,32 // IU1 Increment Byte_Kount+80 by 32 + + lvx VS7,SRC,BL // LSU Get S10 (or S9) + addi BL,BK,16 // IU1 Increment Byte_Kount+16 by 16 + + lvx VS2,SRC,BL // LSU Get S5 (or S4) + addi BL,BL,32 // IU1 Increment Byte_Kount+32 by 32 + + lvx VS4,SRC,BL // LSU Get S7 (or S6) + addi BL,BL,32 // IU1 Increment Byte_Kount+64 by 32 + + lvx VS6,SRC,BL // LSU Get S9 (or S8) + addi BL,BL,32 // IU1 Increment Byte_Kount+96 by 32 + vperm VPS0,VS0,VS1,VP3 // VPU + + lvx VS0,SRC,BL // LSU Get S11 (or S10) + vperm VPS1,VS1,VS2,VP3 // VPU + + STRM_1 // LSU Stream 4 32B blocks, stride 32B + + DCBK // LSU then Kill instead of RWITM + + stvx VPS0,DST,BK // LSU Store D3 + addi BK,BK,16 // IU1 Increment Byte_Kount+16 by 16 + vperm VPS2,VS2,VS3,VP3 // VPU + + stvx VPS1,DST,BK // LSU Store D4 + addi BK,BK,16 // IU1 Increment Byte_Kount+32 by 16 + vperm VPS3,VS3,VS4,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS2,DST,BK // LSU Store D5 + addi BK,BK,16 // IU1 Increment Byte_Kount+48 by 16 + vperm VPS4,VS4,VS5,VP3 // VPU + + stvx VPS3,DST,BK // LSU Store D6 + addi BK,BK,16 // IU1 Increment Byte_Kount+64 by 16 + vperm VPS5,VS5,VS6,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS4,DST,BK // LSU Store D7 + addi BK,BK,16 // IU1 Increment Byte_Kount+80 by 16 + vperm VPS6,VS6,VS7,VP3 // VPU + + stvx VPS5,DST,BK // LSU Store D8 + addi BK,BK,16 // IU1 Increment Byte_Kount+96 by 16 + vperm VPS7,VS7,VS0,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS6,DST,BK // LSU Store D9 + addi BK,BK,16 // IU1 Increment Byte_Kount+112 by 16 + + stvx VPS7,DST,BK // LSU Store D10 + addi BK,BK,16 // IU1 Increment Byte_Kount+128 by 16 + bdnz Loop_of_128B // b if ctr > 0 (QW/8 still > 0) + + mtctr QW // IU1 Restore QW remaining to counter + addi BL,BK,16 // IU1 Create an alternate byte kount + 16 + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // b and decrement counter for last QW store odd + // One of the above branches should have taken + +// End of memcpy in AltiVec + +// bcopy works like memcpy, but the source and destination operands are reversed. +// Following will just reverse the operands and branch to memcpy. 
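For reference, the argument swap implemented below is equivalent to the following C; bcopy_ref is an illustrative name, and the assembly simply exchanges r3 and r4 before branching to memcpy.

#include <string.h>

/* Illustrative equivalent of the wrapper below: bcopy takes (src, dst, len),
 * memcpy takes (dst, src, len). */
void bcopy_ref(const void *src, void *dst, size_t len)
{
    memcpy(dst, src, len);
}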
+ +#ifdef LIBMOTOVEC + .global bcopy +bcopy: +#else + .global vec_bcopy +vec_bcopy: +#endif + mr Rt,DST // temp storage for what is really source address (r3) + mr DST,SRC // swap destination address to r3 to match memcpy dst + mr SRC,Rt // Complete swap of destination and source for memcpy +#ifdef LIBMOTOVEC + b memcpy // b to memcpy with correct args in r3 and r4 +#else + b vec_memcpy // b to vec_memcpy with correct args in r3 and r4 +#endif +// End of bcopy in AltiVec diff --git a/liboil/motovec/vec_memset.S b/liboil/motovec/vec_memset.S new file mode 100644 index 0000000..2b00e80 --- /dev/null +++ b/liboil/motovec/vec_memset.S @@ -0,0 +1,553 @@ +//------------------------------------------------------------------ +// file: vec_memset.S +// AltiVec enabled version of memset and bzero and cacheable_memzero +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2002 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void *memset( void *ptr, int val, size_t len ); +// Copies val into each of len characters beginning at ptr. +// - Harbison&Steele 4th ed +// (despite val being an int, this memset assumes it is never +// more than a byte. That seems to be correct from all the +// memset functions I've seen but I don't know if ANSI allows +// anthing longer. Chuck Corley 12/21/02) +// Returns: +// void * ptr +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * bzero( char *ptr, int len); +// Copies 0 into each of len characters at ptr. +// - Harbison&Steele 4th ed +// Returns: +// void * ptr +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 02/09/03 +// Could benefit from changes added to memcpy +// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03 +// +// This is beta quality code; users are encouraged to make it faster. 
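On the prototype question raised in the header comment above: ISO C specifies that memset converts its int argument to unsigned char before filling, so using only the low byte is standard-conforming behavior rather than just a common-case assumption. A reference version in C (memset_ref is an illustrative name):

#include <stddef.h>

/* Reference semantics: the int fill value is reduced to its low byte. */
void *memset_ref(void *ptr, int val, size_t len)
{
    unsigned char *p = ptr;
    unsigned char b = (unsigned char)val;   /* only the low byte is used */
    size_t i;

    for (i = 0; i < len; i++)
        p[i] = b;
    return ptr;
}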
+// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) +// Zero fill could be quite likely. +// Moving fill byte from GPR to VR as below faster than stw->lvebx via stack + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dest pointer; exiting: same dest pointer + +#define FILL r4 // entering: fill char then fill word + +#define BC r5 // entering: Byte_Count then remaining Byte_Count + +#define DBC r6// dst + byte count + +#define BK r7 // BC - 1 +/- (n*16) + +#define Fsh r8 // fill byte shifted right one nibble + +#define DM1 r9// dst -1 for byte-by-byte backwards initially +#define D r9 // (dst+16)[0:27] - dst[28:31] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define DR r10 // (dst+16)[0:27] +#define QW r10 // number of cache lines + +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31] + +#define RSV r12 // storage for VRSAVE register if used + +// Condition register use (not including temporary cr0) +// cr0[2] = (FILL==0)? +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified) +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified) +// cr6[2] = (QW == 0)? 1 : 0; +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?) +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment) +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?) +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?) + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcba is a performance boost. +// We use dcba which will noop to non-cacheable memory rather than +// dcbz which will cause an aligment exception. +#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c033dec +// dcba r3,r7 or dcba DST,BK +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c033dec +.endm +#else +#define DCBK dcba DST,BK +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#endif // NO_DCBA + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memset +memset: +#else + .global vec_memset +vec_memset: +#endif + + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count + rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift + + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + addi DR,DST,16 // IU1 Address of second dst vector + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,v_memset // b if BC>MIN_VEC + + mtctr BC // for (i=1;i<=BC;i++) + beqlr cr1 // return if BC = 0 +Byte_set: + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL + bdnz Byte_set + + blr + +v_memset: +// Byte count < MIN_VEC bytes will have been set by scalar code above, +// so this will not deal with small block sets < MIN_VEC. + +// For systems using VRSAVE, define VRSAV=1 when compiling. 
For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last dst byte + +#ifdef VRSAVE + oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine +#endif + subf D,DST,DR // IU1 How many bytes in first destination? + li BK,0 // IU1 Initialize byte kount index + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + vxor v0,v0,v0 // VIU Clear v0 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr1,0,D,16 // IU1 Is D0 left justified? + beq+ enter_bzero // b if FILL==0 + + lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR + vspltisb v1,4 // VPU Splat 0x4 to every byte + + lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR + + vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3] + + vor v0,v0,v2 // VIU Form FILL byte in VR[0:7] + + vspltb v0,v0,0 // VPU Splat the fill byte to all bytes +enter_bzero: + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + beq cr1,Left_just // b if D0 is left justified + + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx v0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx v0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx v0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx v0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? + ble cr6,Last_QW // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 + +QW_loop: + stvx v0,DST,BK // LSU Store 16 fill bytes + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_QW: // Next vector is the last; we're done. 
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + + beq cr1,Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li BL,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx v0,DBK,BL // LSU store word 1 of two or three + addi BL,BL,4 // IU1 increment index + + stvewx v0,DBK,BL // LSU store word 2 of two or three + addi BL,BL,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx v0,DBK,BL // LSU store word 3 of three if necessary + addi BL,BL,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx v0,DBK,BL // LSU store one halfword if necessary + addi BL,BL,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx v0,DBK,BL // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + + stvx v0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + stvx v0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + DCBK // LSU then Kill instead of RWITM + + stvx v0,DST,BK // LSU Store 16 bytes at D11 + addi BK,BK,16 // IU1 Increment byte count + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx v0,DST,BK // LSU Store 16 bytes at D12 + addi BK,BK,16 // IU1 Increment byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + b Last_QW // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + stvx v0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b Last_QW + +// End of memset in AltiVec + +#define BCz r4 // in bzero r4 enters with byte count + +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global bzero +bzero: +#else + .global vec_bzero +vec_bzero: +#endif + + mr BC,BCz // IU1 arg[2] is BC here, not FILL + li FILL,0 // IU1 for bzero FILL=0 +#ifdef LIBMOTOVEC + b memset +#else + b vec_memset +#endif + +// cacheable_memzero will employ dcbz to clear 32 bytes at a time +// of cacheable memory. Like bzero, second entering argument will be BC. +// Using this for non-cacheable memory will generate an alignment exception. 
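A usage sketch of that distinction follows; the prototypes mirror the header comments in this file and are assumptions, since the assembly defines only the entry points.

/* Assumed prototypes, following the header comments in this file. */
extern void *bzero(char *ptr, int len);
extern void *cacheable_memzero(char *ptr, int len);

/* cacheable_memzero relies on dcbz, which generates an alignment exception
 * on non-cacheable memory, so fall back to bzero when cacheability is not
 * guaranteed. */
void clear_region(char *ptr, int len, int is_cacheable)
{
    if (is_cacheable)
        cacheable_memzero(ptr, len);   /* dcbz path, 32 bytes per cache line */
    else
        bzero(ptr, len);
}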
+ + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global cacheable_memzero +cacheable_memzero: +#else + .global vec_cacheable_memzero +vec_cacheable_memzero: +#endif + + mr BC,BCz // IU1 arg[2] is BC here, not FILL + li FILL,0 // IU1 for bzero FILL=0 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count + + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + addi DR,DST,16 // IU1 Address of second dst vector + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,c_v_memset // b if BC>MIN_VEC + + mtctr BC // for (i=1;i<=BC;i++) + beqlr cr1 // return if BC = 0 +c_Byte_set: + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL + bdnz c_Byte_set + + blr + +c_v_memset: +// Byte count < MIN_VEC bytes will have been set by scalar code above, +// so this will not deal with small block sets < MIN_VEC. + +// For systems using VRSAVE, define VRSAV=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last dst byte + +#ifdef VRSAVE + oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine +#endif + subf D,DST,DR // IU1 How many bytes in first destination? + li BK,0 // IU1 Initialize byte kount index + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + vxor v0,v0,v0 // VIU Clear v0 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr1,0,D,16 // IU1 Is D0 left justified? + + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + beq cr1,c_Left_just // b if D0 is left justified + + bns cr7,c_No_B_fwd // b if only even number of bytes to store + + stvebx v0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +c_No_B_fwd: + bne cr7,c_No_H_fwd // b if only words to store + + stvehx v0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +c_No_H_fwd: + bng cr7,c_No_W1_fwd // b if exactly zero or two words to store + + stvewx v0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +c_No_W1_fwd: + bnl cr7,c_No_W2_fwd // b if there was only one word to store + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three + b c_No_W2_fwd + +c_Left_just: + stvx v0,0,DST // LSU Store 16 bytes at D0 +c_No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? + ble cr6,c_Last_QW // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 + +c_QW_loop: + stvx v0,DST,BK // LSU Store 16 fill bytes + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,c_QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,c_GT_4QW_fwd // b if >4 quad words left + +c_Last_QW: // Next vector is the last; we're done. 
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + + beq cr1,c_Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li BL,0 // IU1 Initialize index pointer + bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store + + stvewx v0,DBK,BL // LSU store word 1 of two or three + addi BL,BL,4 // IU1 increment index + + stvewx v0,DBK,BL // LSU store word 2 of two or three + addi BL,BL,4 // IU1 increment index +c_Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx v0,DBK,BL // LSU store word 3 of three if necessary + addi BL,BL,4 // IU1 increment index +c_Only_2W_fwd: + bne cr7,c_Only_B_fwd // b if there are no half words to store + + stvehx v0,DBK,BL // LSU store one halfword if necessary + addi BL,BL,2 // IU1 increment index +c_Only_B_fwd: + bns cr7,c_All_done_fwd // b if there are no bytes to store + + stvebx v0,DBK,BL // LSU store one byte if necessary + b c_All_done_fwd + +c_Rt_just_fwd: + + stvx v0,DST,BK // LSU Store 16 bytes at D14 +c_All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + stvx v0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + + bns cr6,c_B32_fwd // b if DST[27] == 0; i.e, final store is even + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + bdnz B32_fwd // decrement counter for last QW store odd + +c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + dcbz DST,BK // LSU zero whole cache line + bdz c_Nxt_loc_fwd // always decrement and branch to next instr + +c_Nxt_loc_fwd: + addi BK,BK,32 // IU1 Increment byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,c_One_even_QW // b if there is one even and one odd QW to store + b c_Last_QW // b if last store is to even address + +// Come here with two more loads and two stores to do +c_One_even_QW: + stvx v0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b c_Last_QW + +// End of cacheable_memzero in AltiVec diff --git a/liboil/motovec/vec_strcpy.S b/liboil/motovec/vec_strcpy.S new file mode 100644 index 0000000..c31beaa --- /dev/null +++ b/liboil/motovec/vec_strcpy.S @@ -0,0 +1,273 @@ +//------------------------------------------------------------------ +// file: vec_strcpy.S +// AltiVec enabled version of strcpy and strncpy +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. 
+// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern char *vec_strcpy(char *dest, const char *src); +// +// Returns: +// char *dest +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 03/22/02 +// Rev 0.1 Modified per vec_memcpy rev 0.30 Chuck Corley 05/24/03 +// + +// Harbison and Steele says "the results of both strcpy, strncpy, ... are +// unpredictable if the two string arguments overlap in memory." +// Since we do not know the address of the end of the string, copying +// from back to front is not an option. Therefore we always "copy forward." + +#define VRSV 256 // VRSAVE spr +// Use scalar for first MIN_SCALAR bytes. Overhead for vector is too great to win. +#define MIN_SCALAR 32 +// Also don't use vectors if |DST-SRC| <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 +#define PAGE_SIZE 4096 // True for G4 with AltiVec + +// Register useage: +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dst pointer; exiting: same dst pointer + +#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove + +#define ADD r5 // Temporary future dst address +#define PBC r5 // Computed Byte_Count to next 4K page src boundary + +#define DMS r6 // dst - src initially + +#define SMD r7 // src - dst initially + +#define DD r8 // duplicate of dst register for incementing + +#define QBC r9 // Computed Byte_Count to next QW dst boundary + +#define DS r10 // duplicate of src register for speculative incementing + +#define PSZ r11 // storage for page size constant + +#define RSV r12 // storage for VRSAVE register if used + +#define V0 v0 // all zeros + +#define VS0 v1 // src vector for permuting + +#define VS1 v2 // src vector for permuting + +#define VS2 v3 // src vector for permuting + +#define VP3 v4 // alignment permute register + +#define VPS0 v5 // permuted source vector to store + +#define VPS1 v6 // 2nd permuted source vector to store + +#define VCN v7 // null comparison result register + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcbz is a performance boost. 
+#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBA .long 0x7c0045ec +// dcba 0,r8 or dcba 0,DD +#else +#ifdef __ghs__ +.macro DCBA +.long 7c0045ec +.endm +#else +#define DCBA dcba 0,DD +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBA nop +#endif // NO_DCBA + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global strcpy +strcpy: +#else + .global vec_strcpy +vec_strcpy: +#endif + + + addi ADD,DST,32 // IU1 Next dst cacheline + subf. DMS,SRC,DST // IU1 Compute dst-src difference + subf SMD,DST,SRC // IU1 src-dst for use if dst-src<0 + + rlwinm ADD,ADD,0,0,26 // IU1 Round down to even QW + mr DD,DST // IU1 Duplicate dest + beqlr // return if DST = SRC + + bgt Pos_value // b if DST-SRC>0 + mr DMS,SMD // IU1 |dst - src| = src - dst +Pos_value: + subf. QBC,DST,ADD // IU1 Bytes to even QW start of vect (min 32) + addi ADD,DD,PAGE_SIZE // IU1 dst addr in next 4K page + cmpi cr7,0,DMS,MIN_VEC // IU1 Check for min byte count separation + + mtctr QBC // IU2 Init counter +Byte_loop: + lbzx Rt,0,SRC // LSU Get a byte + addi SRC,SRC,1 // IU1 Increment src + + cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null? + stbx Rt,0,DD // LSU Store it + addi DD,DD,1 // IU1 Increment dest + bdnzf 6,Byte_loop // b to get another if this one wasn't null + + beqlr cr1 // return if found a null + + li PSZ,PAGE_SIZE // IU1 Constant for potential use in vector + rlwinm ADD,ADD,0,0,19 // IU1 First address in next 4K page + mr DS,SRC // IU1 Get current src addr + ble cr7,Byte_loop // do by bytes forever if < MIN_VEC separation + +v_strcpy: +// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + subf. PBC,DD,ADD // IU1 Now bytes to next 4K page + +#ifdef VRSAVE + oris Rt,RSV,0xff00 // IU1 Or in registers used by this routine +#endif + rlwinm PBC,PBC,28,4,31 // IU1 Now QWs to next 4K page + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif +// Since DD has to be QW aligned at this point, we need three (or two +// if SRC[28:31]==0) source vectors to permute into two dest vectors. +// Loading beyond the end of the string should be okay as long as we don't +// cross a page boundary. + + lvsl VP3,0,SRC // LSU Create left permute vector + vxor V0,V0,V0 // VIU Clear v0 + ble New_page_0 // b if next load will cross page boundary + mtctr PBC // IU2 Okay to load up to next page +Page_0: + + lvx VS0,0,DS // LSU Get first src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_1 // b if next load will cross page boundary +Page_1: + + lvx VS1,0,DS // LSU Get second src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_2 // b if next load will cross page boundary +Page_2: + + lvx VS2,0,DS // LSU Get third src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_3 // b if next load will cross page boundary +Page_3: + + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + + vperm VPS1,VS1,VS2,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS2,VS2 // VIU1 Move upper vector to lower + + vcmpequb. VCN,V0,VPS0 // VIU1 Check for null + bne cr6,Final_0 // b if found a null in this permuted source vector + addi SRC,SRC,16 // IU1 Increment byte src pointer + + vcmpequb. 
VCN,V0,VPS1 // VIU1 Check for null + bne cr6,Final_1 // b if found a null in this permuted source vector + DCBA // LSU Conditionally dcba 0,DST + addi SRC,SRC,16 // IU1 Increment byte src pointer + + stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + + stvx VPS1,0,DD // LSU Store 16 bytes at dst addr D1 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + + b Page_1 + +Final_1: // Found a null in 2nd vector, store 1st vector then do bytes + stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + +Final_0: // Found a null in vector, load and store bytes to null instead + lbzx Rt,0,SRC // LSU Get a byte + addi SRC,SRC,1 // IU1 Increment src + + cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null? + stbx Rt,0,DD // LSU Store it + addi DD,DD,1 // IU1 Increment dest + + bne cr1,Final_0 // b to get another if this one wasn't null + +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr + +New_page_0: // Next load will be from new page; (ctr would have been <= zero) + mtctr PSZ // reinitialize counter + b Page_0 + +New_page_1: // Did VS0 contain any nulls? + vcmpequb. VCN,V0,VS0 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_1 + +New_page_2: // Did VS1 contain any nulls? + vcmpequb. VCN,V0,VS1 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_2 + +New_page_3: // Did VS2 contain any nulls? + vcmpequb. VCN,V0,VS2 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_3 + +// End of strcpy in AltiVec |
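One property the vector loop above depends on is worth spelling out: a 16-byte lvx may read past the string's terminator, which is only safe while the load cannot stray into the next, possibly unmapped, 4 KiB page; the counter therefore limits vector loads to the current page, and the terminator is re-checked at each page boundary before loading continues. A small sketch of that bookkeeping (quadwords_before_page_end is an illustrative helper, with PAGE_SIZE as defined above):

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096   /* matches the constant defined above (true for G4) */

/* Number of 16-byte loads that can start at p before crossing into the
 * next 4 KiB page (illustrative only, not part of the assembly). */
static size_t quadwords_before_page_end(const void *p)
{
    uintptr_t a = (uintptr_t)p;
    uintptr_t next_page = (a + PAGE_SIZE) & ~(uintptr_t)(PAGE_SIZE - 1);
    return (next_page - a) / 16;
}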