author     David Schleef <ds@schleef.org>   2005-06-17 21:51:58 +0000
committer  David Schleef <ds@schleef.org>   2005-06-17 21:51:58 +0000
commit     d64fd56082933579566d4bf45d3f421d3eba8392 (patch)
tree       ad4cbc3d76daa914252999ddac96ffcaee17a8e2
parent     f811d988ddc37ca0592dee4024629dded03aef9f (diff)
download   liboil-d64fd56082933579566d4bf45d3f421d3eba8392.tar.gz
* configure.ac: snarf LIBMOTOVEC because it has a compatible
  license.
* COPYING:
* liboil/Makefile.am:
* liboil/motovec/Makefile.am:
* liboil/motovec/README:
* liboil/motovec/checksum_vec.S:
* liboil/motovec/string_vec.S:
* liboil/motovec/vec_csum.S:
* liboil/motovec/vec_memcmp.S:
* liboil/motovec/vec_memcpy.S:
* liboil/motovec/vec_memset.S:
* liboil/motovec/vec_strcpy.S:
-rw-r--r--  COPYING                        |   77
-rw-r--r--  ChangeLog                      |   16
-rw-r--r--  configure.ac                   |    2
-rw-r--r--  liboil/Makefile.am             |    3
-rw-r--r--  liboil/motovec/Makefile.am     |   17
-rw-r--r--  liboil/motovec/README          |  345
-rw-r--r--  liboil/motovec/checksum_vec.S  |  627
-rw-r--r--  liboil/motovec/string_vec.S    | 1375
-rw-r--r--  liboil/motovec/vec_csum.S      |  724
-rw-r--r--  liboil/motovec/vec_memcmp.S    |  340
-rw-r--r--  liboil/motovec/vec_memcpy.S    |  876
-rw-r--r--  liboil/motovec/vec_memset.S    |  553
-rw-r--r--  liboil/motovec/vec_strcpy.S    |  273
13 files changed, 5206 insertions, 22 deletions
diff --git a/COPYING b/COPYING
index 36d4eca..ba09b76 100644
--- a/COPYING
+++ b/COPYING
@@ -1,23 +1,58 @@
-Copyright (c) David A. Schleef <ds@schleef.org>
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
+The majority of the source code and the collective work is subject
+to the following license:
+
+ Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org>
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+
+The source code in the liboil/motovec directory is subject to the
+following license:
+
+ Copyright Motorola, Inc. 2003
+ ALL RIGHTS RESERVED
+
+ You are hereby granted a copyright license to use, modify, and
+ distribute the SOFTWARE so long as this entire notice is retained
+ without alteration in any modified and/or redistributed versions,
+ and that such modified versions are clearly identified as such.
+ No licenses are granted by implication, estoppel or otherwise under
+ any patents or trademarks of Motorola, Inc.
+
+ The SOFTWARE is provided on an "AS IS" basis and without warranty.
+ To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+ ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+ WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+ PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+ REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+ THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+
+ To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+ MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+ (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+ BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+ INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+ INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+ for the maintenance and support of the SOFTWARE.
-THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
diff --git a/ChangeLog b/ChangeLog
index ce4a41e..76a2b64 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
2005-06-17 David Schleef <ds@schleef.org>
+ * configure.ac: snarf LIBMOTOVEC because it has a compatible
+ license.
+ * COPYING:
+ * liboil/Makefile.am:
+ * liboil/motovec/Makefile.am:
+ * liboil/motovec/README:
+ * liboil/motovec/checksum_vec.S:
+ * liboil/motovec/string_vec.S:
+ * liboil/motovec/vec_csum.S:
+ * liboil/motovec/vec_memcmp.S:
+ * liboil/motovec/vec_memcpy.S:
+ * liboil/motovec/vec_memset.S:
+ * liboil/motovec/vec_strcpy.S:
+
+2005-06-17 David Schleef <ds@schleef.org>
+
* liboil/colorspace/Makefile.am: new files
* liboil/colorspace/argb_paint.c: remove temporary classes
* liboil/colorspace/composite.c: new
diff --git a/configure.ac b/configure.ac
index 5a28079..1f43945 100644
--- a/configure.ac
+++ b/configure.ac
@@ -20,6 +20,7 @@ dnl - interfaces removed -> AGE = 0
LIBOIL_LIBVERSION="1:0:1"
AC_SUBST(LIBOIL_LIBVERSION)
AM_PROG_LIBTOOL
+AM_PROG_AS
AC_CONFIG_SRCDIR([liboil/liboil.h])
@@ -204,6 +205,7 @@ liboil/conv/Makefile
liboil/copy/Makefile
liboil/dct/Makefile
liboil/md5/Makefile
+liboil/motovec/Makefile
liboil/jpeg/Makefile
liboil/simdpack/Makefile
liboil/sse/Makefile
diff --git a/liboil/Makefile.am b/liboil/Makefile.am
index 5711500..e245e7c 100644
--- a/liboil/Makefile.am
+++ b/liboil/Makefile.am
@@ -1,7 +1,7 @@
pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil
-SUBDIRS = colorspace conv copy dct jpeg simdpack md5 utf8 sse
+SUBDIRS = colorspace conv copy dct jpeg md5 motovec simdpack sse utf8
lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la
@@ -27,6 +27,7 @@ liboilfunctions_la_LIBADD = \
dct/libdct.la \
jpeg/libjpeg.la \
md5/libmd5.la \
+ motovec/libmotovec.la \
simdpack/libsimdpack.la \
sse/libsse.la \
utf8/libutf8.la \
diff --git a/liboil/motovec/Makefile.am b/liboil/motovec/Makefile.am
new file mode 100644
index 0000000..a56fb98
--- /dev/null
+++ b/liboil/motovec/Makefile.am
@@ -0,0 +1,17 @@
+
+noinst_LTLIBRARIES = libmotovec.la
+
+c_sources =
+
+if HAVE_CPU_POWERPC
+powerpc_sources = \
+ vec_memcpy.S
+else
+powerpc_sources =
+endif
+
+libmotovec_la_SOURCES = \
+ $(powerpc_sources)
+libmotovec_la_LIBADD =
+libmotovec_la_CFLAGS = $(LIBOIL_CFLAGS)
+
diff --git a/liboil/motovec/README b/liboil/motovec/README
new file mode 100644
index 0000000..a458db4
--- /dev/null
+++ b/liboil/motovec/README
@@ -0,0 +1,345 @@
+//------------------------------------------------------------------
+// file: readme.txt
+// Readme to accompany libmotovec.a
+//------------------------------------------------------------------
+
+Rev 0.30 release - 5/28/2003 by Chuck Corley
+
+This release includes two new files, string_vec.S and checksum_vec.s,
+which you could paste into the Linux kernel files:
+/arch/ppc/lib/string.S and
+/arch/ppc/lib/checksum.S
+if you wanted to employ AltiVec in the Linux kernel. We used the
+memcpy_vec and csum_partial_copy_generic_vec functions from these
+files only in the modified versions of /net/core/skbuf.c and
+/net/core/iovec.c to give us the networking performance boost in
+Linux described in the SNDF presentation "Accelerating Networking Data
+Movement Using the AltiVec® Technology" at www.motorola.com/sndf under
+Dallas-2003/Host Processors (H1110). Also see the white paper
+"Enhanced TCP/IP Performance with AltiVec Technology" at
+e-www.motorola.com/brdata/PDFDB/docs/ALTIVECTCPIPWP.pdf
+
+These files contain the following functions
+string.S contains: string_vec.S contains:
+memcpy memcpy_vec
+bcopy bcopy_vec
+memmove memmove_vec
+backwards_memcpy backwards_memcpy_vec
+memset memset_vec
+memcmp memcmp_vec
+memchr (coming soon)
+cacheable_memcpy cacheable_memcpy_vec
+cacheable_memzero cacheable_memzero_vec
+strcpy strcpy_vec
+strncpy (coming soon)
+strcat (coming soon)
+strcmp strcmp_vec
+strlen strlen_vec
+__copy_tofrom_user* __copy_tofrom_user_vec*
+__clear_user* __clear_user_vec*
+__strncpy_from_user* (coming soon)
+__strnlen_user* (coming soon)
+
+checksum.S contains: checksum_vec.S contains:
+csum_partial csum_partial_vec
+csum_partial_copy_generic* csum_partial_copy_generic_vec
+ip_fast_csum (unlikely to benefit)
+csum_tcpudp_magic (unlikely to benefit)
+
+*these functions have ex_table entries for handling memory access
+exceptions in the kernel. The AltiVec versions were functionally
+tested by hand.
+
+csum_partial_copy_generic_vec and csum_partial_vec previously
+assembled into libmotovec.a have been removed since they are in the file
+above. We are finding that selective use of the *_vec functions in
+the OS kernel is much "safer" than wholescale replacement of the libc
+library. libmotovec.a returns to being exclusively a performance-enhancing
+library of libc functions that can be safely linked with user application
+code to test the performance of AltiVec.
+
+My presentation for SNDF-Europe includes performance comparisons
+of the scalar versus vector versions of the above functions. It should
+be available on the SNDF website soon. It also includes an updated
+explanation of memcpy without the potential incoherency problem discussed
+below.
+
+So this release contains in libmotovec.a:
+memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+memset.o from vec_memset.S Rev 0.10 dated 5/01/2003
+bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003
+strcmp.o from vec_strcmp.S Rev 0.00 dated 3/03/2002
+strlen.o from vec_strlen.S Rev 0.00 dated 12/26/2002
+
+And in string.s:
+memcpy_vec derived from vec_memcpy.S Rev 0.30 dated 4/02/2003
+bcopy_vec derived from vec_memcpy.S Rev 0.30
+memmove_vec derived from vec_memcpy.S Rev 0.30
+backwards_memcpy_vec derived from vec_memcpy.S Rev 0.30
+memset_vec derived from vec_memset.S Rev 0.10 dated 5/01/2003
+memcmp_vec derived from vec_memcmp.S Rev 0.00
+memchr (coming soon)
+cacheable_memcpy_vec derived from vec_memcpy.S Rev 0.30
+cacheable_memzero_vec derived from vec_memset.S Rev 0.10
+strcpy_vec derived from vec_strcpy.S Rev 0.10
+strncpy_vec (coming soon)
+strcat_vec (coming soon)
+strcmp_vec derived from vec_strcmp.S Rev 0.00 (not released)
+strlen_vec derived from vec_strlen.S Rev 0.00 (not released)
+__copy_tofrom_user_vec* derived from vec_memcpy.S Rev 0.30
+__clear_user_vec* derived from vec_memcpy.S Rev 0.30
+__strncpy_from_user_vec* (coming soon)
+__strnlen_user_vec* (coming soon)
+*with ex_table and exception code
+
+And in checksum.s:
+csum_partial_vec derived from vec_csum.S Rev 0.0 dated 4/19/03
+csum_partial_copy_generic_vec from vec_csum.S Rev 0.0
+
+string_vec.S and checksum_vec.S are only known to assemble with gcc 2.95
+and gcc 3.3+. They should work with other gcc versions but may need
+editing to be compatible with non-gcc compilers.
+
+Rev 0.20 release - 5/12/2003 by Chuck Corley
+
+Thanks to all of you who attended SNDF. My presentation "Implementing
+and Using the Motorola AltiVec Libraries" is available for downloading
+at www.motorola.com/sndf under Dallas-2003/Host Processors (H1109).
+
+During the presentation DS from Lucent pointed out that the way I was
+bringing the beginning and ending destination Quad Words (vectors) into
+the registers for merging with the permuted source made the
+"uninvolved" destination bytes vulnerable to potential incoherency if
+some interrupting process changed those bytes while I was holding them
+in a register. While the possibility seemed small, I have rewritten the
+code to avoid this potential problem. The result actually is slightly
+faster than the original for small buffers.
+
+So this release contains:
+memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003
+memset.o from vec_memset.S Rev 0.10 dated 5/01/2003
+bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003
+csum_partial_copy_generic_vec from vec_csum.S Rev 0.0 dated 4/19/03
+csum_partial_vec from vec_csum.S Rev 0.0 dated 4/19/03
+
+The latter two additions were assembled into libmotovec.a despite the
+fact they are not standard libc functions. Rather they are the Altivec
+enabled equivalents of functions by the same name from the linux
+source tree (Linux 2.4.17). While we are pursuing how to get these
+functions incorporated into Linux, here they are assembled and in
+source form if you are building your own version of linux. The use
+of an earlier version of csum_partial_copy_generic_vec and memcpy_vec is
+documented to speed up TCP/IP and UDP transfers in Jacob Pan's SNDF
+presentation "Accelerating Networking Data Movement Using AltiVec
+Technology" (H1110) available at the website above. csum_partial
+does not appear to be called with large enough buffer sizes in linux
+to warrant using the vectorized version.
+
+I am also releasing the source for memset and bzero in this release.
+strcpy, strlen, strncpy, strcmp, memcmp, strcat, and memchr are still
+on my list to do - soon.
+
+Rev 0.10 release - 3/13/2003 by Chuck Corley
+
+The presence of dcbz in the 32 byte loop of memcpy (or memmove)
+causes an alignment exception to non-cacheable memory (MPC7410 User's
+Manual p. 4-20 and MPC7450 User's Manual p. 4-25) so it was
+removed in this release. dcbz instructions were not present in
+memset in any of these releases. That fixed the alignment problem
+but hurt the performance some; then it was "rediscovered" that
+dcba would have been a better choice anyway as it does not cause
+an exception; it would just be noop'ed. So this release substitutes
+dcba for dcbz.
+
+This release contains improvements in memcpy that should eventually be
+documented in an application note (still not finished) but that are
+already documented fairly thoroughly in SNDF presentation H1109.
+
+The memcpy was further loop unrolled to provide a 128B loop for
+large buffers (>256 bytes) and the data stream touch instruction
+was added. It may still be possible to improve the tuning of
+the dst instruction, particularly in memmove, but this release
+is worthy of reving the number to the next significant revision.
+
+I've developed a new metric which will be explained at SNDF in
+Dallas, TX, March 23-26, 2003. As the number of bytes in a
+buffer gets larger, the memcpy routine settles into repetitions
+of the inner loop. 32 bytes were moved in the inner loop of
+Rev 0.0x and 128 bytes are moved in the inner loop of Rev 0.10.
+And the number of processor clocks per inner loop can be shown
+to approach the minimum possible. Therefore the new metric
+measures the incremental transfer rate for the inner loop after
+a reasonable number (>512) of bytes have been moved. This is not
+the overall bytes-per-second figure, because the start-up transfers
+are less efficient, but it is the transfer rate that the routine
+asymptotically approaches as the buffer gets big (regularly tested
+to 1460 bytes).
+
+Here is that metric for several cases:
+
+Case 1: For gcc's lib c memcpy when buffers are not word aligned
+Case 2: For gcc's lib c memcpy when buffers are word aligned
+Case 3: For Rev 0.01 of memcpy with Altivec irrespective of alignment
+Case 4: For Rev 0.10 of memcpy with Altivec irrespective of alignment
+
+Numbers are provided for the cold DCache and warm DCache. Code is
+assumed to always be resident in the ICache as would be expected here
+where the inner loop has run multiple times.
+
+ COLD DCACHE WARM DCACHE
+ FOR THE MPC7410@400/100 Insts Clks MB/Sec Insts Clks MB/Sec
+Case 1: gcc_NWA (1 byte/loop) 6 6 71 6 3 133
+Case 2: gcc_WA (16 B/loop) 12 62 103 12 8 800
+Case 3: vec_memcpy Rev 0.01 12 60 213 12 7 1961
+Case 4: vec_memcpy Rev 0.10 46 125 410 46 41 1250
+
+
+ COLD DCACHE WARM DCACHE
+ FOR THE MPC7445@1GHz/133 Insts Clks MB/Sec Insts Clks MB/Sec
+Case 1: gcc_NWA 6 8 122 6 3 350
+Case 2: gcc_WA 12 104 153 12 12 1333
+Case 3: vec_memcpy Rev 0.01 12 110 292 12 7 4413
+Case 4: vec_memcpy Rev 0.10 46 247 518 46 35 3666
+
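+(As a rough cross-check of how these numbers relate -- this is not from
+the original app note -- the MB/Sec column appears to be bytes moved per
+inner loop divided by clocks per loop, times the core clock. For Case 4
+on the MPC7410 at 400 MHz: 128 B / 125 clks * 400 MHz = ~410 MB/Sec cold,
+and 128 B / 41 clks * 400 MHz = ~1250 MB/Sec warm, matching the table.
+Case 2 (16 B/loop) works out the same way.)
+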
+Perhaps you notice that we are trading off Warm DCache performance to
+improve the Cold DCache case. There are other interesting tradeoffs
+in going from 32 byte inner loop to 128 bytes. And in using the dcba
+instruction - or not. In other words, the numbers for vec_memcpy above
+are not the highest possible in the Warm DCache case but they look like
+a good compromise which most benefits the Cold DCache case. More at SNDF
+(or eventually in the app note) ...
+
+I am releasing the source code to vec_memcpy.S with this release so
+if you don't like the tradeoff above you can make your own selection. It
+successfully assembles for me with Codewarrior, Diab, Green Hills, gcc,
+and Metaware. It is nicely commented but could use more documentation.
+I will specifically be explaining it in SNDF presentation H1109.
+
+*************************************************************************
+
+Rev 0.01 release - 2/17/2003 by Chuck Corley
+
+Fixed a problem at Last_ld_fwd: that caused a load beyond a page
+boundary and a resulting segmentation fault in Linux. Last source load
+of SRC+BK in vec_memcpy could be > SRC+BC-1. Also found and fixed
+an error where the Quick and Dirty (QND) code that was in there for
+dst wasn't completely commented out. Plan to enable dst soon.
+Probably loop unroll to 128 bytes first though.
+
+**********************************************************************
+
+Initial Release - 2/10/2003 by Chuck Corley
+
+Contains the libc functions:
+memcpy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+bcopy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memmove.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memset.o from vec_memset.S Rev 0.0 dated 2/09/2003
+bzero.o from vec_memset.S Rev 0.0 dated 2/09/2003
+
+These functions are implemented in AltiVec but are still not as fast
+as we know how to make them. Watch this site for frequent revisions
+over the next several months.
+
+We are in the process of creating application notes to explain the
+source code and the performance associated with these library functions;
+watch this site for those application notes to be added. A logical
+deadline for completion of this work is the Smart Network Developers
+Forum in Dallas, TX, March 23-26, 2003, where we will be discussing this
+library, its performance, and application.
+
+We will also be adding the following libc functions in the very near future:
+strcpy
+strcmp
+strlen
+memcmp
+memchr
+strncpy
+
+We also have preliminary work completed on the following functions
+found in Linux and have to figure out how to distribute them:
+csum_partial
+csum_partial_generic
+__copy_tofrom_user
+page_copy
+
+We believe that these libraries will improve performance on Motorola G4
+processors for applications that make heavy use of the included functions.
+On non-G4 microprocessors they will cause illegal operation exceptions
+because those processors do not support AltiVec.
+
+To use this library, you must:
+1. Include it on the linker command line prior to the compiler's libc
+library.
+
+Examples:
+For gcc:
+powerpc-eabisim-ld -T../../spprt/gcc_dink.script -Qy -dn -Bstatic ../../spprt/gcc_obj/gcc_crt0.o ../../spprt/gcc_obj/dtime.o ../../spprt/gcc_obj/cache.o ../../spprt/gcc_obj/Support.o ../../spprt/gcc_obj/dinkusr.o ../../spprt/gcc_obj/perfmon.o gcc_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a c:/cygwin/Altivec/powerpc-eabisim\lib\libm.a --start-group -lsim -lc --end-group -o gccBM.elf
+
+For Diab:
+dld ../../spprt/diab_dink.dld ../../spprt/diab_obj/diab_crt0.o ../../spprt/diab_obj/dtime.o ../../spprt/diab_obj/cache.o ../../spprt/diab_obj/Support.o ../../spprt/diab_obj/dinkusr.o ../../spprt/diab_obj/perfmon.o diab_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/diab/5.0.3/PPCEH:c:/diab/5.0.3/PPCE/simple:c:/diab/5.0.3/PPCE:c:/diab/5.0.3/PPCEN -lc -lm -o diabBM.elf
+
+For Green Hills:
+elxr -T../../spprt/ghs_dink.lnk ../../spprt/ghs_obj/ghs_crt0.o ../../spprt/ghs_obj/dtime.o ../../spprt/ghs_obj/cache.o ../../spprt/ghs_obj/Support.o ../../spprt/ghs_obj/dinkusr.o ../../spprt/ghs_obj/perfmon.o ghs_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:\GHS\ppc36\ppc -lansi -lsys -larch -lind -o ghsBM.elf
+
+For CodeWarrior:
+mwldeppc -lcf ../../spprt/cw_dink.lcf -nostdlib -fp fmadd -proc 7450 ../../spprt/cw_obj/cw_crt0.o ../../spprt/cw_obj/dtime.o ../../spprt/cw_obj/cache.o ../../spprt/cw_obj/Support.o ../../spprt/cw_obj/dinkusr.o ../../spprt/cw_obj/perfmon.o cw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Runtime/Lib/ -lRuntime.PPCEABI.H.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Msl/MSL_C/Ppc_eabi/Lib/ -lMSL_C.PPCEABI.bare.H.a -o cwBM.elf
+
+For Metaware:
+ldppc ../../spprt/mw_link.txt -Bnoheader -Bhardalign -dn -q -Qn ../../spprt/mw_obj/mw_crt0.o ../../spprt/mw_obj/dtime.o ../../spprt/mw_obj/cache.o ../../spprt/mw_obj/Support.o ../../spprt/mw_obj/dinkusr.o ../../spprt/mw_obj/perfmon.o mw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/hcppc/lib/be/fp -lct -lmwt -o mwBM.elf
+
+
+2. Enable AltiVec in the Machine State Register (MSR) of the
+target machine.
+
+Example:
+AltiVec_enable:
+ mfmsr r4 // Get current MSR contents
+ oris r4,r4,0x0200 // Set the AltiVec enable bit MSR[6]
+ mtmsr r4 // Write to MSR
+ isync // Context synchronizing instr after mtmsr
+
+
+3. If the AltiVec vector register set is used in more than one context,
+the AltiVec registers must be saved and restored on context switches. The
+AltiVec EABI extensions define a register (SPR 256 - the VRSAVE register)
+which can be used to reduce the number of vector registers which have to
+be saved to only those in use. This library is currently compiled
+without that VRSAVE feature enabled, so all 32 vector registers will have
+to be saved and restored. We are currently thinking that this is a more
+efficient practice anyway and note that Linux and several RTOSes are taking
+that approach in saving and restoring the vector registers. We have observed
+very little performance difference in Linux for saving all of the AltiVec
+registers on a context switch versus saving only 8. And saving all of the
+registers has less than a 1% total impact on performance.
+
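+(For reference only -- this library does not maintain VRSAVE. Code that
+does follow the VRSAVE convention would mark its live vector registers
+with something like the following, here assuming only v0-v3 are in use:
+
+VRSAVE_mark:
+   mfspr  r4,256        // Read current VRSAVE (SPR 256)
+   oris   r4,r4,0xf000  // Set bits 0-3: vector registers v0-v3 in use
+   mtspr  256,r4        // Write back so only those registers need saving
+
+Bit 0 of VRSAVE corresponds to v0 and bit 31 to v31.)
+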
+4. There is one worrisome problem with this library when run on the MPC745X
+microprocessors in the 60x bus mode. The MPC7450 Family User's Manual
+(Section 7.3) states that "The 60x bus protocol does not support a 16-byte
+bus transaction. Therefore, cache-inhibited AltiVec loads, stores, and
+write-through stores take an alignment exception. This requires a re-write
+of the alignment exception routines in software that supports AltiVec quad
+word access in 60x bus mode on the MPC745X."
+
+This says that if the user is attempting to use these routines in a
+cache-inhibited area of memory on a MPC745X in 60x bus mode, it will require
+special alignment exception handling software. We are currently implementing
+that software for the Linux OS. Alternatively, the user can restrict this
+library's use to areas of memory known to be cacheable.
+
+This library was built using gcc, but as shown in the examples of step 1 above,
+links and executes with Diab5.0, Green Hills 3.6, Codewarrior EPPC 6.1, and
+Metaware 4.5. The gcc archiver was used to create it in the following
+command lines:
+
+powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memcpy.o -D__GNUC__ -DLIBMOTOVEC ../vec_memcpy/Source/vec_memcpy.S -o gcc_obj/vec_memcpy.o
+
+powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memset.o -D__GNUC__ -DLIBMOTOVEC ../vec_memset/source/vec_memset.S -o gcc_obj/vec_memset.o
+
+powerpc-eabisim-ar -ru libmotovec.a gcc_obj/vec_memcpy.o gcc_obj/vec_memset.o
+
+Email questions or suggestions to risc10@email.sps.mot.com
diff --git a/liboil/motovec/checksum_vec.S b/liboil/motovec/checksum_vec.S
new file mode 100644
index 0000000..c5efe25
--- /dev/null
+++ b/liboil/motovec/checksum_vec.S
@@ -0,0 +1,627 @@
+/*
+ * AltiVec versions (*_vec) of equivalent Linux library functions
+ * found in /arch/ppc/lib/checksum.S from Linux 2.4.17. Suggest this
+ * file be appended to that one when building a Linux kernel that
+ * will employ these functions.
+ *
+ * Copyright (C) Motorola, Inc. 2003
+ *
+ * Revision history:
+ * Rev 0.0 Original Chuck Corley 5/28/03
+ * Contact at risc10@motorola.com
+ * Commented source code for Altivec version available at
+ * www.motorola.com/altivec
+ */
+
+#ifndef TEST_OUTSIDE_LINUX
+#include <linux/sys.h>
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include "../kernel/ppc_asm.tmpl"
+#if 0
+#define v0 vr0
+#define v1 vr1
+#define v2 vr2
+#define v3 vr3
+#define v4 vr4
+#define v5 vr5
+#define v6 vr6
+#define v7 vr7
+#define v8 vr8
+#define v9 vr9
+#define v10 vr10
+#define v11 vr11
+#define v12 vr12
+#define v13 vr13
+#define v14 vr14
+#define v15 vr15
+#endif
+#else
+#define EFAULT 0
+#endif
+
+ .text
+
+/*
+ * AltiVec versions of selected functions for use on AltiVec
+ * enabled G4 and later microprocessors.
+ */
+#if defined(__GNUC__) || defined(__MWERKS__) // gcc and codewarrior don't assemble dcba
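+// 0x7c0465ec is the X-form encoding of dcba r4,r12:
+//   0x7C000000 | (rA << 16) | (rB << 11) | (758 << 1)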
+#define DCBAR4R12 .long 0x7c0465ec
+#else
+#define DCBAR4R12 dcba r4,r12
+#endif
+
+ .text
+ .align 4
+#ifndef TEST_OUTSIDE_LINUX
+_GLOBAL(csum_partial_copy_generic_vec)
+#else
+#if __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+ .global csum_partial_copy_generic_vec
+csum_partial_copy_generic_vec:
+#endif
+ li r12,32
+ rlwinm r0,r5,31,1,31
+ cmpi cr7,0,r5,48
+ dcbt r3,r12
+ cmpi cr6,0,r0,0
+ addic r6,r6,0
+ addi r11,r3,-2
+ add r10,r4 ,r5
+ bgt cr7,4f
+ andi. r12,r5,1
+ addi r9,r4,-2
+ add r12,r3,r5
+ beq cr6,2f
+ mtctr r0
+1: lhzu r0,2(r11)
+204: sthu r0,2(r9)
+ addc r6,r6,r0
+ bdnz 1b
+2: beq 3f
+201: lbz r0,-1(r12 )
+202: stb r0,-1(r10)
+ rlwinm r0,r0,8,16,23
+ addc r6,r6,r0
+3: addze r3,r6
+ blr
+4: lvsr v5,0,r4
+ rlwinm r9,r4,0,28,31
+ rlwinm r12,r3,0,28,31
+ lvsr v7,r4,r5
+ subf. r12,r12,r9
+ subf r12,r3,r4
+ lvsr v6,0,r12
+ li r12,64
+ vxor v0,v0,v0
+ dcbt r3,r12
+ cmpi cr1,0,r9,0
+ vnor v1,v0,v0
+ addi r9,r4,16
+ addi r10,r10,-1
+ vperm v5,v1,v0,v5
+ bge 5f
+401: lvx v2,0,r3
+ addi r3,r3,16
+5: lvx v3,0,r3
+ rlwinm r9,r9,0,0,27
+ vperm v1,v0,v1,v7
+ subf r11,r9,r10
+ vxor v7,v7,v7
+ vxor v11,v11,v11
+ rlwinm r11,r11,28,4,31
+ rlwinm r0,r10,0,28,31
+ li r12,96
+ cmpi cr5,0,r0,0xF
+ subf r0,r4,r9
+ mtctr r11
+ cmpi cr6,0,r11,4
+ mtcrf 0x01,r0
+ vperm v4,v2,v3,v6
+ vor v2,v3,v3
+ dcbt r3,r12
+ beq cr1,9f
+ li r12,0
+ vsel v4,v4,v0,v5
+ bns cr7,6f
+502: stvebx v4,r4,r12
+ addi r12,r12,1
+6: bne cr7,7f
+602: stvehx v4,r4,r12
+ addi r12,r12,2
+7: bng cr7,8f
+702: stvewx v4,r4,r12
+ addi r12,r12,4
+8: bnl cr7,10f
+802: stvewx v4,r4,r12
+ addi r12,r12,4
+804: stvewx v4,r4,r12
+ b 10f
+9: stvx v4,0,r4
+10: vxor v8,v8,v8
+ li r12,16
+11: lvx v3,r3,r12
+ vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ vperm v4,v2,v3,v6
+ vor v2,v3,v3
+112: stvx v4,r4,r12
+ vadduwm v11,v9,v11
+ addi r12,r12,16
+ bdnzf 25,11b
+ add r9,r4,r12
+ addi r11,r11,-1
+ bgt cr6,19f
+12: add r10,r4,r5
+ add r11,r3,r5
+ bge 13f
+ addi r11,r11,-16
+13: mtcrf 0x01,r10
+ addi r0,r11,-1
+131: lvx v3,0,r0
+ vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ vadduwm v11,v9,v11
+ vperm v4,v2,v3,v6
+ beq cr5,17f
+ vsel v4,v4,v0,v1
+ rlwinm r10,r10,0,0,27
+ li r9,0
+ bnl cr7,14f
+132: stvewx v4,r10,r9
+ addi r9,r9,4
+134: stvewx v4,r10,r9
+ addi r9,r9,4
+14: bng cr7,15f
+142: stvewx v4,r10,r9
+ addi r9,r9,4
+15: bne cr7,16f
+152: stvehx v4,r10,r9
+ addi r9,r9,2
+16: bns cr7,18f
+162: stvebx v4,r10,r9
+ b 18f
+17: stvx v4,r4,r12
+18: vaddcuw v9,v4,v7
+ vadduwm v12,v4,v7
+ vaddcuw v10,v12,v8
+ vadduwm v8,v12,v8
+ vadduwm v9,v9,v10
+500: vmrglh v2,v0,v8
+ vadduwm v11,v9,v11
+ vmrghh v3,v0,v8
+ rlwinm r10,r1,0,0,27
+ vsumsws v0,v11,v0
+ vadduwm v8,v2,v3
+ li r12,-16
+ vsumsws v8,v8,v0
+182: stvx v8,r10,r12
+183: lwz r3,-4(r10)
+ addc r3,r3,r6
+ addze r3,r3
+ blr
+19: lvx v3,r3,r12
+ addi r11,r11,-1
+ vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ mtcrf 0x02,r9
+ addi r9,r9,16
+ addi r0,r11,-2
+ vperm v4,v2,v3,v6
+ vor v2,v3,v3
+192: stvx v4,r4,r12
+ addi r12,r12,16
+ vadduwm v11,v9,v11
+ bdnzf 27,19b
+ mtcrf 0x02,r10
+ addi r11,r3,96
+ addi r9,r12,16
+ bns cr6,20f
+ bdnz 20f
+20: lvx v3,r3,r12
+ addi r11,r11,32
+ vaddcuw v9,v4,v7
+201: lvx v5,r3,r9
+ vadduwm v12,v4,v7
+ dcbt 0,r11
+ vaddcuw v10,v12,v8
+ DCBAR4R12
+ vadduwm v8,v12,v8
+ vperm v7,v2,v3,v6
+202: stvx v7,r4,r12
+ vperm v4,v3,v5,v6
+ vadduwm v9,v9,v10
+ bdz 21f
+21: stvx v4,r4,r9
+ vor v2,v5,v5
+ vadduwm v11,v9,v11
+ addi r12,r9,16
+ addi r9,r12,16
+ bdnz 20b
+ bso cr6,22f
+ b 12b
+22: lvx v3,r3,r12
+ vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ vadduwm v11,v9,v11
+ vperm v4,v2,v3,v6
+ vor v2,v3,v3
+222: stvx v4,r4,r12
+ addi r12,r12,16
+ b 12b
+
+/* Intent of this exception table is to store -EFAULT to *src_err
+ * or *dst_err respectively, and (for an error on src) zero the rest
+ * of dst. Return checksum for only those bytes stored before error.
+ * (Can't quite figure out how this return value is used since there
+ * is no way to restart from the point of error. So I'll only return
+ * the checksum for actual buffer as stored in memory. Doesn't look
+ * like scalar version adds in bytes loaded but not stored.)
+ *
+ * Register usage here:
+ * r3 = src, return checksum
+ * r4 = dst
+ * r5 = (preserve as total byte count til near end)
+ * r6 = entering partial sum; accumulator for scalar result
+ * r7 = src_err
+ * r8 = dst_err
+ * r9 = bytes not copied
+ * r10= dst + byte count
+ * r11= number of quad words (vectors)
+ * r12= Byte Kount index
+ */
+
+/* read fault, initial half-word copy */
+100: li r0,0
+ sthu r0,2(r9) /* Zero rest of buffer */
+ cmpi 0,r7,0
+ beq 104f /* Go return checksum */
+ li r0,-EFAULT
+ stw r0,0(r7)
+ b 104f
+
+/* write fault, initial half-word copy */
+101: cmpi 0,r8,0
+ beq 104f
+ li r0,-EFAULT
+ stw r0,0(r8)
+ b 104f
+
+/* read fault, final single-byte copy */
+102: li r0,0
+ stb r0,-1(r10) /* Zero remaining byte */
+ cmpi 0,r7,0
+ beq 104f
+ li r0,-EFAULT
+ stw r0,0(r7)
+ b 104f
+
+/* write fault, final single-byte copy */
+103: cmpi 0,r8,0
+ beq 104f
+ li r0,-EFAULT
+ stw r0,0(r8)
+104: addze r3,r6
+ blr
+
+/* read fault, 1st and 2nd vector load */
+105: cmpi 0,r7,0
+ beq 155f
+ li r0,-EFAULT
+ stw r0,0(r7)
+155: rlwinm r0,r5,31,1,31
+ andi. r12,r5,1
+ mtctr r0
+ addi r9,r4,-2
+ li r0,0
+106: sthu r0,2(r9)
+ bdnz 106b
+ beq 107f
+ stb r0,2(r9)
+107: addze r3,r6
+ blr
+
+/* write fault, initial vector store(s) (Nothing stored yet) */
+108: cmpi 0,r8,0
+ beq 109f
+ li r0,-EFAULT
+ stw r0,0(r8)
+109: addze r3,r6
+ blr
+
+/* read fault, load in 16B loop or final load */
+110: cmpi 0,r7,0
+ beq 156f
+ li r0,-EFAULT
+ stw r0,0(r7)
+156: add r11,r4,r5 /* Last dst byte + 1 */
+ add r4,r4,r12 /* Current dst byte */
+ rlwinm r4,r4,0,0,27 /* Rounded down */
+ subf r5,r4,r11
+ rlwinm. r0,r5,31,1,31
+ addi r9,r4,-2
+ cmpi 1,r0,0
+ beq cr1,157f
+ mtctr r0
+ li r0,0
+111: sthu r0,2(r9)
+ bdnz 111b
+157: andi. r12,r5,1
+ beq 18b
+ li r0,0
+ stb r0,2(r9)
+ vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ vxor v11,v11,v11
+ b 500b /* Go sum across vector checksum */
+
+/* write fault, store in 16B loop */
+1120: cmpi 0,r8,0
+ beq 113f
+ li r0,-EFAULT
+ stw r0,0(r8)
+113: b 500b
+
+/* write fault, final partial store(s) */
+
+114: cmpi 0,r8,0
+ vxor v11,v11,v11
+ beq 115f
+ li r0,-EFAULT
+ stw r0,0(r8)
+115: b 500b
+
+/* write fault, 1st store in 32B loop */
+116: cmpi 0,r8,0
+ vadduwm v9,v9,v10
+ beq 117f
+ li r0,-EFAULT
+ stw r0,0(r8)
+117: b 500b
+
+/* write fault, 2nd store in 32B loop */
+118: cmpi 0,r8,0
+ vxor v4,v4,v4
+ vadduwm v11,v9,v11
+ beq 119f
+ li r0,-EFAULT
+ stw r0,0(r8)
+119: b 18b
+
+/* read fault, next to final load */
+120: cmpi 0,r7,0
+ beq 121f
+ li r0,-EFAULT
+ stw r0,0(r7)
+121: add r11,r4,r5
+ add r4,r4,r12
+ rlwinm r4,r4,0,0,27
+ subf r5,r4,r11
+ rlwinm. r0,r5,31,1,31
+ addi r9,r4,-2
+ cmpi 1,r0,0
+ beq cr1,123f
+ mtctr r0
+ li r0,0
+122: sthu r0,2(r9)
+ bdnz 122b
+123: andi. r12,r5,1
+ beq 124f
+ li r0,0
+ stb r0,2(r9)
+124: vaddcuw v9,v4,v8
+ vadduwm v8,v4,v8
+ vadduwm v11,v9,v11
+ vxor v4,v4,v4
+ b 18b
+
+/* write fault, 1st store in 32B loop */
+125: cmpi 0,r8,0
+ vxor v4,v4,v4
+ beq 126f
+ li r0,-EFAULT
+ stw r0,0(r8)
+126: b 18b
+
+/* write or read fault in push/pop from stack. csumcpy complete. */
+
+127: vxor v0,v0,v0
+ vspltisw v2,1
+ lis r5,0x8000
+ vnor v1,v0,v0
+ vmrglh v8,v0,v8
+ li r10,17
+ vsldoi v3,v0,v1,4
+ li r3,0
+ mtctr r10
+ vsumsws v8,v8,v0
+ vand v4,v2,v3
+128: vand v5,v8,v4
+ rlwinm r5,r5,1,0,31
+ vcmpequw. v6,v5,v4
+ vsl v4,v4,v2
+ bnl cr6,129f
+ or r3,r3,r5
+129: bdnz 128b
+ addc r3,r3,r6
+ addze r3,r3
+ blr
+
+#ifndef TEST_OUTSIDE_LINUX
+ .section __ex_table,"a"
+ .align 2
+ .long 1b,100b
+ .long 204b,101b
+ .long 201b,102b
+ .long 202b,103b
+ .long 401b,105b
+ .long 5b,105b
+ .long 502b,108b
+ .long 602b,108b
+ .long 702b,108b
+ .long 802b,108b
+ .long 804b,108b
+ .long 9b,108b
+ .long 11b,110b
+ .long 112b,1120b
+ .long 131b,110b
+ .long 132b,114b
+ .long 134b,114b
+ .long 142b,114b
+ .long 152b,114b
+ .long 162b,114b
+ .long 17b,114b
+ .long 182b,127b
+ .long 183b,127b
+ .long 19b,110b
+ .long 192b,112b
+ .long 20b,110b
+ .long 201b,110b
+ .long 202b,116b
+ .long 21b,118b
+ .long 22b,120b
+ .long 222b,125b
+#endif
+
+ .text
+#ifndef TEST_OUTSIDE_LINUX
+_GLOBAL(csum_partial_vec)
+#else
+#if __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+ .global csum_partial_vec
+csum_partial_vec:
+#endif
+
+ li r12,32
+ rlwinm r0,r4,31,1,31
+ cmpi cr7,0,r4,48
+ dcbt r3,r12
+ cmpi cr6,0,r0,0
+ addic r5,r5,0
+ addi r11,r3,-2
+ add r10,r3,r4
+ bgt cr7,4f
+ andi. r12,r4,1
+ beq cr6,2f
+ mtctr r0
+1: lhzu r0,2(r11)
+ addc r5,r5,r0
+ bdnz 1b
+2: beq 3f
+ lbz r0,-1(r10)
+ rlwinm r0,r0,8,16,23
+ addc r5,r5,r0
+3: addze r3,r5
+ blr
+4: lvsr v5,0,r3
+ addi r9,r3,16
+ li r12,64
+ lvsr v7,r3,r4
+ rlwinm r9,r9,0,0,27
+ addi r10,r10,-1
+ lvx v2,0,r3
+ subf r11,r9,r10
+ vxor v0,v0,v0
+ dcbt r3,r12
+ rlwinm r11,r11,28,4,31
+ vnor v1,v0,v0
+ mtctr r11
+ vxor v11,v11,v11
+ vperm v5,v1,v0,v5
+ cmpi cr6,0,r11,4
+ vxor v8,v8,v8
+ vperm v1,v0,v1,v7
+ li r12,16
+ vsel v2,v2,v0,v5
+5: lvx v3,r3,r12
+ vaddcuw v9,v2,v8
+ vadduwm v8,v2,v8
+ vadduwm v11,v9,v11
+ addi r12,r12,16
+ vor v2,v3,v3
+ bdnzf 25,5b
+ add r9,r3,r12
+ addi r11,r11,-1
+ bgt cr6,8f
+ vxor v3,v3,v3
+6: lvx v5,0,r10
+ vaddcuw v9,v2,v3
+ rlwinm r10,r10,0,28,31
+ vadduwm v12,v2,v3
+ cmpi cr7,0,r10,0xF
+ vaddcuw v10,v12,v8
+ vadduwm v8,v12,v8
+ vadduwm v9,v9,v10
+ vadduwm v11,v9,v11
+ beq cr7, 7f
+ vsel v5,v5,v0,v1
+7: vaddcuw v9,v5,v8
+ vadduwm v8,v5,v8
+ vadduwm v11,v9,v11
+ vmrglh v2,v0,v8
+ vmrghh v3,v0,v8
+ rlwinm r10,r1,0,0,27
+ vsumsws v0,v11,v0
+ vadduwm v8,v2,v3
+ li r12,-16
+ vsumsws v8,v8,v0
+ stvx v8,r10,r12
+ lwz r3,-4(r10 )
+ addc r3,r3,r5
+ addze r3,r3
+ blr
+ .align 4
+8: lvx v3,r3,r12
+ addi r11,r11,-1
+ vaddcuw v9,v2,v8
+ vadduwm v8,v2,v8
+ mtcrf 0x02,r9
+ addi r9,r9,16
+ addi r0,r11,-2
+ vor v2,v3,v3
+ addi r12,r12,16
+ vadduwm v11,v9,v11
+ bdnzf 27,8b
+ mtcrf 0x02,r10
+ addi r11,r3,96
+ vxor v3,v3,v3
+ bns cr6,9f
+ bdnz 9f
+9: lvx v5,r3,r12
+ addi r12,r12,16
+ vaddcuw v9,v2,v3
+ lvx v6,r3,r12
+ addi r11,r11,32
+ vadduwm v12,v2,v3
+ dcbt 0,r11
+ addi r12,r12,16
+ vaddcuw v10,v12,v8
+ vadduwm v8,v12,v8
+ vadduwm v9,v9,v10
+ bdz 10f
+10: vadduwm v11,v9,v11
+ vor v2,v5,v5
+ vor v3,v6,v6
+ bdnz 9b
+ bso cr6,11f
+ b 6b
+11: lvx v5,r3,r12
+ addi r12,r12,16
+ vaddcuw v9,v2,v3
+ vadduwm v12,v2,v3
+ vaddcuw v10,v12,v8
+ vadduwm v8,v12,v8
+ vadduwm v9,v9,v10
+ vadduwm v11,v9,v11
+ vxor v3,v3,v3
+ vor v2,v5,v5
+ b 6b
diff --git a/liboil/motovec/string_vec.S b/liboil/motovec/string_vec.S
new file mode 100644
index 0000000..4da4a3e
--- /dev/null
+++ b/liboil/motovec/string_vec.S
@@ -0,0 +1,1375 @@
+/*
+ * AltiVec versions (*_vec) of equivalent Linux library functions
+ * found in /arch/ppc/lib/string.S from Linux 2.4.17. Suggest this
+ * file be appended to that one when building a Linux kernel that
+ * will employ these functions.
+ *
+ * Copyright (C) Motorola, Inc. 2003
+ *
+ * Revision history:
+ * Rev 0.0 Original Chuck Corley 5/28/03
+ * Contact at risc10@motorola.com
+ * Commented source code for Altivec version available at
+ * www.motorola.com/altivec
+ *
+ * AltiVec versions will only deal with L1_CACHE_LINE_SIZE=32
+ */
+
+
+#ifndef TEST_OUTSIDE_LINUX
+#include "../kernel/ppc_asm.tmpl"
+#include <linux/config.h>
+#include <asm/processor.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#if 0
+#define v0 vr0
+#define v1 vr1
+#define v2 vr2
+#define v3 vr3
+#define v4 vr4
+#define v5 vr5
+#define v6 vr6
+#define v7 vr7
+#define v8 vr8
+#define v9 vr9
+#define v10 vr10
+#define v11 vr11
+#define v12 vr12
+#define v13 vr13
+#define v14 vr14
+#define v15 vr15
+#endif
+#else
+#define EFAULT 0
+#define L1_CACHE_LINE_SIZE 32
+#define LG_L1_CACHE_LINE_SIZE 5
+#define MAX_L1_COPY_PREFETCH 1
+#endif
+
+/* AltiVec versions of selected functions for use on AltiVec
+ * enabled G4 and later microprocessors.
+ */
+#if defined(__GNUC__) || defined(__MWERKS__) /* gcc and codewarrior don't assemble dcba */
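+/* The .long values below are the X-form encodings of dcba:
+ * 0x7C000000 | (rA << 16) | (rB << 11) | (758 << 1), giving
+ * dcba r3,r7 = 0x7c033dec, dcba r3,r9 = 0x7c034dec, dcba 0,r8 = 0x7c0045ec */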
+#define DCBA_R3R7 .long 0x7c033dec
+#define DCBA_R3R9 .long 0x7c034dec
+#define DCBA_R0R8 .long 0x7c0045ec
+#else
+#define DCBA_R3R7 dcba r3,r7
+#define DCBA_R3R9 dcba r3,r9
+#define DCBA_R0R8 dcba 0,r8
+#endif
+
+ .text
+ .align 5
+ .global backwards_memcpy_vec
+backwards_memcpy_vec:
+ nop
+ .global memmove_vec
+memmove_vec:
+ nop
+ .global cacheable_memcpy_vec
+cacheable_memcpy_vec:
+ nop
+ .global memcpy_vec
+memcpy_vec:
+ subf. r7,r4,r3
+ cmpi cr1,0,r5,0
+ cmpi cr7,0,r5,16
+ addi r8,r4,-1
+ addi r9,r3,-1
+ add r10,r4,r5
+ beqlr
+ add r11,r3,r5
+ subf r0,r3,r4
+ beqlr cr1
+ bgt 2f
+ cmpi cr5,0,r0,128
+ bgt cr7,23f
+ mtctr r5
+1: lbzu r0,1(r8)
+ stbu r0,1(r9)
+ bdnz 1b
+ blr
+2: cmpi cr5,0,r7,128
+ cmp cr6,0,r7,r5
+ bgt cr7,4f
+ mtctr r5
+3: lbzu r0,-1(r10)
+ stbu r0,-1(r11)
+ bdnz 3b
+ blr
+
+4: rlwinm r8,r4,0,28,31
+ rlwinm r9,r3,0,28,31
+ bge cr6,24f
+ lis r11,0x010c
+ subf. r8,r9,r8
+ lvsr v2,0,r7
+ ori r11,r11,0xffe0
+ addi r11,r10,-1
+ bgt 5f
+ addi r8,r8,16
+5: rlwinm r11,r11,0,0,27
+ addi r7,r5,-1
+ subf r0,r11,r10
+ add r11,r3,r7
+ addi r10,r3,16
+ subf. r8,r0,r8
+ rlwinm r0,r11,0,28,31
+ rlwinm r10,r10,0,0,27
+ blt 6f
+ lvx v1,r4,r7
+ addi r4,r4,-16
+6: lvx v0,r4,r7
+ subf r10,r10,r11
+ cmpi cr7,0,r0,0xF
+ cmpi cr1,0,r9,0
+ rlwinm r10,r10,28,4,31
+ add r0,r3,r5
+ cmpi cr6,0,r10,0
+ vperm v3,v0,v1,v2
+ vor v1,v0,v0
+ beq cr7,10f
+ mtcrf 0x01,r0
+ rlwinm r11,r11 ,0,0,27
+ li r9,0
+ bnl cr7,7f
+ stvewx v3,r11,r9
+ addi r9,r9,4
+ stvewx v3,r11,r9
+ addi r9,r9,4
+7: bng cr7,8f
+ stvewx v3,r11,r9
+ addi r9,r9,4
+8: bne cr7,9f
+ stvehx v3,r11,r9
+ addi r9,r9,2
+9: bns cr7,11f
+ stvebx v3,r11,r9
+ b 11f
+10: stvx v3,r3,r7
+11: addi r7,r7,-16
+ ble cr6,13f
+ mtctr r10
+ cmpi cr6,0,r10,4
+12: lvx v0,r4,r7
+ vperm v3,v0,v1,v2
+ vor v1,v0,v0
+ stvx v3,r3,r7
+ addi r7,r7,-16
+ bdnzf 25,12b
+ add r9,r3,r7
+ bgt cr6,19f
+13: blt 14f
+ addi r4,r4,16
+14: lvx v0,0,r4
+ vperm v3,v0,v1,v2
+ subfic r9,r3,16
+ beq cr1,18f
+ mtcrf 0x01,r9
+ li r9,0
+ bns cr7,15f
+ stvebx v3,r3,r9
+ addi r9,r9,1
+15: bne cr7,16f
+ stvehx v3,r3,r9
+ addi r9,r9,2
+16: bng cr7,17f
+ stvewx v3,r3,r9
+ addi r9,r9,4
+17: bnllr cr7
+ stvewx v3,r3,r9
+ addi r9,r9,4
+ stvewx v3,r3,r9
+ blr
+18: stvx v3,0,r3
+ blr
+19: lvx v0,r4,r7
+ mtcrf 0x02,r9
+ vperm v3,v0,v1,v2
+ vor v1,v0,v0
+ addi r9,r9,-16
+ stvx v3,r3,r7
+ vor v7,v0,v0
+ addi r7,r7,-16
+ bdnzt 27,19b
+ lis r8,0x102
+ mtcrf 0x02,r3
+ addi r9,r7,-16
+ ori r8,r8,0xffe0
+ addi r11,r4,-64
+ bso cr6,20f
+ bdnz 20f
+20: lvx v6,r4,r7
+ addi r11,r11,-32
+ lvx v1,r4,r9
+ vperm v3,v6,v7,v2
+ DCBA_R3R9
+ vperm v4,v1,v6,v2
+ vor v7,v1,v1
+ bdz 21f
+21: stvx v3,r3,r7
+ addi r7,r9,-16
+ stvx v4,r3,r9
+ addi r9,r7,-16
+ bdnz 20b
+ bns cr6,22f
+ b 13b
+22: lvx v1,r4,r7
+ vperm v4,v1,v7,v2
+ stvx v4,r3,r7
+ b 13b
+
+23: rlwinm r8,r4,0,28,31
+ rlwinm r9,r3,0,28,31
+24: lis r10,0x010c
+ subf. r8,r8,r9
+ lvsr v2,0,r7
+ ori r10,r10,32
+ dst r4,r10,0
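+	/* dst control word 0x010c0020: prefetch 12 blocks of 16 bytes,
+	 * 32 bytes apart, from the source (stream 0) */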
+ addi r10,r3,16
+ addi r11,r11,-1
+ bge 25f
+ lvx v0,0,r4
+ addi r4,r4,16
+25: lvx v1,0,r4
+ rlwinm r10,r10,0,0,27
+ cmpi cr1,0,r9,0
+ subf r0,r3,r10
+ subf r10,r10,r11
+ li r7,0
+ mtcrf 0x01,r0
+ rlwinm r10,r10,28,4,31
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+ beq cr1,29f
+ bns cr7,26f
+ stvebx v3,r3,r7
+ addi r7,r7,1
+26: bne cr7,27f
+ stvehx v3,r3,r7
+ addi r7,r7,2
+27: bng cr7,28f
+ stvewx v3,r3,r7
+ addi r7,r7,4
+28: bnl cr7,30f
+ stvewx v3,r3,r7
+ addi r7,r7,4
+ stvewx v3,r3,r7
+ b 30f
+29: stvx v3,0,r3
+30: rlwinm r0,r11,0,28,31
+ cmpi cr6,0,r10,0
+ li r7,16
+ cmpi cr1,0,r0,0xF
+ cmpi cr7,0,r10,14
+ ble cr6,32f
+ mtctr r10
+ cmpi cr6,0,r10,4
+31: lvx v1,r4,r7
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+ stvx v3,r3,r7
+ addi r7,r7,16
+ bdnzf 25,31b
+ add r9,r3,r7
+ addi r10,r10,-1
+ bgt cr6,38f
+32: add r11,r3,r5
+ add r10,r4,r5
+ bge 33f
+ addi r10,r10,-16
+33: mtcrf 0x01,r11
+ addi r11,r11,-1
+ addi r0,r10,-1
+ lvx v1,0,r0
+ dss 0
+ dss 1
+ vperm v3,v0,v1,v2
+ beq cr1,37f
+ rlwinm r11,r11,0,0,27
+ li r9,0
+ bnl cr7,34f
+ stvewx v3,r11,r9
+ addi r9,r9,4
+ stvewx v3,r11,r9
+ addi r9,r9,4
+34: bng cr7,35f
+ stvewx v3,r11,r9
+ addi r9,r9,4
+35: bne cr7,36f
+ stvehx v3,r11,r9
+ addi r9,r9,2
+36: bnslr cr7
+ stvebx v3,r11,r9
+ blr
+37: stvx v3,r3,r7
+ blr
+
+38: lvx v1,r4,r7
+ addi r10,r10,-1
+ mtcrf 0x02,r9
+ addi r9,r9,16
+ addi r0,r10,-2
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+ stvx v3,r3,r7
+ addi r7,r7,16
+ bdnzf 27,38b
+ mtcrf 0x02,r11
+ lis r8,0x104
+ addi r9,r7,16
+ ori r8,r8,32
+ rlwinm r11,r0,29,3,31
+ rlwinm r0,r0,0,0,28
+ bgt cr7,43f
+39: addi r11,r4,256
+ xoris r8,r8,0x6
+ bns cr6,40f
+ bdnz 40f
+40: lvx v1,r4,r7
+ addi r11,r11,32
+ lvx v6,r4,r9
+ vperm v4,v0,v1,v2
+ dst r11,r8,1
+ DCBA_R3R7
+ vperm v3,v1,v6,v2
+ vor v0,v6,v6
+ bdz 41f
+41: stvx v4,r3,r7
+ addi r7,r9,16
+ stvx v3,r3,r9
+ addi r9,r7,16
+ bdnz 40b
+ bso cr6,42f
+ b 32b
+42: lvx v1,r4,r7
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+ stvx v3,r3,r7
+ addi r7,r7,16
+ b 32b
+
+43: subf r10,r0,r10
+ blt cr5,39b
+ mtctr r11
+ addi r11,r4,256
+44: lvx v1,r4,r7
+ addi r9,r7,32
+ addi r11,r11,128
+ lvx v7,r4,r9
+ addi r9,r9,32
+ lvx v9,r4,r9
+ addi r9,r9,32
+ lvx v11,r4,r9
+ addi r9,r7,16
+ lvx v6,r4,r9
+ addi r9,r9,32
+ lvx v8,r4,r9
+ addi r9,r9,32
+ lvx v10,r4,r9
+ addi r9,r9,32
+ vperm v3,v0,v1,v2
+ lvx v0,r4,r9
+ vperm v4,v1,v6,v2
+ dst r11,r8,1
+ DCBA_R3R7
+ stvx v3,r3,r7
+ addi r7,r7,16
+ vperm v5,v6,v7,v2
+ stvx v4,r3,r7
+ addi r7,r7,16
+ vperm v6,v7,v8,v2
+ DCBA_R3R7
+ stvx v5,r3,r7
+ addi r7,r7,16
+ vperm v7,v8,v9,v2
+ stvx v6,r3,r7
+ addi r7,r7,16
+ vperm v8,v9,v10,v2
+ DCBA_R3R7
+ stvx v7,r3,r7
+ addi r7,r7,16
+ vperm v9,v10,v11,v2
+ stvx v8,r3,r7
+ addi r7,r7,16
+ vperm v10,v11,v0,v2
+ DCBA_R3R7
+ stvx v9,r3,r7
+ addi r7,r7,16
+ stvx v10,r3,r7
+ addi r7,r7,16
+ bdnz 44b
+ mtctr r10
+ addi r9,r7,16
+ bns cr6,40b
+ bdnz 40b
+
+ .global bcopy_vec
+bcopy_vec:
+ mr r0,r3
+ mr r3,r4
+ mr r4,r0
+ b memcpy_vec
+
+ .text
+ .align 4
+ .globl __clear_user_vec
+__clear_user_vec:
+ mr r5,r4
+ li r4,0
+ .globl memset_vec
+memset_vec:
+ cmpi cr7,0,r5,16
+ cmpi cr1,0,r5,0
+ rlwinm. r8,r4,28,28,3
+ addi r9,r3,-1
+ addi r10,r3,16
+ add r6,r3,r5
+ bgt cr7,2f
+ mtctr r5
+ beqlr cr1
+1: stbu r4,1(r9)
+ bdnz 1b
+ blr
+2: rlwinm r10,r10,0,0,27
+ addi r11,r6,-1
+ subf r9,r3,r10
+ li r7,0
+ vxor v0,v0,v0
+ subf r10,r10 ,r11
+ cmpi cr1,0,r9,16
+ beq 3f
+ lvsl v0,0,r8
+ vspltisb v1,4
+ lvsl v2,0,r4
+ vslb v0,v0,v1
+ vor v0,v0,v2
+ vspltb v0,v0,0
+3: mtcrf 0x01,r9
+ rlwinm r10,r10,28,4,31
+ beq cr1,7f
+ bns cr7,4f
+32: stvebx v0,r3,r7
+ addi r7,r7,1
+4: bne cr7,5f
+42: stvehx v0,r3,r7
+ addi r7,r7,2
+5: bng cr7,6f
+52: stvewx v0,r3,r7
+ addi r7,r7,4
+6: bnl cr7,8f
+62: stvewx v0,r3,r7
+ addi r7,r7,4
+64: stvewx v0,r3,r7
+ b 8f
+7: stvx v0,0,r3
+8: rlwinm r0,r11,0,28,31
+ cmpi cr6,0,r10,0
+ li r7,16
+ cmpi cr1,0,r0,0xF
+ ble cr6,10f
+ mtctr r10
+ cmpi cr6,0,r10,4
+9: stvx v0,r3,r7
+ addi r7,r7,16
+ bdnzf 25,9b
+ add r9,r3,r7
+ addi r10,r10,-1
+ bgt cr6,16f
+10: mtcrf 0x01,r6
+ beq cr1,14f
+ rlwinm r11,r11,0,0,27
+ li r9,0
+ bnl cr7,11f
+102: stvewx v0,r11,r9
+ addi r9,r9,4
+104: stvewx v0,r11,r9
+ addi r9,r9,4
+11: bng cr7,12f
+112: stvewx v0,r11,r9
+ addi r9,r9,4
+12: bne cr7,13f
+122: stvehx v0,r11,r9
+ addi r9 ,r9 ,2
+13: bnslr cr7
+132: stvebx v0,r11,r9
+ blr
+14: stvx v0,r3,r7
+ blr
+
+16: addi r10,r10,-1
+ mtcrf 0x02,r9
+ addi r9,r9,16
+162: stvx v0,r3,r7
+ addi r7,r7,16
+ bdnzf 27,16b
+ mtcrf 0x02,r11
+ bns cr6,17f
+ bdnz 17f
+17: stvx v0,r3,r7
+ addi r7,r7,16
+ bdz 18f
+18: stvx v0,r3,r7
+ addi r7,r7,16
+ bdnz 17b
+ bso cr6,19f
+ b 10b
+19: stvx v0,r3,r7
+ addi r7,r7,16
+ b 10b
+
+/* Intent of this exception table appears to be to return the byte count */
+/* remaining to be cleared when the current store error occurred. Chuck */
+/* Memset doesn't require it but the code is identical to __clear_user */
+/* FIRST FAILURE CHECKED BY RECOMPILATION WITH BRANCHES SUBSTITUTED
+ * FOR STORES. chuckc 030515
+*/
+
+91: mfctr r3 /* Return byte count remaining */
+ blr
+92: subf r3,r7,r5 /* BC minus bytes already stored */
+ blr
+93: mr r3,r5 /* Nothing stored yet */
+ blr
+94: add r11,r3,r5
+ rlwinm r6,r11,0,28,31 /* Bytes in last vector */
+ b 99f
+95: add r11,r3,r5
+ rlwinm r6,r11,0,28,31
+ subf r3,r9,r6
+ blr
+96: li r3,16 /* 16 bytes in last vector to be stored. */
+ blr
+97: add r11,r3,r5
+ rlwinm r6,r11,0,27,31
+99: mfctr r3
+ rlwinm r3,r3,4,0,27
+ add r3,r3,r6
+ blr
+98: add r11,r3,r5
+ rlwinm r3,r11,0,27,31
+ blr
+
+#ifndef TEST_OUTSIDE_LINUX
+ .section __ex_table,"a"
+ .align 2
+ .long 1b,91b
+ .long 32b,92b
+ .long 42b,92b
+ .long 52b,92b
+ .long 62b,92b
+ .long 64b,92b
+ .long 7b,93b
+ .long 9b,94b
+ .long 102b,95b
+ .long 104b,95b
+ .long 112b,95b
+ .long 122b,95b
+ .long 132b,95b
+ .long 14b,96b
+ .long 162b,94b
+ .long 17b,97b
+ .long 18b,97b
+ .long 19b,98b
+#endif
+ .text
+/* Scalar __copy_tofrom_user always copies forward and never checks
+ * for overlap, __copy_tofrom_user_vec will do the same except it will
+ * check that overlap is > 128B before entering 128B loop when copying
+ * forward.
+ * The scalar version always assumes the destination and source
+ * are word aligned. This routine will assume the same to simplify handling
+ * exceptions. chuckc
+ */
+
+ .globl __copy_tofrom_user_vec
+__copy_tofrom_user_vec:
+ subf. r7,r4,r3
+ cmpi cr1,0,r5,0
+ cmpi cr7,0,r5,16
+ addi r8,r4,-1
+ addi r9,r3,-1
+ add r10,r4,r5
+ beqlr
+ add r11,r3,r5
+ subf r0,r3,r4
+ beqlr cr1
+ bgt 1f
+ cmpi cr5,0,r0,128 /* Overlap |(DST-SRC)|> 128B? */
+ bgt cr7,23f /* b to v_memcpy */
+1: cmpi cr5,0,r7,128 /* Overlap |(DST-SRC)|> 128B? */
+ bgt cr7,23f /* b to v_memcpy */
+ mtctr r5
+2: lbzu r0,1(r8)
+202: stbu r0,1(r9)
+ bdnz 2b
+ li r3,0
+ blr
+
+23: rlwinm r8,r4,0,28,31
+ rlwinm r9,r3,0,28,31
+24: lis r10,0x010c
+ subf. r8,r8,r9
+ lvsr v2,0,r7
+ ori r10,r10,32
+ dst r4,r10,0
+ addi r10,r3,16
+ addi r11,r11,-1
+ bge 25f
+241: lvx v0,0,r4
+ addi r4,r4,16
+25: lvx v1,0,r4
+ rlwinm r10,r10,0,0,27
+ cmpi cr1,0,r9,0
+ subf r0,r3,r10
+ subf r10,r10,r11
+ li r7,0
+ mtcrf 0x01,r0
+ rlwinm r10,r10,28,4,31
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+ beq cr1,29f
+ bns cr7,26f
+252: stvebx v3,r3,r7
+ addi r7,r7,1
+26: bne cr7,27f
+262: stvehx v3,r3,r7
+ addi r7,r7,2
+27: bng cr7,28f
+272: stvewx v3,r3,r7
+ addi r7,r7,4
+28: bnl cr7,30f
+282: stvewx v3,r3,r7
+ addi r7,r7,4
+284: stvewx v3,r3,r7
+ b 30f
+29: stvx v3,0,r3
+30: rlwinm r0,r11,0,28,31
+ cmpi cr6,0,r10,0
+ li r7,16
+ cmpi cr1,0,r0,0xF
+ cmpi cr7,0,r10,14
+ ble cr6,32f
+ mtctr r10
+ cmpi cr6,0,r10,4
+31: lvx v1,r4,r7
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+312: stvx v3,r3,r7
+ addi r7,r7,16
+ bdnzf 25,31b
+ add r9,r3,r7
+ addi r10,r10,-1
+ bgt cr6,38f
+32: add r11,r3,r5
+ add r10,r4,r5
+ bge 33f
+ addi r10,r10,-16
+33: mtcrf 0x01,r11
+ addi r11,r11,-1
+ addi r0,r10,-1
+331: lvx v1,0,r0
+ dss 0
+ dss 1
+ vperm v3,v0,v1,v2
+ beq cr1,37f
+ rlwinm r11,r11,0,0,27
+ li r9,0
+ li r3,0
+ bnl cr7,34f
+332: stvewx v3,r11,r9
+ addi r9,r9,4
+334: stvewx v3,r11,r9
+ addi r9,r9,4
+34: bng cr7,35f
+342: stvewx v3,r11,r9
+ addi r9,r9,4
+35: bne cr7,36f
+352: stvehx v3,r11,r9
+ addi r9,r9,2
+36: bnslr cr7
+362: stvebx v3,r11,r9
+ blr
+37: stvx v3,r3,r7
+ li r3,0
+ blr
+
+ .align 4
+38: lvx v1,r4,r7
+ addi r10,r10,-1
+ mtcrf 0x02,r9
+ addi r9,r9,16
+ addi r0,r10,-2
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+382: stvx v3,r3,r7
+ addi r7,r7,16
+ bdnzf 27,38b
+ mtcrf 0x02,r11
+ lis r8,0x104
+ addi r9,r7,16
+ ori r8,r8,32
+ rlwinm r11,r0,29,3,31
+ rlwinm r0,r0,0,0,28
+ bgt cr7,43f
+39: addi r11,r4,256
+ xoris r8,r8,0x6
+ bns cr6,40f
+ bdnz 40f
+40: lvx v1,r4,r7
+ addi r11,r11,32
+401: lvx v6,r4,r9
+ vperm v4,v0,v1,v2
+ dst r11,r8,1
+ DCBA_R3R7
+ vperm v3,v1,v6,v2
+ vor v0,v6,v6
+402: stvx v4,r3,r7
+ addi r7,r9,16
+ bdz 41f
+41: stvx v3,r3,r9
+ addi r9,r7,16
+ bdnz 40b
+ bso cr6,42f
+ b 32b
+42: lvx v1,r4,r7
+ vperm v3,v0,v1,v2
+ vor v0,v1,v1
+422: stvx v3,r3,r7
+ addi r7,r7,16
+ b 32b
+
+43: subf r10,r0,r10
+ blt cr5,39b
+ mtctr r11
+ addi r11,r4,256
+44: lvx v1,r4,r7
+ addi r9,r7,32
+ addi r11,r11,128
+443: lvx v7,r4,r9
+ addi r9,r9,32
+447: lvx v9,r4,r9
+ addi r9,r9,32
+451: lvx v11,r4,r9
+ addi r9,r7,16
+441: lvx v6,r4,r9
+ addi r9,r9,32
+445: lvx v8,r4,r9
+ addi r9,r9,32
+449: lvx v10,r4,r9
+ addi r9,r9,32
+ vperm v3,v0,v1,v2
+453: lvx v0,r4,r9
+ vperm v4,v1,v6,v2
+ dst r11,r8,1
+ DCBA_R3R7
+440: stvx v3,r3,r7
+ addi r7,r7,16
+ vperm v5,v6,v7,v2
+442: stvx v4,r3,r7
+ addi r7,r7,16
+ vperm v6,v7,v8,v2
+ DCBA_R3R7
+444: stvx v5,r3,r7
+ addi r7,r7,16
+ vperm v7,v8,v9,v2
+446: stvx v6,r3,r7
+ addi r7,r7,16
+ vperm v8,v9,v10,v2
+ DCBA_R3R7
+448: stvx v7,r3,r7
+ addi r7,r7,16
+ vperm v9,v10,v11,v2
+450: stvx v8,r3,r7
+ addi r7,r7,16
+ vperm v10,v11,v0,v2
+ DCBA_R3R7
+452: stvx v9,r3,r7
+ addi r7,r7,16
+454: stvx v10,r3,r7
+ addi r7,r7,16
+ bdnz 44b
+ mtctr r10
+ addi r9,r7,16
+ bns cr6,40b
+ bdnz 40b
+
+/* Intent of this exception table is to return:
+ * r3 = bytes not copied (but preserve dst address in r3 til end)
+ * r4 = 0 on read fault; 1 on write fault
+ * Register usage here:
+ * r5 = (preserve as total byte count til near end)
+ * r6 = bytes not copied (move to r3 at end)
+ * r7 = byte count index from memcpy_vec
+ * r9 = alternate byte count index in 128B loop
+ * r10= vectors (QWs remaining) after 128B loop
+ * r11= next destination address (assume word-aligned)
+ * For read fault, clear out the destination for bytes remaining
+ * starting at r3(dst) + r5(byte count) - r6 (bytes remaining).
+ */
+
+
+/* read fault, initial single-byte copy */
+100: li r4,0
+ mfctr r3
+101: stbu r4,1(r9)
+ bdnz 101b
+ blr
+
+/* write fault, initial single-byte copy */
+102: li r4,1
+ mfctr r3
+ blr
+
+/* read fault, initial vector(s) load */
+103: li r4,0
+ b 91f
+
+/* write fault, initial partial vector store */
+104: li r4,1
+ subf r5,r7,r5 /* BC minus bytes in 1st vector already stored */
+ add r3,r3,r7 /* dst plus bytes in 1st vector already stored. */
+ b 91f
+
+/* write fault, initial full vector store */
+105: li r4,1
+91: mr r6,r5
+ b 98f
+
+/* read fault in 16B loop(s) and 32B loop (treat as both loads fail)*/
+106: li r4,0
+ b 94f
+
+/* write fault in 16B loop(s), 128B, and first write fault in 32B loop */
+107: li r4,1
+ b 94f
+
+/* second write fault in 32B loop */
+108: li r4,1
+ add r11,r3,r5 /* Last dst byte + 1 */
+ add r3,r3,r9 /* Current dst byte */
+ b 95f
+
+/* read fault in 128B loop (treat as all loads fail)*/
+112: li r4,0
+ mfctr r0
+ slwi r0,r0,7 /* Convert 128B loop ctr to bytes */
+ add r11,r3,r5
+ slwi r10,r10,4 /* convert QW vectors remaining to bytes */
+ add r3,r3,r7
+ rlwinm r6,r11,0,28,31 /* Bytes in last vector(s) */
+ rlwinm r3,r3,0,0,27
+ add r6,r6,r10
+ add r6,r6,r0
+ b 98f
+
+/* read fault, final vector(s) load */
+114: li r4,0
+94: add r11,r3,r5
+ add r3,r3,r7
+95: rlwinm r3,r3,0,0,27
+ subf r6,r3,r11
+ b 98f
+
+/* write fault, final partial vector store */
+115: li r4,1
+ add r11,r3,r5
+ add r3,r3,r7
+ rlwinm r3,r3,0,0,27
+ subf r6,r3,r11
+ subf r6,r9,r6 /* minus bytes already stored */
+ b 98f
+
+/* write fault, final full vector store */
+116: li r4,1
+ add r3,r3,r7
+ rlwinm r3,r3,0,0,27
+ li r6,16
+ b 98f
+
+/*
+ * At this stage the number of bytes not copied is in r6
+ * and r4 is 0 for read or 1 for write.
+ * (Like the scalar version, assume dst is word-aligned.)
+ */
+98: cmpwi 0,r4,0
+ bne 120f
+/* for read fault, clear out the destination: r6 bytes remaining
+ */
+ srwi. r0,r6,2
+ addi r3,r3,-4
+ subf r10,r6,r5
+ mtctr r0
+ beq 118f
+117: stwu r4,4(r3)
+ bdnz 117b
+118: andi. r0,r6,3
+ mtctr r0
+ beq 120f
+119: stb r4,4(r3)
+ addi r3,r3,1
+ bdnz 119b
+120: mr r3,r6
+ blr
+
+121: li r4,1
+ mfctr r3
+ rlwinm r3,r3,2,0,29
+ andi. r0,r6,3
+ add r3,r3,r0
+ blr
+
+
+#ifndef TEST_OUTSIDE_LINUX
+ .section __ex_table,"a"
+ .align 2
+ .long 2b,100b
+ .long 202b,102b
+ .long 241b,103b
+ .long 25b,103b
+ .long 252b,104b
+ .long 262b,104b
+ .long 272b,104b
+ .long 282b,104b
+ .long 284b,104b
+ .long 29b,105b
+ .long 31b,106b
+ .long 312b,107b
+ .long 331b,114b
+ .long 332b,115b
+ .long 334b,115b
+ .long 342b,115b
+ .long 352b,115b
+ .long 362b,115b
+ .long 37b,116b
+ .long 38b,106b
+ .long 382b,107b
+ .long 40b,106b
+ .long 401b,106b
+ .long 402b,107b
+ .long 41b,108b
+ .long 42b,106b
+ .long 422b,107b
+ .long 44b,112b
+ .long 443b,112b
+ .long 447b,112b
+ .long 451b,112b
+ .long 441b,112b
+ .long 445b,112b
+ .long 449b,112b
+ .long 453b,112b
+ .long 440b,107b
+ .long 442b,107b
+ .long 444b,107b
+ .long 446b,107b
+ .long 448b,107b
+ .long 450b,107b
+ .long 452b,107b
+ .long 454b,107b
+ .long 101b,102b
+ .long 117b,121b
+ .long 119b,102b
+#endif
+
+ .text
+ .align 5
+
+ .global strlen_vec
+strlen_vec:
+
+ lvxl v2,0,r3
+ vxor v0,v0,v0
+ lvsl v5,0,r3
+ vnor v1,v0,v0
+ rlwinm r5,r3,0,28,31
+ vperm v2,v2,v1,v5
+ mr r4,r3
+ li r3,16
+ vcmpequb. v4,v0,v2
+ vsldoi v5,v0,v1,8
+ bne cr6,2f
+ subf r3,r5,r3
+1: lvxl v2,r4,r3
+ addi r3,r3,16
+ vcmpequb. v4,v0,v2
+ beq cr6,1b
+2: vandc v3,v2,v5
+ vsldoi v7,v0,v1,4
+ vcmpequb. v4,v3,v5
+ vsldoi v8,v0,v1,12
+ beq cr6,10f
+ vandc v3,v2,v8
+ vsldoi v5,v0,v1,10
+ vcmpequb. v4,v3,v8
+ vsldoi v9,v0,v1,14
+ beq cr6,6f
+ vandc v3,v2,v9
+ vsldoi v8,v0,v1,13
+ vcmpequb. v4,v3,v9
+ vsldoi v10,v0,v1,15
+ beq cr6,4f
+ vandc v3,v2,v10
+ vcmpequb. v4,v3,v10
+ beq cr6,3f
+ addi r3,r3,-16
+ blr
+3: addi r3,r3,-15
+ blr
+
+4: vandc v3,v2,v8
+ vcmpequb. v4,v3,v8
+ beq cr6,5f
+ addi r3,r3,-14
+ blr
+5: addi r3,r3,-13
+ blr
+
+6: vandc v3,v2,v5
+ vsldoi v9,v0,v1,9
+ vcmpequb. v4,v3,v5
+ vsldoi v10,v0,v1,11
+ beq cr6,8f
+ vandc v3,v2,v10
+ vcmpequb. v4,v3,v10
+ beq cr6,7f
+ addi r3,r3,-12
+ blr
+7: addi r3,r3,-11
+ blr
+
+8: vandc v3,v2,v9
+ vcmpequb. v4,v3,v9
+ beq cr6,9f
+ addi r3,r3,-10
+ blr
+9: addi r3,r3,-9
+ blr
+
+10: vandc v3,v2,v7
+ vsldoi v5,v0,v1,2
+ vcmpequb. v4,v3,v7
+ vsldoi v10,v0,v1,6
+ beq cr6,14f
+ vandc v3,v2,v10
+ vsldoi v9,v0,v1,5
+ vcmpequb. v4,v3,v10
+ vsldoi v7,v0,v1,7
+ beq cr6,12f
+ vandc v3,v2,v7
+ vcmpequb. v4,v3,v7
+ beq cr6,11f
+ addi r3,r3,-8
+ blr
+11: addi r3,r3,-7
+ blr
+
+12: vandc v3,v2,v9
+ vcmpequb. v4,v3,v9
+ beq cr6,13f
+ addi r3,r3,-6
+ blr
+13: addi r3,r3,-5
+ blr
+
+14: vandc v3,v2,v5
+ vsldoi v8,v0,v1,1
+ vcmpequb. v4,v3,v5
+ vsldoi v10,v0,v1,3
+ beq cr6,16f
+ vandc v3,v2,v10
+ vcmpequb. v4,v3,v10
+ beq cr6,15f
+ addi r3,r3,-4
+ blr
+15: addi r3,r3,-3
+ blr
+
+16: vandc v3,v2,v8
+ vcmpequb. v4,v3,v8
+ beq cr6,17f
+ addi r3,r3,-2
+ blr
+17: addi r3,r3,-1
+ blr
+
+ .text
+ .align 5
+
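+/* strcmp_vec: compares the NUL-terminated strings at r3 and r4 a vector
+   at a time; returns a negative, zero, or positive value in r3 as the
+   first string is less than, equal to, or greater than the second. */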
+ .global strcmp_vec
+strcmp_vec:
+ lvxl v2,0,r3
+ vxor v0,v0,v0
+ addi r7,r4,16
+ lvxl v3,0,r4
+ vnor v1,v0,v0
+ xor r8,r7,r4
+ lvsl v6,0,r3
+ vspltisb v4,8
+ cmpi 2,0,r8,0x1000
+ lvsl v10,0,r4
+ vspltisb v12,1
+ beq 2,8f
+1: andi. r8,r3,0xF
+ lvxl v8,0,r7
+ vslb v13,v4,v12
+ andi. r9,r4,0xF
+ vperm v2,v2,v1,v6
+ subf. r0,r8,r9
+ addi r5,r3,16
+ vperm v9,v0,v1,v6
+ lvsl v6,0,r0
+ vor v7,v3,v3
+ vperm v3,v3,v8,v10
+ addi r4,r7,16
+ vslb v11,v13,v12
+ vor v3,v3,v9
+ xor r3,r3,r3
+ vcmpequb. v10,v2,v3
+ vslb v14,v11,v12
+ vnor v9,v10,v10
+ bc 4,6*4+0,3f
+ vcmpequb. v5,v0,v2
+ bc 4,6*4+2,7f
+ blt 6f
+2: lvxl v7,0,r4
+ addi r4,r4,16
+ lvxl v2,0,r5
+ addi r5,r5,16
+ vperm v3,v8,v7,v6
+ vcmpequb. v10,v2,v3
+ vnor v9,v10,v10
+ bc 12,6*4+0,5f
+3: vcmpequb v5,v0,v2
+ vsum4ubs v7,v4,v14
+ vor v9,v9,v5
+ vsro v12,v9,v11
+ vsrw v11,v9,v4
+ vsro v6,v9,v14
+ vsrw v14,v9,v13
+ vsro v13,v9,v7
+ vor v9,v12,v6
+ vsro v7,v14,v4
+ vor v9,v9,v13
+ vcmpgtuw v9,v9,v0
+ vor v9,v9,v11
+ vor v9,v9,v14
+ vor v9,v9,v7
+ vandc v11,v10,v9
+ vcmpequb. v14,v11,v9
+ vcmpgtub v7,v3,v2
+ bc 12,6*4+2,4f
+ vandc v11,v7,v9
+ li r3,-1
+ vcmpequb. v14,v11,v1
+ bc 4,6*4+2,4f
+ li r3,1
+4: blr
+
+5: vcmpequb. v5,v0,v2
+ bc 4,6*4+2,7f
+ lvxl v8,0,r4
+ addi r4,r4,16
+6: lvxl v2,0,r5
+ addi r5,r5,16
+ vperm v3,v7,v8,v6
+ vcmpequb. v10,v2,v3
+ vnor v9,v10,v10
+ bc 4,6*4+0,3b
+ vcmpequb. v5,v0,v2
+ bc 12,6*4+2,2b
+7: blr
+
+8: vcmpequb. v5,v0,v2
+ bc 13,6*4+2,1b
+ vcmpequb. v10,v2,v3
+ bc 4,6*4+0,3b
+ blr
+
+
+ .text
+ .align 5
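+/* memcmp_vec: compares r5 bytes at r3 and r4 (byte-by-byte for short
+   counts, by vector otherwise); returns a negative, zero, or positive
+   value in r3 according to the first differing byte. */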
+ .global memcmp_vec
+memcmp_vec:
+ subf. r6,r4,r3
+ cmpi cr1,0,r5,0
+ cmpi cr7,0,r5,16
+ add r9,r3,r5
+ addi r7,r4,-1
+ addi r11,r3,16
+ beq 2f
+ addi r10,r9,-1
+ addi r8,r3,-1
+ rlwinm r11,r11,0,0,27
+ beq cr1,2f
+ subf r11,r11,r10
+ rlwinm r9,r9,0,28,31
+ bgt cr7,3f
+ mtctr r5
+1: lbzu r6,1(r7)
+ lbzu r10,1(r8)
+ subf. r3,r6,r10
+ bdnzt 2,1b
+ blr
+
+2: xor r3,r3,r3
+ blr
+3: rlwinm r11,r11,28,4,31
+ rlwinm r7,r4,0,28,31
+ rlwinm r8,r3,0,28,31
+ cmpi cr1,0,r11,0
+ lvxl v0,0,r3
+ subf. r7,r7,r8
+ li r7,16
+ lvxl v1,0,r4
+ vor v2,v1,v1
+ addi r5,r5,-1
+ bge 4f
+ lvxl v2,r4,r7
+ addi r4,r4,16
+ addi r5,r5,-16
+4: lvsl v3,0,r3
+ vspltisb v4,8
+ vxor v5,v5,v5
+ lvsl v6,0,r4
+ vspltisb v7,1
+ vnor v8,v5,v5
+ lvsr v10,0,r6
+ cmpi cr5,0,r9,0
+ vperm v11,v5,v8,v3
+ lvsr v12,0,r9
+ vperm v0,v0,v8,v3
+ vperm v1,v1,v2,v6
+ vslb v3,v4,v7
+ vor v1,v1,v11
+ vslb v6,v3,v7
+ vcmpequb. v8,v0,v1
+ vslb v7,v6,v7
+ vnor v13,v8,v8
+ bc 4,6*4+0,8f
+ ble cr1,6f
+ mtctr r11
+5: lvxl v9,r4,r7
+ lvxl v0,r3,r7
+ addi r7,r7,16
+ vperm v1,v2,v9,v10
+ vor v2,v9,v9
+ vcmpequb. v8,v0,v1
+ vnor v13,v8,v8
+ bdnzt 24,5b
+ bc 4,6*4+0,8f
+6: lvxl v9,r4,r5
+ vperm v12,v5,v8,v12
+ lvxl v0,r3,r7
+ vperm v1,v2,v9,v10
+ beq cr5,7f
+ vor v1,v1,v12
+ vor v0,v0,v12
+7: vcmpequb. v8,v0,v1
+ vnor v13,v8,v8
+ bc 4,6*4+0,8f
+ xor r3,r3,r3
+ blr
+8: vsum4ubs v2,v4,v7
+ vsro v9,v13,v6
+ vsrw v6,v13,v4
+ vsro v10,v13,v7
+ vsrw v7,v13,v3
+ vsro v3,v13,v2
+ vor v11,v9,v10
+ vsro v2,v7,v4
+ vor v11,v11,v3
+ vcmpgtuw v11,v11,v5
+ vor v11,v11,v6
+ vor v11,v11,v7
+ vor v11,v11,v2
+ vor v1,v1,v11
+ vor v0,v0,v11
+ li r3,-1
+ vcmpgtub. v8,v1,v0
+ bclr 4,6*4+2
+ li r3,1
+ blr
+
+ .text
+ .align 5
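+/* strcpy_vec: copies the NUL-terminated string at r4 to the buffer at
+   r3, switching to 16-byte vector copies after a short scalar prologue;
+   returns the original destination pointer in r3. */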
+ .global strcpy_vec
+strcpy_vec:
+ addi r5,r3,32
+ subf. r6,r4,r3
+ subf r7,r3,r4
+ rlwinm r5,r5,0,0,26
+ mr r8,r3
+ beqlr
+ bgt 1f
+ mr r6,r7
+1: subf. r9,r3,r5
+ addi r5,r8,4096
+ cmpi cr7,0,r6,16
+ mtctr r9
+2: lbzx r0,0,r4
+ addi r4,r4,1
+ cmpi cr1,0,r0,0
+ stbx r0,0,r8
+ addi r8,r8,1
+ bdnzf 6,2b
+ beqlr cr1
+ li r11,4096
+ rlwinm r5,r5,0,0,19
+ mr r10,r4
+ ble cr7,2b
+ subf. r5,r8,r5
+ rlwinm r5,r5,28,4,31
+ lvsl v4,0,r4
+ vxor v0,v0,v0
+ ble 9f
+ mtctr r5
+3: lvx v1,0,r10
+ addi r10,r10,16
+ bdz 10f
+4: lvx v2,0,r10
+ addi r10,r10,16
+ bdz 11f
+5: lvx v3,0,r10
+ addi r10,r10,16
+ bdz 12f
+6: vperm v5,v1,v2,v4
+ vperm v6,v2,v3,v4
+ vor v1,v3,v3
+ vcmpequb. v7,v0,v5
+ bne cr6,8f
+ addi r4,r4,16
+ vcmpequb. v7,v0,v6
+ bne cr6,7f
+ DCBA_R0R8
+ addi r4,r4,16
+ stvx v5,0,r8
+ addi r8,r8,16
+ stvx v6,0,r8
+ addi r8,r8,16
+ b 4b
+7: stvx v5,0,r8
+ addi r8,r8,16
+8: lbzx r0,0,r4
+ addi r4,r4,1
+ cmpi cr1,0,r0,0
+ stbx r0,0,r8
+ addi r8,r8,1
+ bne cr1,8b
+ blr
+
+9: mtctr r11
+ b 3b
+10: vcmpequb. v7,v0,v1
+ bnl cr6,8b
+ mtctr r11
+ b 4b
+11: vcmpequb. v7,v0,v2
+ bnl cr6,8b
+ mtctr r11
+ b 5b
+12: vcmpequb. v7,v0,v3
+ bnl cr6,8b
+ mtctr r11
+ b 6b
diff --git a/liboil/motovec/vec_csum.S b/liboil/motovec/vec_csum.S
new file mode 100644
index 0000000..29ddd11
--- /dev/null
+++ b/liboil/motovec/vec_csum.S
@@ -0,0 +1,724 @@
+//------------------------------------------------------------------
+// file: vec_csum.S
+// AltiVec enabled version of linux' checksum routines
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2003
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern unsigned long csum_partial_copy_generic(src, dst, len, sum,
+// src_err, dst_err);
+// Computes the checksum of a memory block at src, length len,
+// and adds in "sum" (32-bit), while copying the block to dst.
+// Returns:
+// unsigned long sum
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern unsigned long csum_partial(buff, len, sum);
+//
+// computes the checksum of a memory block at buff, length len,
+// and adds in "sum" (32-bit unsigned long)
+// Returns:
+// unsigned long sum
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Assumptions from studying the original linux code:
+// Copying forward is always safe
+// src and dst are always half-word aligned
+// len may be odd or even 0-n;
+// there is no test to see if src and dst are equal.
+// returns unsigned int checksum
+//
+//------------------------------------------------------------------
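+// For reference, a C sketch of the intended semantics (illustration
+// only; the helper name is ours, src_err/dst_err fault handling and all
+// alignment/vector optimizations are omitted, and the big-endian
+// halfword accumulation with end-around carry mirrors the scalar path
+// below):
+//
+//   unsigned long csum_partial_copy_ref(const unsigned char *src,
+//                                       unsigned char *dst, int len,
+//                                       unsigned long sum)
+//   {
+//       unsigned long long acc = sum;
+//       while (len > 1) {                     // halfword-aligned pairs
+//           acc += (src[0] << 8) | src[1];    // big-endian 16-bit add
+//           dst[0] = src[0]; dst[1] = src[1]; // copy while summing
+//           src += 2; dst += 2; len -= 2;
+//       }
+//       if (len) {                            // odd trailing byte
+//           dst[0] = src[0];
+//           acc += (unsigned long long)src[0] << 8;
+//       }
+//       while (acc >> 32)                     // fold carries back in
+//           acc = (acc & 0xffffffffULL) + (acc >> 32);
+//       return (unsigned long)acc;
+//   }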
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 04/19/03
+//
+// This is alpha quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 32 bytes.
+#define MIN_VEC 48 // Experimentally chosen on 7455@1GHz/133 to beat scalar
+
+ // Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define SRC r3 // entering: src ptr; exiting: unsigned long checksum
+
+#define DST r4 // entering: dst pointer; exiting:
+
+#define BC r5 // entering: Byte_Count
+
+#define SUM r6 // entering: Partial checksum
+
+#define SER r7 // entering: src_err address
+
+#define DER r8 // entering: dst_err address
+
+#define DM2 r9// dst -2 for hw-by-hw forwards initially
+#define D r9 // dst[28:31]
+#define DR r9 // dst[0:27]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define DBC r10// dst + byte count initially
+#define DBK r10// (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+
+#define SM2 r11// src -2 for hw-by-hw forwards initially
+#define QW r11 // number of quad words (vectors)
+#define SP8 r11 // data stream touch block & stride info for Big_loop
+#define SBC r11// src + byte count initially then src[28:31]
+
+#define BK r12 // Byte Kount index
+#define BLK r12 // temporary data stream touch block & stride info
+#define S r12// src[28:31]
+#define DMS r12 // dst - src initially
+
+#define V0 v0 // all zeros
+#define VCARS v0 // sum of carries
+
+#define V1 v1 // all ones
+#define VMM v1 // mask for final dst right
+
+#define VS0 v2 // src vector for permuting
+#define VL v2 // low data
+
+#define VS1 v3 // src vector for permuting
+#define VH v3 // high data
+
+#define VPS0 v4 // permuted source vector to store
+
+#define VP2 v5 // dst permute register
+#define VM v5 // mask for first dst left
+#define VS2 v5 // src vector for permuting
+
+#define VP3 v6 // d - s permute register
+#define VS3 v6 // 4th src vector in csum_partial
+
+#define VP4 v7 // Byte_Count permute register
+#define VPS1 v7 // 2nd permuted source vector to store
+
+#define VSUM v8 // Updated sum
+#define VFIN v8 // final sum
+
+#define VCAR1 v9 // temp register for carries
+#define VCAR3 v9 // temp register for carries
+
+#define VCAR2 v10 // temp register for carries
+
+#define VCARF v11 // temp register for carries
+
+#define VTEMP v12 // Temp register
+
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c0465ec
+// dcba r4,r12 or dcba DST,BK
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c0465ec
+.endm
+#else
+#define DCBK dcba DST,BK
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#endif // NO_DCBA
+
+// Conditionalize the use of dst (data stream touch). It will help
+// if the data is not in cache and hurt if it is (though not as badly
+// as dcbz). Generally, except for small benchmarks repeated many times,
+// we assume data is not in cache (data streaming) and using dst is a
+// performance boost.
+#ifndef NO_DST
+#define STRM_F dst SRC,BLK,0
+#define STRM_1 dst SP8,Rt,1
+
+#else
+#define STRM_F nop
+#define STRM_1 nop
+#endif
+ .text
+#define SP r1 // stack pointer; used below regardless of toolchain
+#if __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+
+#ifdef LIBMOTOVEC
+ .global csum_partial_copy_generic
+csum_partial_copy_generic:
+#else
+ .global vec_csum_partial_copy_generic
+vec_csum_partial_copy_generic:
+#endif
+
+ li BK,32 // IU1
+ rlwinm Rt,BC,31,1,31 // IU1 BC/2
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ dcbt SRC,BK // LSU prefetch next cacheline
+ cmpi cr6,0,Rt,0 // IU1 BC/2 == 0?
+ addic SUM,SUM,0 // IU1 Zero carry bit
+
+ addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,v_csumcpy // b if BC>MIN_VEC (will copy vectors fwd)
+ andi. BK,BC,1 // IU1 BC[31]==0?
+
+ addi DM2,DST,-2 // IU1 Pre-bias and duplicate destination
+ add S,SRC,BC // IU1 Last src byte + 1 (temp use of S)
+ beq cr6,No_HWs // b if BC/2==0
+ mtctr Rt // i=BC/2; do ...;i--; while (i>0)
+HW_cpy:
+ lhzu Rt,2(SM2) // LSU
+ sthu Rt,2(DM2) // LSU
+ addc SUM,SUM,Rt // IU1
+ bdnz HW_cpy
+No_HWs:
+ beq BC_even // b if BC[31]==0 (or DBC[31]==0 when aligned)
+ lbz Rt,-1(S) // LSU Get last src address byte
+
+ stb Rt,-1(DBC) // LSU Store to last dst address byte
+ rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left
+
+ addc SUM,SUM,Rt // IU1
+BC_even:
+ addze SRC,SUM
+ blr
+
+v_csumcpy:
+ lvsr VP2,0,DST // LSU Permute vector for initial byte mask
+ rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
+ rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
+
+ lvsr VP4,DST,BC // LSU Permute vector for final byte mask
+ subf. S,S,D // IU1 if D-S<0 essentially shifting left
+ subf DMS,SRC,DST // IU1 Compute dst-src difference
+
+ lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
+ li BK,64 // IU1 Index of next cache line
+ vxor V0,V0,V0 // VIU Clear v0
+
+ dcbt SRC,BK // LSU Prefetch next cache line at src+64
+ cmpi cr1,0,D,0 // IU1 Is D0 left justified?
+ vnor V1,V0,V0 // VIU1 Create a vector of all ones
+
+ addi DR,DST,16 // IU1 Address of second dst vector
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+ vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right
+ bge Ld_bytes_rt // b if shifting right (D-S>=0)
+
+ lvx VS0,0,SRC // LSU Get S0 load started
+ addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful)
+
+Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is
+ lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+
+ vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ vxor VPS1,VPS1,VPS1 // VIU Clear VPS1
+
+ vxor VCARF,VCARF,VCARF //VIU1 clear VCARF
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+
+ li BK,96 // IU1 Index of next cache line
+ cmpi cr5,0,Rt,0xF // IU1 Is DN right justified?
+ subf Rt,DST,DR // IU1 How many bytes in first destination?
+
+ mtctr QW // IU2
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+ mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+ dcbt SRC,BK // LSU Prefetch next cache line at src+96
+ beq cr1,Left_just // b if D0 is left justified
+
+ li BK,0 // IU1 Initialize byte kount index
+ vsel VPS0,VPS0,V0,VM // VIU1 Select zeroes left | S0 bytes right
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx VPS0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx VPS0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx VPS0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx VPS0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM
+ li BK,16 // IU1 Re-initialize byte kount index
+
+QW_fwd_loop:
+ lvx VS1,SRC,BK // LSU Get S2 (or S1)
+ vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries)
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4)
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_ld_fwd: // Next 16 bytes is the last; we're done.
+ add DBC,DST,BC // IU1 Recompute address of last dst byte + 1
+ add SBC,SRC,BC // IU1 Recompute address of last src byte + 1
+ bge No_ld_fwd // b if shifting right (D-S>=0)
+
+ addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src
+No_ld_fwd:
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+ addi Rt,SBC,-1 // IU1 Recompute address of last src byte
+
+ lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN)
+ vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries)
+
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14
+ beq cr5,Rt_just_fwd // b if last destination is right justified
+ vsel VPS0,VPS0,V0,VMM // VIU1 Select src bytes left | zeroes right
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li D,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 1 of two or three
+ addi D,D,4 // IU1 increment index
+
+ stvewx VPS0,DBK,D // LSU store word 2 of two or three
+ addi D,D,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
+ addi D,D,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx VPS0,DBK,D // LSU store one halfword if necessary
+ addi D,D,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx VPS0,DBK,D // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+ vaddcuw VCAR1,VPS0,VPS1 //VIU1 add data and store carries
+
+ vadduwm VTEMP,VPS0,VPS1 //VIU1 add data (no carries)
+
+ vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum
+
+ vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds
+ vmrglh VL,V0,VSUM // VPU separate low shorts of sum
+
+ vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF
+ vmrghh VH,V0,VSUM //VPU separate high shorts of sum
+ rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW
+
+ vsumsws VCARS,VCARF,V0 //VIU2 sum all carries
+ vadduwm VSUM,VL,VH //VIU1 add low and high data
+ li BK,-16 // IU1 Index 0x10 less than SP
+
+ vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries
+
+ stvx VFIN,DBK,BK // LSU Store partial checksum from VR
+
+ lwz SRC,-4(DBK) // LSU Load partial checksum to GPR
+
+ addc SRC,SRC,SUM
+
+ addze SRC,SRC
+
+ blr // Return destination address from entry
+
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ lvx VS1,SRC,BK // LSU Get S3 (or S2)
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries)
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ addi Rt,QW,-2 // IU1 Ensure at least 2 QW left after big loop
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+// At this point next store will be to even address.
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+ addi SP8,SRC,96 // IU1 Starting address for dcbt
+ addi BL,BK,16 // IU1 Create an alternate byte kount + 32
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+
+ bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ lvx VS1,SRC,BK // LSU Get S4
+ addi SP8,SP8,32 // IU1 Next starting address for dcbt
+ vaddcuw VCAR1,VPS0,VPS1 // VIU1 add data and store carries
+
+ lvx VS2,SRC,BL // LSU Get S5
+ vadduwm VTEMP,VPS0,VPS1 // VIU1 add data (no carries)
+
+ dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead
+ vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries
+
+ DCBK // LSU Kill instead of RWITM
+ vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum
+ vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11
+
+ stvx VPS1,DST,BK // LSU Store 16 bytes at D11
+ vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12
+ vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx VPS0,DST,BL // LSU Store 16 bytes at D12
+ vor VS0,VS2,VS2 // VIU1 Move S13 to S11
+ vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF
+
+ addi BK,BL,16 // IU1 Increment byte count
+ addi BL,BK,16 // IU1 Increment alternate byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+
+ b Last_ld_fwd // b if last store is to even address
+
+// Come here with two more loads and two stores to do
+One_even_QW:
+ lvx VS1,SRC,BK // LSU Get S6 (or S5 if D-S>=0)
+ vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries)
+
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+ b Last_ld_fwd
+
+// End of vec_csum_partial_copy_generic in AltiVec
+
+// Register usage modified from above
+// Don't use vectors for BC <= MIN_VEC_CS. Works only if MIN_VEC_CS >= 32 bytes.
+#define MIN_VEC_CS 48 // Chosen experimentally on MPC7455@1GHz/133MHz bus
+#undef DST // not used here
+#undef BC
+#define BC r4 // entering: Byte_Count
+
+#undef SUM
+#define SUM r5 // entering: Partial checksum
+
+#if __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+#ifdef LIBMOTOVEC
+ .global csum_partial
+csum_partial:
+#else
+ .global vec_csum_partial
+vec_csum_partial:
+#endif
+ li BK,32 // IU1
+ rlwinm Rt,BC,31,1,31 // IU1 BC/2
+ cmpi cr7,0,BC,MIN_VEC_CS // IU1 Check for minimum byte count
+
+ dcbt SRC,BK // LSU prefetch next cacheline
+ cmpi cr6,0,Rt,0 // IU1 BC/2 == 0?
+ addic SUM,SUM,0 // IU1 Zero carry bit
+
+ addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src
+ add DBC,SRC,BC // IU1 Compute address of last src byte + 1
+ bgt cr7,v_csum // b if BC>MIN_VEC_CS
+ andi. BK,BC,1 // IU1 BC[31]==0?
+
+ beq cr6,No_HWs_cs // b if BC/2==0
+ mtctr Rt // i=BC/2; do ...;i--; while (i>0)
+HW_cs:
+ lhzu Rt,2(SM2) // LSU
+
+ addc SUM,SUM,Rt // IU1
+ bdnz HW_cs
+No_HWs_cs:
+ beq BC_even_cs // b if BC[31]==0 (or DBC[31]==0 when aligned)
+ lbz Rt,-1(DBC) // LSU Get last src address byte
+
+ rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left
+
+ addc SUM,SUM,Rt // IU1
+BC_even_cs:
+ addze SRC,SUM
+ blr
+
+v_csum:
+ lvsr VP2,0,SRC // LSU Permute vector for initial byte mask
+ addi DR,SRC,16 // IU1 Address of second src vector
+ li BK,64 // IU1 Index of next cache line
+
+ lvsr VP4,SRC,BC // LSU Permute vector for final byte mask
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last src byte
+
+ lvx VS0,0,SRC // LSU Get S0 load started
+ subf QW,DR,DBK // IU1 Bytes of full vectors to test (-16)
+ vxor V0,V0,V0 // VIU Clear v0
+
+ dcbt SRC,BK // LSU Prefetch next cache line at src+64
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ vnor V1,V0,V0 // VIU1 Create a vector of all ones
+
+ mtctr QW // IU2
+ vxor VCARF,VCARF,VCARF //VIU1 clear VCARF
+ vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right
+
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+ vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM
+ vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right
+
+ li BK,16 // IU1 Initialize byte kount index
+ vsel VS0,VS0,V0,VM // VIU1 Select zeroes left | S0 bytes right
+vp_fwd_loop:
+ lvx VS1,SRC,BK // LSU Get S1
+ vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries)
+
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+ addi BK,BK,16 // IU1 Increment byte kount index
+
+ vor VS0,VS1,VS1 // VIU1 Swap vectors for next loop
+ bdnzf 25,vp_fwd_loop // b if 4 or less quad words to do
+
+ add DNX,SRC,BK // IU1 address of next load (SRC+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW summed by now
+ bgt cr6,GT_4QW_cs // b if >4 quad words left
+ vxor VS1,VS1,VS1 // VIU1 Zero before adding below
+
+// Next 16 bytes is the last; we're done.
+Last_ld_cs:
+ lvx VS2,0,DBK // LSU Get last source (guaranteed SN)
+ vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries
+ rlwinm DBK,DBK,0,28,31 // IU1 (dst + BC -1)[28:31]
+
+ vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries)
+ cmpi cr7,0,DBK,0xF // IU1 Is last byte right justified?
+
+ vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum
+
+ vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds
+
+ vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF
+
+ beq cr7, Rt_just // b if right justified.
+ vsel VS2,VS2,V0,VMM // VIU1 Select src bytes left | zeroes right
+
+Rt_just:
+ vaddcuw VCAR1,VS2,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VS2,VSUM // VIU1 data + previous sum (no carries)
+
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+ vmrglh VL,V0,VSUM // VPU separate low shorts of sum
+
+ vmrghh VH,V0,VSUM //VPU separate high shorts of sum
+ rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW
+
+ vsumsws VCARS,VCARF,V0 //VIU2 sum all carries
+ vadduwm VSUM,VL,VH //VIU1 add low and high data
+ li BK,-16 // IU1 Index 0x10 less than SP
+
+ vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries
+
+ stvx VFIN,DBK,BK // LSU Store partial checksum from VR
+
+ lwz SRC,-4(DBK) // LSU Load partial checksum to GPR
+
+ addc SRC,SRC,SUM
+
+ addze SRC,SRC
+
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+GT_4QW_cs: // Do once if nxt ld is from odd half of cache line, else twice
+
+ lvx VS1,SRC,BK // LSU Get S3 (or S2)
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries)
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ addi Rt,QW,-2 // IU1 Ensure at least 2 QW left after big loop
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF
+ bdnzf 27,GT_4QW_cs // b if next store is to lower (even) half of CL
+// At this point next store will be to even address.
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last load)[27]==1)?1:0; (odd?)
+ addi SP8,SRC,96 // IU1 Starting address for dcbt
+ vxor VS1,VS1,VS1 // VIU1 Zero before adding below
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+
+ bns cr6,B32_cs // b if DST[27] == 0; i.e., final load is even
+
+ bdnz B32_cs // decrement counter for last QW load odd
+
+B32_cs: // Should be at least 2 loads remaining and next 2 are cache aligned
+ lvx VS2,SRC,BK // LSU Get S4
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries
+
+ lvx VS3,SRC,BK // LSU Get S5
+ addi SP8,SP8,32 // IU1 Next starting address for dcbt
+ vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries)
+
+ dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead
+ addi BK,BK,16 // IU1 Increment byte count
+ vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum
+
+ vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds
+ bdz Nxt_loc_cs // always decrement and branch to next instr
+
+Nxt_loc_cs:
+ vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF
+
+ vor VS0,VS2,VS2 // VIU1 Move S13 to S11
+
+ vor VS1,VS3,VS3 // VIU1 Move upper vector to lower
+ bdnz B32_cs // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW_cs // b if there is one even and one odd QW to store
+
+ b Last_ld_cs // b if last store is to even address
+
+// Come here with two more loads and two stores to do
+One_even_QW_cs:
+ lvx VS2,SRC,BK // LSU Get S6 (or S5 if D-S>=0)
+ addi BK,BK,16 // IU1 Increment byte count
+ vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries
+
+ vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries)
+
+ vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries
+
+ vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum
+
+ vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds
+
+ vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF
+
+ vxor VS1,VS1,VS1 // VIU1 Zero before next add
+
+ vor VS0,VS2,VS2 // VIU1 Move S13 to S11
+ b Last_ld_cs
+
+// End of vec_csum_partial in AltiVec
\ No newline at end of file
diff --git a/liboil/motovec/vec_memcmp.S b/liboil/motovec/vec_memcmp.S
new file mode 100644
index 0000000..d0117fa
--- /dev/null
+++ b/liboil/motovec/vec_memcmp.S
@@ -0,0 +1,340 @@
+//#define __MWERKS__
+//------------------------------------------------------------------
+// file: vec_memcmp.S
+// AltiVec enabled version of memcmp
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2003
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern int vec_memcmp(const void *ptr1, const void *ptr2, size_t len);
+// Returns:
+// value < 0 if ptr1[0:len] < ptr2[0:len]
+// value = 0 if ptr1[0:len] == ptr2[0:len]
+// value > 0 if ptr1[0:len] > ptr2[0:len]
+//------------------------------------------------------------------
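+// Equivalent scalar sketch of the contract above (illustration only;
+// the helper name is ours and only the sign of the result matters):
+//
+//   int memcmp_ref(const unsigned char *p1, const unsigned char *p2,
+//                  size_t len)
+//   {
+//       for (size_t i = 0; i < len; i++)
+//           if (p1[i] != p2[i])
+//               return (p1[i] < p2[i]) ? -1 : 1;
+//       return 0;
+//   }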
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 05/27/03
+
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+
+ // Macros for bits in CR6
+#define _all 6*4+0
+#define _none 6*4+2
+ // Macros for condition to be true/false and unlikely/likely to be taken
+#define _F_u 4
+#define _T_u 12
+#define _T_l 13
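+// Example: "bc _F_u,_all,label" decodes as "branch if CR6 bit 0 (set by
+// vcmpequb. when all byte pairs are equal) is false", i.e. it is taken
+// as soon as any byte pair differs.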
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define PT1 r3 // entering: ptr1; exiting: return value
+
+#define SRC r4 // entering: ptr2; then ptr2+16 if ptr1[28:31]<ptr2[28:31]
+
+#define BC r5 // entering: Byte_Count
+#define BCM1 r5 // then Byte_Count -1
+
+#define DMS r6 // ptr1 - ptr2 initially
+#define S2 r6 // ptr2 bytes initially
+
+// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )"
+// if you don't put the comment right after the r7. CJC 030314
+#define SM1 r7// ptr2 -1 for byte-by-byte forwards initially
+#define S r7 // ptr2[28:31]
+#define BK r7 // byte index
+
+#define DM1 r8// ptr1 -1 for byte-by-byte forwards initially
+#define D r8 // ptr1[28:31]
+
+#define PBC r9 // ptr1 + byte count initially
+
+#define S1 r10 // ptr1 bytes initially
+#define PBK r10 // (ptr1+byte_count-1)
+
+#define DR r11 // (ptr1+16)[0:27]
+#define QW r11 // number of quad words (vectors)
+
+#define RSV r12 // storage for VRSAVE register if used
+
+#define VS1 v0 // source 1 as a vector of 16 bytes
+
+#define VS2 v1 // source 2 as a vector of 16 bytes
+
+#define VS2b v2 // second source 2 vector for permuting
+#define VS12B v2 // octet shift count of 12
+#define VMB3 v2 // mismatch shifted right 3 bytes
+
+#define VP1 v3 // source 1 permute register
+#define VSH16 v3 // octet shift count of 16 bits/2 octets
+#define VMW3 v3 // mismatch shifted right 3 words
+
+#define VS1B v4 // octet shift count of 1
+
+#define V0 v5 // all zeros
+
+#define VP2 v6 // source 2 permute register
+#define VS4B v6 // octet shift count of 4
+#define VMB1 v6 // mismatch shifted right one byte
+
+#define VSH1 v7 // shift count of 1 bit
+#define VS8B v7 // octet shift count of 8 octets
+#define VMB2 v7 // mismatch shifted right 2 bytes
+
+#define V1 v8 // all ones
+#define VCE v8 // equality compare destination register
+
+#define VS2a v9 // first source 2 vector for permuting
+#define VMW1 v9 // mismatch shifted right one word
+
+#define VP3 v10 // ptr1-ptr2 permute register
+#define VMW2 v10 // mismatch shifted right 2 words
+
+#define VM v11 // mask for right end of 1st S1 vector
+
+#define VP4 v12 // last mask permute vector
+#define VLM v12 // last mask register
+
+#define VMM v13 // vector of zeroes with ones at mismatch(es) and DN
+
+// Condition register use
+// cr0[0:2] = (ptr1-ptr2==0)? return
+// then cr0[0:2] = (ptr1[28:31]-ptr2[28:31]<0)? "Need more S2?";
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (QW == 0)? 1 : 0; (Any full vectors to move?)
+// cr5[2] = ((PBK = PT1+BC)[28:31] = 0)? 1 : 0; (S1N right justified)
+// cr6[0] = (S1 == S2)?1:0; (By vector)
+// then cr6[2] = (S2 > S1)? 1 : 0; (At mismatched byte)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global memcmp
+memcmp:
+#else
+ .global vec_memcmp
+vec_memcmp:
+#endif
+ subf. DMS,SRC,PT1 // IU1 Compute ptr1-ptr2 difference
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ add PBC,PT1,BC // IU1 Address of last byte + 1
+ addi SM1,SRC,-1 // IU1 Pre-bias and duplicate ptr2
+ addi DR,PT1,16 // IU1 Duplicate s1 pointer
+ beq Dumb_exit // return if PT1 = SRC
+
+ addi PBK,PBC,-1 // IU1 Address of last ptr1 byte
+ addi DM1,PT1,-1 // IU1 Pre-bias and duplicate ptr1
+ rlwinm DR,DR,0,0,27 // IU1 (PT1+16)[0:27]
+ beq cr1,Dumb_exit // return if BC = 0
+
+ subf QW,DR,PBK // IU1 Bytes of full vectors to move (-16)
+ rlwinm PBC,PBC,0,28,31 // IU1 (PT1+BC)[28:31]
+ bgt cr7,v_memcmp // do as vectors if BC>MIN_VEC
+
+// Compare byte-by-byte if BC<=MIN_VEC
+ mtctr BC // i=BC; do if...;i--; while (i>0)
+Cmp_nxt_byte:
+ lbzu S2,1(SM1) // LSU
+ lbzu S1,1(DM1) // LSU
+ subf. PT1,S2,S1 // IU1 if (*s1++ == *s2++)
+ bdnzt 2,Cmp_nxt_byte // b while equal and bytes left
+ blr
+Dumb_exit:
+ xor PT1,PT1,PT1 // IU1 return zero
+ blr
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+v_memcmp:
+// Byte count < MIN_VEC bytes will have been compared by scalar code above,
+// so this will not deal with small block compares < MIN_VEC.
+
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ rlwinm S,SRC,0,28,31 // IU1 Save ptr2 address bits s[28:31]
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xfff8 // IU1 Or in registers used by this routine
+#endif
+ rlwinm D,PT1,0,28,31 // IU1 D = ptr1[28:31]
+ cmpi cr1,0,QW,0 // IU1 Any full vectors to move?
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ lvxl VS1,0,PT1 // LSU Get source1 load started (load as LRU)
+ subf. S,S,D // IU1 Is s2 longer than s1? (28:31 greater?)
+ li BK,16 // IU1 Byte index pointer
+
+ lvxl VS2,0,SRC // LSU Get source2 load started (load as LRU)
+ vor VS2b,VS2,VS2 // VIU1 Preset second s2 vector if not loaded
+ addi BCM1,BC,-1 // IU1 Index to last s2 byte
+// Decide if second vector of S2 is needed to compare to first vector of S1
+ bge Around // b if initial S1 is shorter than or equal S2
+
+ lvxl VS2b,SRC,BK // LSU Otherwise, we need more of s2
+ addi SRC,SRC,16 // IU1 Increment s2 pointer
+ addi BCM1,BCM1,-16 // IU1 Correction for last byte
+Around:
+
+ lvsl VP1,0,PT1 // LSU Set permute vector for s1 shift left
+ vspltisb VS1B,8 // VPU Create a shift count for 1 octet/8 bits
+ vxor V0,V0,V0 // VIU1 Create a vector of all zeroes
+
+ lvsl VP2,0,SRC // LSU Set permute vector for s2 shift left
+ vspltisb VSH1,1 // VPU Create a shift count of 1 bit
+ vnor V1,V0,V0 // VIU1 Create a vector of all ones
+
+ lvsr VP3,0,DMS // LSU Set permute vector for S2-S1 difference
+ cmpi cr5,0,PBC,0 // IU1 Will last byte of S2 be rt justified?
+ vperm VM,V0,V1,VP1 // VPU Mask as long as our subset of 1.
+
+
+ lvsr VP4,0,PBC // VIU1 Permute vector for bytes rt of end
+// Dealing with first S1 Vector - Permute S1 and S2 (possibly + S2b) to left edge
+ vperm VS1,VS1,V1,VP1 // VPU Left align s1 with ones as pad
+
+ vperm VS2,VS2,VS2b,VP2 // VPU Left align s2 and s2+
+
+ vslb VSH16,VS1B,VSH1 // VPU Shift count for 16 bits/2 octets
+ vor VS2,VS2,VM // VIU1 s2 now has identical ones padding to s1
+
+ vslb VS4B,VSH16,VSH1 // VPU Create a shift count for 4 octets
+ vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 = s2?
+
+ vslb VS8B,VS4B,VSH1 // VPU Create a shift count for 8 octets
+ vnor VMM,VCE,VCE // VIU1 Not equals become ones
+ bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2
+
+ ble cr1,Last_ld // b if there are no QW to do
+ mtctr QW // IU2 i=QW; do ...; while (i-- > 0)
+
+// Dealing with middle vectors
+memcmp_NA_next_v:
+ lvxl VS2a,SRC,BK // LSU Get next 16 bytes of s2
+
+ lvxl VS1,PT1,BK // LSU Get next 16 bytes of s1
+ addi BK,BK,16 // IU1 Increment byte index
+
+ vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2
+ vor VS2b,VS2a,VS2a // VIU1 Move upper vector to lower
+
+ vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ?
+
+ vnor VMM,VCE,VCE // VIU1 Not equals become ones
+ bdnzt 24,memcmp_NA_next_v // b if more whole QWs to do and s1==s2
+
+ bc _F_u,_all,memcmp_final_v_NE // b if s1 != s2
+
+// Dealing with last vector
+Last_ld:
+ lvxl VS2a,SRC,BCM1 // LSU Last load of s2 (perhaps redundant)
+ vperm VLM,V0,V1,VP4 // VPU Ones mask for bytes rt of end
+
+ lvxl VS1,PT1,BK // LSU Last load of s1
+
+ vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2
+ beq cr5,Rt_just // b if final S1 byte is rt justified
+
+ vor VS2,VS2,VLM // VIU1 Set uninvolved bytes at end
+
+ vor VS1,VS1,VLM // VIU1 Set bytes at end of s1
+Rt_just:
+ vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ?
+
+ vnor VMM,VCE,VCE // VIU1 Not equals become ones
+ bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2
+
+ xor PT1,PT1,PT1 // IU1 Will return zero if strings are equal
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return 0 if s1 == s2
+
+memcmp_final_v_NE:
+ // s1 != s2, We're going to create a mask to mask off everything to
+ // the right of the first mismatching byte so we know we are just
+ // looking at the string up to the mismatch.
+
+ vsum4ubs VS12B,VS1B,VS8B // VIU2 Create a shift count for 12 octets
+
+ vsro VMW1,VMM,VS4B // VPU Shift the compare result one word right
+ vsrw VMB1,VMM,VS1B // VIU1 Shift compare result 8 bits right
+
+ vsro VMW2,VMM,VS8B // VPU Shift the compare result 2 words right
+ vsrw VMB2,VMM,VSH16 // VIU1 Shift compare result 16 bits right
+
+ vsro VMW3,VMM,VS12B // VPU Shift the compare result 3 words right
+ vor VM,VMW1,VMW2 // VIU1 Mask of words one and 2 to the right
+
+ vsro VMB3,VMB2,VS1B // VPU Shift compare result 3 bytes right
+ vor VM,VM,VMW3 // VIU1 Mask of MM 1,2,&3 words to the right
+
+ vcmpgtuw VM,VM,V0 // VIU1 Mask of all ones in words to the right
+
+ vor VM,VM,VMB1 // VIU1 Or in first byte to right
+
+ vor VM,VM,VMB2 // VIU1 Or in second byte to right
+
+ vor VM,VM,VMB3 // VIU1 Or in third byte to right
+
+ vor VS2,VS2,VM // VIU1 Set bytes right of mismatch
+
+ vor VS1,VS1,VM // VIU1 Set bytes right of mismatch
+ li r3,-1 // IU1 Return -1 if s1 < s2
+
+ vcmpgtub. VCE,VS2,VS1 // VIU1 Compute s2 > s1 for all bytes
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ bclr _F_u,_none // s1 < s2 in first byte with a mismatch
+
+S2_lt_S1: li r3,1 // IU1 Return +1 if s1 > s2
+ blr // s1 > s2 in first byte with a mismatch
+
+// End of memcmp in AltiVec
+
diff --git a/liboil/motovec/vec_memcpy.S b/liboil/motovec/vec_memcpy.S
new file mode 100644
index 0000000..f280393
--- /dev/null
+++ b/liboil/motovec/vec_memcpy.S
@@ -0,0 +1,876 @@
+//------------------------------------------------------------------
+// file: vec_memcpy.S
+// AltiVec enabled version of memcpy and bcopy
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2003
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void * memcpy(void *dst, const void *src, size_t len);
+// Returns:
+// void *dst
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void * memmove( void *dst, const void *src, size_t len );
+// Copies len characters from src to dst and returns the value of
+// dst. Works correctly for overlapping memory regions.
+// - Harbison&Steele 4th ed (corrected as to return)
+// Returns:
+// void *dst
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void * bcopy(const void *src, void *dst, size_t len);
+// Returns:
+// void *dst
+//------------------------------------------------------------------
+
+// memcpy and memmove are combined into one entry point here because of
+// the similarity of operation and need to create fool-proof code.
+// The following conditions determine what is "fool proof":
+//
+// if: then single entry:
+// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy
+// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy
+// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte
+// (DST-SRC)==0 || BC==0 will just return
+// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte
+// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove
+// (DST-SRC)>0 && (DST-SRC)>=BC && BC>MIN_VEC will b to v_memmove
+
+// If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
+// this code will branch to v_memcpy anyway for maximum performance.
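+// In C terms the dispatch described above is roughly (sketch only;
+// helper names are ours and stand for the code paths below):
+//
+//   void *memcpy_or_memmove(void *dst, const void *src, size_t bc)
+//   {
+//       ptrdiff_t d = (char *)dst - (const char *)src;
+//       if (d == 0 || bc == 0)
+//           return dst;                      // nothing to do
+//       if (bc <= MIN_VEC)                   // small: byte-by-byte
+//           return d < 0 ? byte_copy_fwd(dst, src, bc)
+//                        : byte_copy_bkwd(dst, src, bc);
+//       if (d < 0)
+//           return v_memcpy(dst, src, bc);   // forward vector copy
+//       return v_memmove(dst, src, bc);      // backward-safe vector copy
+//   }
+//
+// (v_memmove itself branches back to the forward-copy path when
+// DST-SRC>=BC, as noted above.)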
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 02/03/03
+// Can still add dst, 128B loop, and aligned option
+// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03
+// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03
+// (Creating separate path for QW aligned didn't help much)
+// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03
+// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03
+// Rev 0.21 Improved loop branch targets for v_memcpy CJC 03/01/03
+// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03
+// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03
+// Rev 0.24 Use two dst streams CJC 03/12/03
+// Rev 0.25 Fix for all compilers, cleanup, and release with
+// libmotovec.a rev 0.10 CJC 03/14/03
+// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03
+//
+// Between Rev 0.25 and 0.30 the code was revised to store elements of
+// source at destination when first and/or last vector are less than 16
+// bytes. A reviewer at SNDF observed that loading the destination vector
+// for merging exposed the "uninvolved" destination bytes to incoherency
+// if an interrupt pre-empted this routine and modified the "uninvolved"
+// destination vector(s) while held in register for merging. It seems
+// like a low possibility but this revision is no longer subject to that
+// possibility. (It is also slightly faster than Rev 0.25.)
+// This is beta quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
+#define MIN_OVL 128
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dst pointer; exiting: same dst pointer
+
+#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove
+
+#define BC r5 // entering: Byte_Count
+
+#define PCS r6 // save for partial checksum entering
+
+#define DMS r7 // dst - src initially
+#define BK r7 // BC - 1 +/- (n*16)
+
+// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )"
+// if you don't put the comment right after the r7. CJC 030314
+#define SM1 r8// src -1 for byte-by-byte forwards initially
+#define S r8 // src[28:31]
+#define SMD r8 // src[0:27]-dst[0:27]
+#define STR r8 // data stream touch block & stride info for Big_loop
+
+#define DM1 r9// dst -1 for byte-by-byte forwards initially
+#define D r9 // dst[28:31]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define SBC r10// src + byte count initially then src[28:31]
+#define BLK r10 // temporary data stream touch block & stride info
+#define DR r10 // (dst+16)[0:27]
+#define QW r10 // number of quad words (vectors)
+
+#define DBC r11// dst + byte count initially
+#define BLL r11 // temporary data stream touch block & stride info
+#define SBK r11 // (src+byte_count-1)
+#define SBR r11 // (src+byte_count-1)[0:27]
+#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+#define BIG r11 // QW/8 or 128 byte loop count
+#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call
+
+#define RSV r12 // storage for VRSAVE register if used
+
+#define VS0 v0 // src vector for permuting
+
+#define VS1 v1 // src vector for permuting
+
+#define VP3 v2 // d - s permute register
+
+#define VPS0 v3 // permuted source vector to store
+
+#define VPS1 v4 // 2nd permuted source vector to store
+
+#define VPS2 v5 // additional permuted src in Big loop
+
+#define VS2 v6 // src vector for permuting
+#define VPS3 v6 // additional permuted src in Big loop
+
+#define VS3 v7 // additional src load in Big loop
+#define VPS4 v7 // additional permuted src in Big loop
+
+#define VS4 v8 // additional src load in Big loop
+#define VPS5 v8 // additional permuted src in Big loop
+
+#define VS5 v9 // additional src load in Big loop
+#define VPS6 v9 // additional permuted src in Big loop
+
+#define VS6 v10 // additional src load in Big loop
+#define VPS7 v10 // additional permuted src in Big loop
+
+#define VS7 v11 // additional src load in Big loop
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c033dec
+// dcba r3,r7 or dcba DST,BK
+#define DCBL .long 0x7c034dec
+// dcba r3,r9 or dcba DST,BL
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c033dec
+.endm
+.macro DCBL
+.long 0x7c034dec
+.endm
+#else
+#define DCBK dcba DST,BK
+#define DCBL dcba DST,BL
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#define DCBL nop
+#endif // NO_DCBA
+
+// Conditionalize the use of dst (data stream touch). It will help
+// if the data is not in cache and hurt if it is (though not as badly
+// as dcbz). Generally, except for small benchmarks repeated many times,
+// we assume data is not in cache (data streaming) and using dst is a
+// performance boost.
+#ifndef NO_DST
+#define STRM_B dst SBC,BLL,0
+#define STRM_F dst SRC,BLK,0
+#define STRM_1 dst SP8,STR,1
+
+#else
+#define STRM_B nop
+#define STRM_F nop
+#define STRM_1 nop
+#endif
+
+// Condition register use
+// cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
+// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
+// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
+// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?)
+// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?)
+// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?)
+// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
+// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
+// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
+// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?)
+// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global memmove
+memmove:
+ nop // IU1 Compilers forget first label
+ .global memcpy
+memcpy:
+#else
+ .global vec_memmove
+vec_memmove:
+ nop // IU1 Only way I know to preserve both labels
+ .global vec_memcpy
+vec_memcpy:
+#endif
+ subf. DMS,SRC,DST // IU1 Compute dst-src difference
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd
+ beqlr // return if DST = SRC
+
+ add DBC,DST,BC // IU1 Pre-bias and duplicate destination
+ subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0
+ beqlr cr1 // return if BC = 0
+
+ bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward)
+ cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy
+ bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd)
+
+// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC
+ mtctr BC // i=BC; do ...;i--; while (i>0)
+Byte_cpy_fwd:
+ lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1)
+ stbu Rt,1(DM1) // LSU
+ bdnz Byte_cpy_fwd
+
+ blr
+ nop // IU1 Improve next label as branch target
+Cpy_bkwd:
+ cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy
+ cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
+ bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd)
+// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
+ mtctr BC // i=BC; do ...;i--; while (i>0)
+Byte_cpy_bwd:
+ lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC)
+ stbu Rt,-1(DBC) // LSU Store it
+ bdnz Byte_cpy_bwd
+ blr
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+
+v_memmove:
+// Byte count < MIN_VEC bytes will have been copied by scalar code above,
+// so this will not deal with small block moves < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
+ rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
+ bge cr6,MC_entry // b to v_memcpy if DST-SRC>=BC (fwd copy OK)
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
+#endif
+ lis BLL,0x010c // IU1 Stream 12 blocks of 16 bytes
+ subf. SMD,D,S // IU1 if S-D<0 essentially shifting right
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
+ ori BLL,BLL,0xffe0 // IU1 Stream stride -32B
+
+ STRM_B // LSU Start data stream at SRC+BC
+ addi SBK,SBC,-1 // IU1 Address of last src byte
+ bgt Rt_shft // Bytes from upper vector = (s-d>0)?s-d:16+s-d;
+ addi SMD,SMD,16 // IU1 Save 16-(d-s)
+Rt_shft:
+
+ rlwinm SBR,SBK,0,0,27 // IU1 (SRC+BC-1)[0:27]
+ addi BK,BC,-1 // IU1 Initialize byte index
+
+ subf Rt,SBR,SBC // IU1 How many bytes in first source?
+ add DBK,DST,BK // IU1 Address of last dst byte
+ addi DR,DST,16 // IU1 Address of second dst vector
+
+ subf. SMD,Rt,SMD // IU1 if bytes in 1st src>Bytes in 1st permute
+ rlwinm Rt,DBK,0,28,31 // IU1 (DST+BC-1)[28:31]
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+
+// If there are more useful bytes in the upper vector of a permute pair than we
+// will get in the first permute, the first loaded vector needs to be in the
+// lower half of the permute pair. The upper half is a don't care then.
+ blt Get_bytes_rt // b if shifting left (D-S>=0)
+
+ lvx VS1,SRC,BK // LSU Get SN load started
+// Comments numbering source and destination assume single path through the
+// code executing each instruction once. For vec_memmove, an example would
+// be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.
+ addi SRC,SRC,-16 // IU1 Decrement src base (to keep BK useful)
+
+Get_bytes_rt: // Come here to get VS0 & Don't care what VS1 is
+ lvx VS0,SRC,BK // LSU Get SN-1 (SN if D-S<0) in lower vector
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr7,0,Rt,0xF // IU1 Is Dn right justified?
+
+ cmpi cr1,0,D,0 // IU1 Is D0 left justified?
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ add Rt,DST,BC // IU1 Refresh the value of DST+BC
+
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+ vperm VPS0,VS0,VS1,VP3 // VPU Align SN-1 and SN to DN
+ vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
+ beq cr7,Rt_just // b if DN is right justified
+
+ mtcrf 0x01,Rt // IU2 Put final vector byte count in cr7
+ rlwinm DBK,DBK,0,0,27 // IU1 Address of first byte of final vector
+ li D,0 // IU1 Initialize an index pointer
+ bnl cr7,Only_1W_bkwd // b if there was only one or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 1 of two or three
+ addi D,D,4 // IU1 increment index
+
+ stvewx VPS0,DBK,D // LSU store word 2 of two or three
+ addi D,D,4 // IU1 increment index
+Only_1W_bkwd:
+ bng cr7,Only_2W_bkwd // b if there were only two or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
+ addi D,D,4 // IU1 increment index
+Only_2W_bkwd:
+ bne cr7,Only_B_bkwd // b if there are no half words to store
+
+ stvehx VPS0,DBK,D // LSU store one halfword if necessary
+ addi D,D,2 // IU1 increment index
+Only_B_bkwd:
+ bns cr7,All_done_bkwd // b if there are no bytes to store
+
+ stvebx VPS0,DBK,D // LSU store one byte if necessary
+ b All_done_bkwd
+
+Rt_just:
+ stvx VPS0,DST,BK // LSU Store 16 bytes at DN
+All_done_bkwd:
+ addi BK,BK,-16 // IU1 Decrement destination byte count
+
+ ble cr6,Last_load // b if no Quad words to do
+ mtctr QW // IU2 for (i=0;i<=QW;i++) - execution serializing
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+QW_loop:
+ lvx VS0,SRC,BK // LSU Get SN-2 (or SN-1 if ADJ==0)
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align SN-2 and SN-1 to DN-1
+ vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at DN-1
+ addi BK,BK,-16 // IU1 Decrement byte kount
+ bdnzf 25,QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+BC-1-16)
+ bgt cr6,GT_4QW // b if >4 quad words left
+
+Last_load: // if D-S>=0, next load will be from same address as last
+ blt No_ld_bkwd // b if shifting right (S-D>=0)
+ addi SRC,SRC,16 // IU1 recorrect source if it was decremented
+No_ld_bkwd:
+ lvx VS0,0,SRC // LSU Get last source SN-6 (guaranteed S0)
+// Current 16 bytes is the last; we're done.
+ dss 0 // Data stream stop
+ vperm VPS0,VS0,VS1,VP3 // VPU Align SN-6 and SN-5 to DN-6
+ subfic D,DST,16 // IU1 How many bytes in first destination?
+ beq cr1,Lt_just // b if last destination is left justified
+
+ mtcrf 0x01,D // IU2 Put byte count remaining in cr7
+ li D,0 // IU1 Initialize index pointer
+ bns cr7,No_B_bkwd // b if only even number of bytes to store
+
+ stvebx VPS0,DST,D // LSU store first byte at DST+0
+ addi D,D,1 // IU1 increment index
+No_B_bkwd:
+ bne cr7,No_H_bkwd // b if only words to store
+ stvehx VPS0,DST,D // LSU store halfword at DST+0/1
+ addi D,D,2 // IU1 increment index
+
+No_H_bkwd:
+ bng cr7,No_W1_bkwd // b if exactly zero or two words to store
+ stvewx VPS0,DST,D // LSU store word 1 of one or three
+ addi D,D,4 // IU1 increment index
+
+No_W1_bkwd:
+ bnl cr7,No_W2_bkwd // b if there was only one word to store
+ stvewx VPS0,DST,D // LSU store word 1 of two or 2 of three
+ addi D,D,4 // IU1 increment index
+
+ stvewx VPS0,DST,D // LSU store word 2 of two or 3 of three
+ b No_W2_bkwd
+
+Lt_just:
+ stvx VPS0,0,DST // LSU Store 16 bytes at final dst addr D0
+No_W2_bkwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+GT_4QW: // Do once if next store is to even half of cache line, else twice
+
+ lvx VS0,SRC,BK // LSU Get SN-3 (or SN-2)
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align SN-3 and SN-2 to Dn-2
+ vor VS1,VS0,VS0 // VIU1 Move lower vector to upper
+ addi DNX,DNX,-16 // IU1 Prepare to update cr6 next loop
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at DN-2
+ vor VS3,VS0,VS0 // VIU Make a copy of lower vector
+ addi BK,BK,-16 // IU1 Decrement byte count by 16
+ bdnzt 27,GT_4QW // b if next store is to upper (odd) half of CL
+// At this point next store will be to even address.
+
+ lis STR,0x102 // IU1 Stream 2 blocks of 16 bytes
+ mtcrf 0x02,DST // IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)
+ addi BL,BK,-16 // IU1 Create an alternate byte count - 16
+
+ ori STR,STR,0xffe0 // IU1 Stream stride -32B
+ addi SP8,SRC,-64 // IU1 Starting address for data stream touch
+ bso cr6,B32_bkwd // b if DST[27] == 1; i.e., final store is odd
+
+ bdnz B32_bkwd // decrement counter for last odd QW store
+B32_bkwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ lvx VS2,SRC,BK // LSU Get SN-4 (or SN-3)
+ addi SP8,SP8,-32 // IU1 Next starting address for data stream touch
+
+ lvx VS1,SRC,BL // LSU Get SN-5 (or SN-4)
+ vperm VPS0,VS2,VS3,VP3 // VPU Align SN-4 and SN-3 to DN-3
+
+ STRM_1 // LSU Stream 64 byte blocks ahead of loads
+
+ DCBL // LSU allocate next cache line
+
+ vperm VPS1,VS1,VS2,VP3 // VPU Align SN-5 and SN-4 to DN-4
+ vor VS3,VS1,VS1 // VIU1 Move SN-5 to SN-3
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at DN-3
+ addi BK,BL,-16 // IU1 Decrement byte count
+ bdz Nxt_loc_bkwd // always decrement and branch to next instr
+
+Nxt_loc_bkwd:
+ stvx VPS1,DST,BL // LSU Store 16 bytes at DN-4
+ addi BL,BK,-16 // IU1 Decrement alternate byte count
+ bdnz B32_bkwd // b if there are at least two more QWs to do
+
+ bns cr6,One_odd_QW // b if there was one more odd QW to store
+ b Last_load
+
+// Come here with two more loads and two stores to do
+One_odd_QW:
+ lvx VS1,SRC,BK // LSU Get SN-6 (or SN-5)
+
+ vperm VPS1,VS1,VS3,VP3 // VPU Align SN-6 and SN-5 to DN-5
+
+ stvx VPS1,DST,BK // LSU Store 16 bytes at DN-5
+
+ b Last_load
+
+// End of memmove in AltiVec
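+
+// A minimal C model of the direction choice v_memmove makes above (a sketch
+// for illustration only; copy_fwd/copy_bwd are stand-ins for the vector paths,
+// not functions in this file):
+//
+//   void *memmove_model(void *dst, const void *src, size_t n)
+//   {
+//       if ((size_t)((char *)dst - (const char *)src) >= n)
+//           copy_fwd(dst, src, n);   /* forward copy cannot clobber unread src */
+//       else
+//           copy_bwd(dst, src, n);   /* dst overlaps above src: copy backward  */
+//       return dst;
+//   }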
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+v_memcpy:
+// Byte count < MIN_VEC bytes will have been copied by scalar code above,
+// so this will not deal with small block moves < MIN_VEC.
+
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31]
+ rlwinm D,DST,0,28,31 // IU1 D = dst[28:31]
+
+MC_entry: // enter here from memmove if DST-SRC>=BC; this should be faster
+#ifdef VRSAVE
+ oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine
+#endif
+ lis BLK,0x010c // IU1 Stream 12 blocks of 16 bytes
+
+ subf. S,S,D // IU1 if D-S<0 essentially shifting left
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right
+ ori BLK,BLK,32 // IU1 Stream stride 32B
+
+ STRM_F // LSU Start data stream 0 at SRC
+ addi DR,DST,16 // IU1 Address of second dst vector
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+// If D-S<0 we are "kinda" shifting left with the right shift permute vector
+// loaded to VP3 and we need both S0 and S1 to permute. If D-S>=0 then the
+// first loaded vector needs to be in the upper half of the permute pair and
+// the lower half is a don't care then.
+ bge Ld_bytes_rt // b if shifting right (D-S>=0)
+
+ lvx VS0,0,SRC // LSU Get S0 load started
+// Comments numbering source and destination assume single path through the
+// code executing each instruction once. For vec_memcpy, an example would
+// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
+ addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful)
+
+Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is
+ lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ cmpi cr1,0,D,0 // IU1 Is D0 left justified?
+
+ subf Rt,DST,DR // IU1 How many bytes in first destination?
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ li BK,0 // IU1 Initialize byte kount index
+
+ mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0
+
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+ beq cr1,Left_just // b if D0 is left justified
+
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx VPS0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx VPS0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx VPS0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx VPS0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ cmpi cr7,0,QW,14 // IU1 Check QW>14
+ ble cr6,Last_ld_fwd // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+QW_fwd_loop:
+ lvx VS1,SRC,BK // LSU Get S2 (or S1)
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4)
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_ld_fwd: // Next 16 bytes is the last; we're done.
+ add DBC,DST,BC // IU1 Recompute address of last dst byte + 1
+ add SBC,SRC,BC // IU1 Recompute address of last src byte + 1
+ bge No_ld_fwd // b if shifting right (D-S>=0)
+
+ addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src
+No_ld_fwd:
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+ addi DBK,DBC,-1 // IU1 Recompute address of last dst byte
+ addi Rt,SBC,-1 // IU1 Recompute address of last src byte
+
+// If D-S<0 we have already loaded all the source vectors.
+// If D-S>=0 then the first loaded vector went to the upper half of the permute
+// pair and we need one more vector. (This may be a duplicate.)
+
+ lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN)
+
+#ifndef NO_DST
+ dss 0 // Data stream 0 stop
+
+ dss 1 // Data stream 1 stop
+#endif
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14
+ beq cr1,Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li D,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 1 of two or three
+ addi D,D,4 // IU1 increment index
+
+ stvewx VPS0,DBK,D // LSU store word 2 of two or three
+ addi D,D,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx VPS0,DBK,D // LSU store word 3 of three if necessary
+ addi D,D,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx VPS0,DBK,D // LSU store one halfword if necessary
+ addi D,D,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx VPS0,DBK,D // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ lvx VS1,SRC,BK // LSU Get S3 (or S2)
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+ addi Rt,QW,-2 // IU1 Ensure at least 2 QW left after big loop
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+// At this point next store will be to even address.
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+ lis STR,0x104 // IU1 Stream 4 blocks of 16 bytes
+ addi BL,BK,16 // IU1 Create an alternate byte kount + 32
+
+ ori STR,STR,32 // IU1 Stream stride 32B
+#ifndef NO_BIG_LOOP
+ rlwinm BIG,Rt,29,3,31 // IU1 QW/8 big loops to do
+
+ rlwinm Rt,Rt,0,0,28 // IU1 How many QWs will be done in big loop
+ bgt cr7,Big_loop // b if QW > 14
+#endif
+No_big_loop:
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+
+ addi SP8,SRC,256 // IU1 Starting address for data stream touch
+ xoris STR,STR,0x6 // IU1 Reset stream to 2 blocks of 16 bytes
+ bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ lvx VS1,SRC,BK // LSU Get S12
+ addi SP8,SP8,32 // IU1 Next starting address for data stream touch
+
+ lvx VS2,SRC,BL // LSU Get S13
+ vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11
+
+ STRM_1 // LSU Stream 64 byte blocks ahead of loads
+
+ DCBK // LSU then Kill instead of RWITM
+
+ vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12
+ vor VS0,VS2,VS2 // VIU1 Move S13 to S11
+
+ stvx VPS1,DST,BK // LSU Store 16 bytes at D11
+ addi BK,BL,16 // IU1 Increment byte count
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx VPS0,DST,BL // LSU Store 16 bytes at D12
+ addi BL,BK,16 // IU1 Increment alternate byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+ b Last_ld_fwd // b if last store is to even address
+
+// Come here with two more loads and two stores to do
+One_even_QW:
+ lvx VS1,SRC,BK // LSU Get S14 (or S13 if if D-S>=0)
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13
+ vor VS0,VS1,VS1 // VIU1 Move upper vector to lower
+
+ stvx VPS0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b Last_ld_fwd
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+Big_loop:
+ subf QW,Rt,QW // IU1 Should be 2-7 QWs left after big loop
+ blt cr5,No_big_loop // b back if |DST-SRC|<128; Big_loop won't work.
+ mtctr BIG // IU2 loop for as many 128B loops as possible
+ addi SP8,SRC,256 // IU1 Starting address for data stream touch
+
+Loop_of_128B: // Come here with QW>=10 and next store even; VS0 last load
+ lvx VS1,SRC,BK // LSU Get S4 (or S3 if D-S>=0)
+ addi BL,BK,32 // IU1 Increment Byte_Kount+16 by 32
+ addi SP8,SP8,128 // IU1 increment address for data stream touch
+
+ lvx VS3,SRC,BL // LSU Get S6 (or S5)
+ addi BL,BL,32 // IU1 Increment Byte_Kount+48 by 32
+
+ lvx VS5,SRC,BL // LSU Get S8 (or S7)
+ addi BL,BL,32 // IU1 Increment Byte_Kount+80 by 32
+
+ lvx VS7,SRC,BL // LSU Get S10 (or S9)
+ addi BL,BK,16 // IU1 Increment Byte_Kount+16 by 16
+
+ lvx VS2,SRC,BL // LSU Get S5 (or S4)
+ addi BL,BL,32 // IU1 Increment Byte_Kount+32 by 32
+
+ lvx VS4,SRC,BL // LSU Get S7 (or S6)
+ addi BL,BL,32 // IU1 Increment Byte_Kount+64 by 32
+
+ lvx VS6,SRC,BL // LSU Get S9 (or S8)
+ addi BL,BL,32 // IU1 Increment Byte_Kount+96 by 32
+ vperm VPS0,VS0,VS1,VP3 // VPU
+
+ lvx VS0,SRC,BL // LSU Get S11 (or S10)
+ vperm VPS1,VS1,VS2,VP3 // VPU
+
+ STRM_1 // LSU Stream 4 32B blocks, stride 32B
+
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx VPS0,DST,BK // LSU Store D3
+ addi BK,BK,16 // IU1 Increment Byte_Kount+16 by 16
+ vperm VPS2,VS2,VS3,VP3 // VPU
+
+ stvx VPS1,DST,BK // LSU Store D4
+ addi BK,BK,16 // IU1 Increment Byte_Kount+32 by 16
+ vperm VPS3,VS3,VS4,VP3 // VPU
+
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx VPS2,DST,BK // LSU Store D5
+ addi BK,BK,16 // IU1 Increment Byte_Kount+48 by 16
+ vperm VPS4,VS4,VS5,VP3 // VPU
+
+ stvx VPS3,DST,BK // LSU Store D6
+ addi BK,BK,16 // IU1 Increment Byte_Kount+64 by 16
+ vperm VPS5,VS5,VS6,VP3 // VPU
+
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx VPS4,DST,BK // LSU Store D7
+ addi BK,BK,16 // IU1 Increment Byte_Kount+80 by 16
+ vperm VPS6,VS6,VS7,VP3 // VPU
+
+ stvx VPS5,DST,BK // LSU Store D8
+ addi BK,BK,16 // IU1 Increment Byte_Kount+96 by 16
+ vperm VPS7,VS7,VS0,VP3 // VPU
+
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx VPS6,DST,BK // LSU Store D9
+ addi BK,BK,16 // IU1 Increment Byte_Kount+112 by 16
+
+ stvx VPS7,DST,BK // LSU Store D10
+ addi BK,BK,16 // IU1 Increment Byte_Kount+128 by 16
+ bdnz Loop_of_128B // b if ctr > 0 (QW/8 still > 0)
+
+ mtctr QW // IU1 Restore QW remaining to counter
+ addi BL,BK,16 // IU1 Create an alternate byte kount + 16
+ bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+ bdnz B32_fwd // b and decrement counter for last QW store odd
+ // One of the above branches should have been taken
+
+// End of memcpy in AltiVec
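+
+// All of the loops above use the same realignment idiom: load source quadwords
+// on their natural 16-byte boundaries and let vperm, driven by an lvsl/lvsr
+// control vector, extract the 16 destination-aligned bytes that straddle two
+// of them.  A rough C model of that single step (illustration only, assuming
+// <string.h> and <stdint.h>):
+//
+//   /* take 16 consecutive bytes starting at offset s (0..15) of the
+//      concatenation of two aligned quadwords a and b */
+//   static void perm16(uint8_t out[16], const uint8_t a[16],
+//                      const uint8_t b[16], unsigned s)
+//   {
+//       uint8_t cat[32];
+//       memcpy(cat, a, 16);
+//       memcpy(cat + 16, b, 16);
+//       memcpy(out, cat + s, 16);
+//   }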
+
+// bcopy works like memcpy, but the source and destination operands are reversed.
+// The following will just reverse the operands and branch to memcpy.
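+// In C terms the stub amounts to (sketch; classic BSD argument order assumed):
+//
+//   void bcopy(const void *src, void *dst, size_t n) { memcpy(dst, src, n); }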
+
+#ifdef LIBMOTOVEC
+ .global bcopy
+bcopy:
+#else
+ .global vec_bcopy
+vec_bcopy:
+#endif
+ mr Rt,DST // temp storage for what is really source address (r3)
+ mr DST,SRC // swap destination address to r3 to match memcpy dst
+ mr SRC,Rt // Complete swap of destination and source for memcpy
+#ifdef LIBMOTOVEC
+ b memcpy // b to memcpy with correct args in r3 and r4
+#else
+ b vec_memcpy // b to vec_memcpy with correct args in r3 and r4
+#endif
+// End of bcopy in AltiVec
diff --git a/liboil/motovec/vec_memset.S b/liboil/motovec/vec_memset.S
new file mode 100644
index 0000000..2b00e80
--- /dev/null
+++ b/liboil/motovec/vec_memset.S
@@ -0,0 +1,553 @@
+//------------------------------------------------------------------
+// file: vec_memset.S
+// AltiVec enabled version of memset and bzero and cacheable_memzero
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2002
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void *memset( void *ptr, int val, size_t len );
+// Copies val into each of len characters beginning at ptr.
+// - Harbison&Steele 4th ed
+// (despite val being an int, this memset assumes it is never
+// more than a byte. That seems to be correct from all the
+// memset functions I've seen but I don't know if ANSI allows
+// anything longer. Chuck Corley 12/21/02)
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
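+
+// (ISO C does settle the question above: memset converts val to unsigned char
+// before filling.)  A byte-for-byte C reference for the intended behavior,
+// sketch only:
+//
+//   void *memset_ref(void *ptr, int val, size_t len)
+//   {
+//       unsigned char *p = ptr;
+//       unsigned char b  = (unsigned char)val;   /* standard truncation */
+//       while (len--)
+//           *p++ = b;
+//       return ptr;
+//   }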
+
+//------------------------------------------------------------------
+// extern void * bzero( char *ptr, int len);
+// Copies 0 into each of len characters at ptr.
+// - Harbison&Steele 4th ed
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
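+
+// Equivalent C sketch, using the return type declared above:
+//
+//   void *bzero_ref(char *ptr, int len) { return memset(ptr, 0, (size_t)len); }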
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 02/09/03
+// Could benefit from changes added to memcpy
+// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
+//
+// This is beta quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+// Zero fill could be quite likely.
+// Moving the fill byte from GPR to VR as below is faster than stw->lvebx via the stack
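+// A C model of what that splat computes (sketch only; v_memset below builds it
+// with lvsl/vslb/vor/vspltb instead of going through memory):
+//
+//   uint8_t b = (uint8_t)((((fill >> 4) & 0xF) << 4) | (fill & 0xF)); /* == (uint8_t)fill */
+//   uint8_t vfill[16];
+//   for (int i = 0; i < 16; i++)
+//       vfill[i] = b;                  /* the fill byte in every vector lane */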
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dest pointer; exiting: same dest pointer
+
+#define FILL r4 // entering: fill char then fill word
+
+#define BC r5 // entering: Byte_Count then remaining Byte_Count
+
+#define DBC r6 // dst + byte count
+
+#define BK r7 // BC - 1 +/- (n*16)
+
+#define Fsh r8 // fill byte shifted right one nibble
+
+#define DM1 r9 // dst - 1 for byte-by-byte backwards initially
+#define D r9 // (dst+16)[0:27] - dst[28:31]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define DR r10 // (dst+16)[0:27]
+#define QW r10 // number of cache lines
+
+#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+
+#define RSV r12 // storage for VRSAVE register if used
+
+// Condition register use (not including temporary cr0)
+// cr0[2] = (FILL==0)?
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
+// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
+// cr6[2] = (QW == 0)? 1 : 0;
+// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
+// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
+// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
+// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
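+// The cr7-driven stores below decompose such a partial byte count n (0..15)
+// into its binary pieces; in C terms (sketch only, fill16 being a hypothetical
+// 16-byte buffer that already holds the fill pattern):
+//
+//   size_t i = 0;
+//   if (n & 1) { memcpy(dst + i, fill16 + i, 1); i += 1; }   /* one byte     */
+//   if (n & 2) { memcpy(dst + i, fill16 + i, 2); i += 2; }   /* one halfword */
+//   if (n & 4) { memcpy(dst + i, fill16 + i, 4); i += 4; }   /* one word     */
+//   if (n & 8) { memcpy(dst + i, fill16 + i, 4); i += 4;     /* two words    */
+//                memcpy(dst + i, fill16 + i, 4); i += 4; }
+//   /* the pieces always sum to exactly n bytes */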
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+// We use dcba, which will no-op to non-cacheable memory, rather than
+// dcbz, which will cause an alignment exception.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c033dec
+// dcba r3,r7 or dcba DST,BK
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c033dec
+.endm
+#else
+#define DCBK dcba DST,BK
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#endif // NO_DCBA
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global memset
+memset:
+#else
+ .global vec_memset
+vec_memset:
+#endif
+
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+ rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz Byte_set
+
+ blr
+
+v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+ beq+ enter_bzero // b if FILL==0
+
+ lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
+ vspltisb v1,4 // VPU Splat 0x4 to every byte
+
+ lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
+
+ vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
+
+ vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
+
+ vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
+enter_bzero:
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,Left_just // b if D0 is left justified
+
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+ bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D11
+ addi BK,BK,16 // IU1 Increment byte count
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx v0,DST,BK // LSU Store 16 bytes at D12
+ addi BK,BK,16 // IU1 Increment byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+ b Last_QW // b if last store is to even address
+
+// Come here with two more loads and two stores to do
+One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b Last_QW
+
+// End of memset in AltiVec
+
+#define BCz r4 // in bzero r4 enters with byte count
+
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global bzero
+bzero:
+#else
+ .global vec_bzero
+vec_bzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+#ifdef LIBMOTOVEC
+ b memset
+#else
+ b vec_memset
+#endif
+
+// cacheable_memzero will employ dcbz to clear 32 bytes at a time
+// of cacheable memory. Like bzero, second entering argument will be BC.
+// Using this for non-cacheable memory will generate an alignment exception.
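+// Hypothetical usage sketch (buffer name assumed, not part of this file):
+// ordinary cacheable memory such as stack, heap or static data is fine;
+// cache-inhibited regions such as memory-mapped I/O are not.
+//
+//   static char frame_buf[4096];                     /* cacheable static data */
+//   cacheable_memzero(frame_buf, sizeof frame_buf);  /* zeroed 32B at a time  */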
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global cacheable_memzero
+cacheable_memzero:
+#else
+ .global vec_cacheable_memzero
+vec_cacheable_memzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,c_v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+c_Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz c_Byte_set
+
+ blr
+
+c_v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,c_Left_just // b if D0 is left justified
+
+ bns cr7,c_No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+c_No_B_fwd:
+ bne cr7,c_No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+c_No_H_fwd:
+ bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+c_No_W1_fwd:
+ bnl cr7,c_No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b c_No_W2_fwd
+
+c_Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+c_No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,c_Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+c_QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,c_QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
+
+c_Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,c_Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+c_Only_1W_fwd:
+ bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+c_Only_2W_fwd:
+ bne cr7,c_Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+c_Only_B_fwd:
+ bns cr7,c_All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b c_All_done_fwd
+
+c_Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+c_All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+ bns cr6,c_B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz c_B32_fwd // decrement counter for last QW store odd
+
+c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ dcbz DST,BK // LSU zero whole cache line
+ bdz c_Nxt_loc_fwd // always decrement and branch to next instr
+
+c_Nxt_loc_fwd:
+ addi BK,BK,32 // IU1 Increment byte count
+ bdnz c_B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
+ b c_Last_QW // b if last store is to even address
+
+// Come here with two more loads and two stores to do
+c_One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b c_Last_QW
+
+// End of cacheable_memzero in AltiVec
diff --git a/liboil/motovec/vec_strcpy.S b/liboil/motovec/vec_strcpy.S
new file mode 100644
index 0000000..c31beaa
--- /dev/null
+++ b/liboil/motovec/vec_strcpy.S
@@ -0,0 +1,273 @@
+//------------------------------------------------------------------
+// file: vec_strcpy.S
+// AltiVec enabled version of strcpy and strncpy
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2003
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern char *vec_strcpy(char *dest, const char *src);
+//
+// Returns:
+// char *dest
+//------------------------------------------------------------------
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 03/22/02
+// Rev 0.1 Modified per vec_memcpy rev 0.30 Chuck Corley 05/24/03
+//
+
+// Harbison and Steele says "the results of both strcpy, strncpy, ... are
+// unpredictable if the two string arguments overlap in memory."
+// Since we do not know the address of the end of the string, copying
+// from back to front is not an option. Therefore we always "copy forward."
+
+#define VRSV 256 // VRSAVE spr
+// Use scalar for first MIN_SCALAR bytes. Overhead for vector is too great to win.
+#define MIN_SCALAR 32
+// Also don't use vectors if |DST-SRC| <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+#define PAGE_SIZE 4096 // True for G4 with AltiVec
+
+// Register usage:
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dst pointer; exiting: same dst pointer
+
+#define SRC r4 // entering: src ptr; incremented as the copy proceeds
+
+#define ADD r5 // Temporary future dst address
+#define PBC r5 // Computed Byte_Count to next 4K page src boundary
+
+#define DMS r6 // dst - src initially
+
+#define SMD r7 // src - dst initially
+
+#define DD r8 // duplicate of dst register for incrementing
+
+#define QBC r9 // Computed Byte_Count to next QW dst boundary
+
+#define DS r10 // duplicate of src register for speculative incrementing
+
+#define PSZ r11 // storage for page size constant
+
+#define RSV r12 // storage for VRSAVE register if used
+
+#define V0 v0 // all zeros
+
+#define VS0 v1 // src vector for permuting
+
+#define VS1 v2 // src vector for permuting
+
+#define VS2 v3 // src vector for permuting
+
+#define VP3 v4 // alignment permute register
+
+#define VPS0 v5 // permuted source vector to store
+
+#define VPS1 v6 // 2nd permuted source vector to store
+
+#define VCN v7 // null comparison result register
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBA .long 0x7c0045ec
+// dcba 0,r8 or dcba 0,DD
+#else
+#ifdef __ghs__
+.macro DCBA
+.long 0x7c0045ec
+.endm
+#else
+#define DCBA dcba 0,DD
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBA nop
+#endif // NO_DCBA
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .global strcpy
+strcpy:
+#else
+ .global vec_strcpy
+vec_strcpy:
+#endif
+
+
+ addi ADD,DST,32 // IU1 Next dst cacheline
+ subf. DMS,SRC,DST // IU1 Compute dst-src difference
+ subf SMD,DST,SRC // IU1 src-dst for use if dst-src<0
+
+ rlwinm ADD,ADD,0,0,26 // IU1 Round down to even QW
+ mr DD,DST // IU1 Duplicate dest
+ beqlr // return if DST = SRC
+
+ bgt Pos_value // b if DST-SRC>0
+ mr DMS,SMD // IU1 |dst - src| = src - dst
+Pos_value:
+ subf. QBC,DST,ADD // IU1 Bytes to even QW start of vect (min 32)
+ addi ADD,DD,PAGE_SIZE // IU1 dst addr in next 4K page
+ cmpi cr7,0,DMS,MIN_VEC // IU1 Check for min byte count separation
+
+ mtctr QBC // IU2 Init counter
+Byte_loop:
+ lbzx Rt,0,SRC // LSU Get a byte
+ addi SRC,SRC,1 // IU1 Increment src
+
+ cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null?
+ stbx Rt,0,DD // LSU Store it
+ addi DD,DD,1 // IU1 Increment dest
+ bdnzf 6,Byte_loop // b to get another if this one wasn't null
+
+ beqlr cr1 // return if found a null
+
+ li PSZ,PAGE_SIZE // IU1 Constant for potential use in vector
+ rlwinm ADD,ADD,0,0,19 // IU1 First address in next 4K page
+ mr DS,SRC // IU1 Get current src addr
+ ble cr7,Byte_loop // do by bytes forever if < MIN_VEC separation
+
+v_strcpy:
+// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ subf. PBC,DD,ADD // IU1 Now bytes to next 4K page
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xff00 // IU1 Or in registers used by this routine
+#endif
+ rlwinm PBC,PBC,28,4,31 // IU1 Now QWs to next 4K page
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+// Since DD has to be QW aligned at this point, we need three (or two
+// if SRC[28:31]==0) source vectors to permute into two dest vectors.
+// Loading beyond the end of the string should be okay as long as we don't
+// cross a page boundary.
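+// A C model of that page guard (sketch only; a is the aligned load address
+// held as a uintptr_t):
+//
+//   size_t to_page_end = PAGE_SIZE - (a & (PAGE_SIZE - 1)); /* bytes left in page */
+//   size_t safe_quads  = to_page_end / 16;  /* quadword loads before the boundary */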
+
+ lvsl VP3,0,SRC // LSU Create left permute vector
+ vxor V0,V0,V0 // VIU Clear v0
+ ble New_page_0 // b if next load will cross page boundary
+ mtctr PBC // IU2 Okay to load up to next page
+Page_0:
+
+ lvx VS0,0,DS // LSU Get first src vector
+ addi DS,DS,16 // IU1 Increment vector src pointer
+ bdz New_page_1 // b if next load will cross page boundary
+Page_1:
+
+ lvx VS1,0,DS // LSU Get second src vector
+ addi DS,DS,16 // IU1 Increment vector src pointer
+ bdz New_page_2 // b if next load will cross page boundary
+Page_2:
+
+ lvx VS2,0,DS // LSU Get third src vector
+ addi DS,DS,16 // IU1 Increment vector src pointer
+ bdz New_page_3 // b if next load will cross page boundary
+Page_3:
+
+ vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0
+
+ vperm VPS1,VS1,VS2,VP3 // VPU Align S1 and S2 to D1
+ vor VS0,VS2,VS2 // VIU1 Move upper vector to lower
+
+ vcmpequb. VCN,V0,VPS0 // VIU1 Check for null
+ bne cr6,Final_0 // b if found a null in this permuted source vector
+ addi SRC,SRC,16 // IU1 Increment byte src pointer
+
+ vcmpequb. VCN,V0,VPS1 // VIU1 Check for null
+ bne cr6,Final_1 // b if found a null in this permuted source vector
+ DCBA // LSU Conditionally dcba 0,DST
+ addi SRC,SRC,16 // IU1 Increment byte src pointer
+
+ stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0
+ addi DD,DD,16 // IU1 Increment duplicate dst pointer
+
+ stvx VPS1,0,DD // LSU Store 16 bytes at dst addr D1
+ addi DD,DD,16 // IU1 Increment duplicate dst pointer
+
+ b Page_1
+
+Final_1: // Found a null in 2nd vector, store 1st vector then do bytes
+ stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0
+ addi DD,DD,16 // IU1 Increment duplicate dst pointer
+
+Final_0: // Found a null in vector, load and store bytes to null instead
+ lbzx Rt,0,SRC // LSU Get a byte
+ addi SRC,SRC,1 // IU1 Increment src
+
+ cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null?
+ stbx Rt,0,DD // LSU Store it
+ addi DD,DD,1 // IU1 Increment dest
+
+ bne cr1,Final_0 // b to get another if this one wasn't null
+
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr
+
+New_page_0: // Next load will be from new page; (ctr would have been <= zero)
+ mtctr PSZ // reinitialize counter
+ b Page_0
+
+New_page_1: // Did VS0 contain any nulls?
+ vcmpequb. VCN,V0,VS0 // VIU1 Check for null
+ bnl cr6,Final_0 // b if found a null in this source vector
+ mtctr PSZ // reinitialize counter
+ b Page_1
+
+New_page_2: // Did VS1 contain any nulls?
+ vcmpequb. VCN,V0,VS1 // VIU1 Check for null
+ bnl cr6,Final_0 // b if found a null in this source vector
+ mtctr PSZ // reinitialize counter
+ b Page_2
+
+New_page_3: // Did VS2 contain any nulls?
+ vcmpequb. VCN,V0,VS2 // VIU1 Check for null
+ bnl cr6,Final_0 // b if found a null in this source vector
+ mtctr PSZ // reinitialize counter
+ b Page_3
+
+// End of strcpy in AltiVec