author     David Schleef <ds@schleef.org>    2005-06-17 21:51:58 +0000
committer  David Schleef <ds@schleef.org>    2005-06-17 21:51:58 +0000
commit     d64fd56082933579566d4bf45d3f421d3eba8392 (patch)
tree       ad4cbc3d76daa914252999ddac96ffcaee17a8e2
parent     f811d988ddc37ca0592dee4024629dded03aef9f (diff)
download   liboil-d64fd56082933579566d4bf45d3f421d3eba8392.tar.gz
* configure.ac: snarf LIBMOTOVEC because it has a compatible
license.
* COPYING:
* liboil/Makefile.am:
* liboil/motovec/Makefile.am:
* liboil/motovec/README:
* liboil/motovec/checksum_vec.S:
* liboil/motovec/string_vec.S:
* liboil/motovec/vec_csum.S:
* liboil/motovec/vec_memcmp.S:
* liboil/motovec/vec_memcpy.S:
* liboil/motovec/vec_memset.S:
* liboil/motovec/vec_strcpy.S:
-rw-r--r--  COPYING                        |   77
-rw-r--r--  ChangeLog                      |   16
-rw-r--r--  configure.ac                   |    2
-rw-r--r--  liboil/Makefile.am             |    3
-rw-r--r--  liboil/motovec/Makefile.am     |   17
-rw-r--r--  liboil/motovec/README          |  345
-rw-r--r--  liboil/motovec/checksum_vec.S  |  627
-rw-r--r--  liboil/motovec/string_vec.S    | 1375
-rw-r--r--  liboil/motovec/vec_csum.S      |  724
-rw-r--r--  liboil/motovec/vec_memcmp.S    |  340
-rw-r--r--  liboil/motovec/vec_memcpy.S    |  876
-rw-r--r--  liboil/motovec/vec_memset.S    |  553
-rw-r--r--  liboil/motovec/vec_strcpy.S    |  273
13 files changed, 5206 insertions, 22 deletions
@@ -1,23 +1,58 @@ -Copyright (c) David A. Schleef <ds@schleef.org> -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. +The majority of the source code and the collective work is subject +to the following license: + + Copyright 2002,2003,2004,2005 David A. Schleef <ds@schleef.org> + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + +The source code in the liboil/motovec directory is subject to the +following license: + + Copyright Motorola, Inc. 2003 + ALL RIGHTS RESERVED + + You are hereby granted a copyright license to use, modify, and + distribute the SOFTWARE so long as this entire notice is retained + without alteration in any modified and/or redistributed versions, + and that such modified versions are clearly identified as such. + No licenses are granted by implication, estoppel or otherwise under + any patents or trademarks of Motorola, Inc. + + The SOFTWARE is provided on an "AS IS" basis and without warranty. + To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS + ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR + PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH + REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS + THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. + + To the maximum extent permitted by applicable law, IN NO EVENT SHALL + MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER + (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF + BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS + INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR + INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility + for the maintenance and support of the SOFTWARE. 
-THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. @@ -1,5 +1,21 @@ 2005-06-17 David Schleef <ds@schleef.org> + * configure.ac: snarf LIBMOTOVEC because it has a compatible + license. + * COPYING: + * liboil/Makefile.am: + * liboil/motovec/Makefile.am: + * liboil/motovec/README: + * liboil/motovec/checksum_vec.S: + * liboil/motovec/string_vec.S: + * liboil/motovec/vec_csum.S: + * liboil/motovec/vec_memcmp.S: + * liboil/motovec/vec_memcpy.S: + * liboil/motovec/vec_memset.S: + * liboil/motovec/vec_strcpy.S: + +2005-06-17 David Schleef <ds@schleef.org> + * liboil/colorspace/Makefile.am: new files * liboil/colorspace/argb_paint.c: remove temporary classes * liboil/colorspace/composite.c: new diff --git a/configure.ac b/configure.ac index 5a28079..1f43945 100644 --- a/configure.ac +++ b/configure.ac @@ -20,6 +20,7 @@ dnl - interfaces removed -> AGE = 0 LIBOIL_LIBVERSION="1:0:1" AC_SUBST(LIBOIL_LIBVERSION) AM_PROG_LIBTOOL +AM_PROG_AS AC_CONFIG_SRCDIR([liboil/liboil.h]) @@ -204,6 +205,7 @@ liboil/conv/Makefile liboil/copy/Makefile liboil/dct/Makefile liboil/md5/Makefile +liboil/motovec/Makefile liboil/jpeg/Makefile liboil/simdpack/Makefile liboil/sse/Makefile diff --git a/liboil/Makefile.am b/liboil/Makefile.am index 5711500..e245e7c 100644 --- a/liboil/Makefile.am +++ b/liboil/Makefile.am @@ -1,7 +1,7 @@ pkgincludedir = $(includedir)/liboil-@LIBOIL_MAJORMINOR@/liboil -SUBDIRS = colorspace conv copy dct jpeg simdpack md5 utf8 sse +SUBDIRS = colorspace conv copy dct jpeg md5 motovec simdpack sse utf8 lib_LTLIBRARIES = liboiltmp1.la liboil-@LIBOIL_MAJORMINOR@.la @@ -27,6 +27,7 @@ liboilfunctions_la_LIBADD = \ dct/libdct.la \ jpeg/libjpeg.la \ md5/libmd5.la \ + motovec/libmotovec.la \ simdpack/libsimdpack.la \ sse/libsse.la \ utf8/libutf8.la \ diff --git a/liboil/motovec/Makefile.am b/liboil/motovec/Makefile.am new file mode 100644 index 0000000..a56fb98 --- /dev/null +++ b/liboil/motovec/Makefile.am @@ -0,0 +1,17 @@ + +noinst_LTLIBRARIES = libmotovec.la + +c_sources = + +if HAVE_CPU_POWERPC +powerpc_sources = \ + vec_memcpy.S +else +powerpc_sources = +endif + +libmotovec_la_SOURCES = \ + $(powerpc_sources) +libmotovec_la_LIBADD = +libmotovec_la_CFLAGS = $(LIBOIL_CFLAGS) + diff --git a/liboil/motovec/README b/liboil/motovec/README new file mode 100644 index 0000000..a458db4 --- /dev/null +++ b/liboil/motovec/README @@ -0,0 +1,345 @@ +//------------------------------------------------------------------ +// file: readme.txt +// Readme to accompany libmotovec.a +//------------------------------------------------------------------ + +Rev 0.30 release - 5/28/2003 by Chuck Corley + +This release includes two new files, string_vec.S and checksum_vec.s, +which you could paste into the Linux kernel files: +/arch/ppc/lib/string.S and +/arch/ppc/lib/checksum.S +if you wanted to employ AltiVec in the Linux kernel. 
We used the
+memcpy_vec and csum_partial_copy_generic_vec functions from these
+files only in the modified versions of /net/core/skbuf.c and
+/net/core/iovec.c to give us the networking performance boost in
+Linux described in the SNDF presentation "Accelerating Networking Data
+Movement Using the AltiVec® Technology" at www.motorola.com/sndf under
+Dallas-2003/Host Processors (H1110). Also see the white paper
+"Enhanced TCP/IP Performance with AltiVec Technology" at
+e-www.motorola.com/brdata/PDFDB/docs/ALTIVECTCPIPWP.pdf
+
+These files contain the following functions
+string.S contains:            string_vec.S contains:
+memcpy                        memcpy_vec
+bcopy                         bcopy_vec
+memmove                       memmove_vec
+backwards_memcpy              backwards_memcpy_vec
+memset                        memset_vec
+memcmp                        memcmp_vec
+memchr                        (coming soon)
+cacheable_memcpy              cacheable_memcpy_vec
+cacheable_memzero             cacheable_memzero_vec
+strcpy                        strcpy_vec
+strncpy                       (coming soon)
+strcat                        (coming soon)
+strcmp                        strcmp_vec
+strlen                        strlen_vec
+__copy_tofrom_user*           __copy_tofrom_user_vec*
+__clear_user*                 __clear_user_vec*
+__strncpy_from_user*          (coming soon)
+__strnlen_user*               (coming soon)
+
+checksum.S contains:          checksum_vec.S contains:
+csum_partial                  csum_partial_vec
+csum_partial_copy_generic*    csum_partial_copy_generic_vec
+ip_fast_csum                  (unlikely to benefit)
+csum_tcpudp_magic             (unlikely to benefit)
+
+*these functions have ex_table entries for handling memory access
+exceptions in the kernel. The AltiVec versions were functionally
+tested by hand.
+
+csum_partial_copy_generic_vec and csum_partial_vec previously
+assembled into libmotovec.a have been removed since they are in the file
+above. We are finding that selective use of the *_vec functions in
+the OS kernel is much "safer" than wholescale replacement of the libc
+library. libmotovec.a returns to being exclusively a performance-enhancing
+library of libc functions that can be safely linked with user application
+code to test the performance of AltiVec.
+
+My presentation for SNDF-Europe includes performance comparisons
+of the scalar versus vector versions of the above functions. It should
+be available on the SNDF website soon. It also includes an updated
+explanation of memcpy without the potential incoherency problem discussed
+below.
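To make "linking user application code against libmotovec.a" concrete, here is a
minimal sketch of such a test program. The buffer size, the deliberately misaligned
offsets, and the timing method are illustrative assumptions, not anything this README
prescribes; link it with libmotovec.a ahead of the compiler's libc (as in the command
lines shown later in this file) so the memcpy call resolves to the AltiVec version.

    /* test_memcpy.c - illustrative only */
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define BUF_SIZE   1460        /* roughly an Ethernet payload */
    #define ITERATIONS 100000

    static unsigned char src[BUF_SIZE + 16];
    static unsigned char dst[BUF_SIZE + 16];

    int main(void)
    {
        clock_t start, stop;
        int i;

        for (i = 0; i < BUF_SIZE; i++)
            src[i + 3] = (unsigned char)i;

        start = clock();
        for (i = 0; i < ITERATIONS; i++)
            memcpy(dst + 1, src + 3, BUF_SIZE);   /* misaligned on purpose */
        stop = clock();

        printf("%d copies of %d bytes: %.3f seconds\n", ITERATIONS, BUF_SIZE,
               (double)(stop - start) / CLOCKS_PER_SEC);
        return memcmp(dst + 1, src + 3, BUF_SIZE) != 0;
    }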
+ +So this release contains in libmotovec.a: +memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memset.o from vec_memset.S Rev 0.10 dated 5/01/2003 +bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003 +strcmp.o from vec_strcmp.S Rev 0.00 dated 3/03/2002 +strlen.o from vec_strlen.S Rev 0.00 dated 12/26/2002 + +And in string.s: +memcpy_vec derived from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy_vec derived from vec_memcpy.S Rev 0.30 +memmove_vec derived from vec_memcpy.S Rev 0.30 +backwards_memcpy_vec derived from vec_memcpy.S Rev 0.30 +memset_vec derived from vec_memset.S Rev 0.10 dated 5/01/2003 +memcmp_vec derived from vec_memcmp.S Rev 0.00 +memchr (coming soon) +cacheable_memcpy_vec derived from vec_memcpy.S Rev 0.30 +cacheable_memzero_vec derived from vec_memset.S Rev 0.10 +strcpy_vec derived from vec_strcpy.S Rev 0.10 +strncpy_vec (coming soon) +strcat_vec (coming soon) +strcmp_vec derived from vec_strcmp.S Rev 0.00 (not released) +strlen_vec derived from vec_strlen.S Rev 0.00 (not released) +__copy_tofrom_user_vec* derived from vec_memcpy.S Rev 0.30 +__clear_user_vec* derived from vec_memcpy.S Rev 0.30 +__strncpy_from_user_vec* (coming soon) +__strnlen_user_vec* (coming soon) +*with ex_table and exception code + +And in checksum.s: +csum_partial_vec derived from vec_csum.S Rev 0.0 dated 4/19/03 +csum_partial_copy_generic_vec from vec_csum.S Rev 0.0 + +string_vec.S and checksum_vec.S are only known to assemble with gcc 2.95 +and gcc 3.3+. Should work with other gcc compilers but may need +editing to be compatible with non-gcc compilers. + +Rev 0.20 release - 5/12/2003 by Chuck Corley + +Thanks to all of you who attended SNDF. My presentation "Implementing +and Using the Motorola AltiVec Libraries" is available for downloading +at www.motorola.com/sndf under Dallas-2003/Host Processors (H1109). + +During the presentation DS from Lucent pointed out that the way I was +bringing the beginning and ending destination Quad Words (vectors) into +the registers for merging with the permuted source made the +"uninvolved" destination bytes vulnerable to potential incoherency if +some interrupting process changed those bytes while I was holding them +in a register. While the possibility seemed small, I have rewritten the +code to avoid this potential problem. The result actually is slightly +faster than the original for small buffers. + +So this release contains: +memcpy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +bcopy.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memmove.o from vec_memcpy.S Rev 0.30 dated 4/02/2003 +memset.o from vec_memset.S Rev 0.10 dated 5/01/2003 +bzero.o from vec_memset.S Rev 0.10 dated 5/01/2003 +csum_partial_copy_generic_vec from vec_csum.S Rev 0.0 dated 4/19/03 +csum_partial_vec from vec_csum.S Rev 0.0 dated 4/19/03 + +The latter two additions were assembled into libmotovec.a despite the +fact they are not standard libc functions. Rather they are the Altivec +enabled equivalents of functions by the same name from the linux +source tree (Linux 2.4.17). While we are pursuing how to get these +functions incorporated into Linux, here they are assembled and in +source form if you are building your own version of linux. 
The use +of an earlier version of csum_partial_copy_generic_vec and memcpy_vec is +documented to speed up TCP/IP and UDP transfers in Jacob Pan's SNDF +presentation "Accelerating Networking Data Movement Using AltiVec +Technology" (H1110) available at the website above. csum_partial +does not appear to be called with large enough buffer sizes in linux +to warrant using the vectorized version. + +I am also releasing the source for memset and bzero in this release. +strcpy, strlen, strncpy, strcmp, memcmp, strcat, and memchr are still +on my list to do - soon. + +Rev 0.10 release - 3/13/2003 by Chuck Corley + +The presence of dcbz in the 32 byte loop of memcpy (or memmove) +causes an alignment exception to non-cacheable memory (MPC7410 User's +Manual p. 4-20 and MPC7450 User's Manual p. 4-25) so it was +removed in this release. dcbz instructions were not present in +memset in any of these releases. That fixed the alignment problem +but hurt the performance some; then it was "rediscovered" that +dcba would have been a better choice anyway as it does not cause +an exception; it would just be noop'ed. So this release substitutes +dcba for dcbz. + +This release contains improvements in memcpy that should be +documented in an application note which is still not finished but +are being pretty nicely documented for SNDF presentation H1109. + +The memcpy was further loop unrolled to provide a 128B loop for +large buffers (>256 bytes) and the data stream touch instruction +was added. It may still be possible to improve the tuning of +the dst instruction, particularly in memmove, but this release +is worthy of reving the number to the next significant revision. + +I've developed a new metric which will be explained at SNDF in +Dallas, TX, March 23-26, 2003. As the number of bytes in a +buffer gets larger, the memcpy routine settles into repetitions +of the inner loop. 32 bytes were moved in the inner loop of +Rev 0.0x and 128 bytes are moved in the inner loop of Rev 0.10. +And the number of processor clocks per inner loop can be shown +to approach the minimum possible. Therefore the new metric +measures the incremental transfer rate for the inner loop after +a reasonable number (>512) of bytes have been moved. This will +not be the bytes transferred per second because there were some +less efficient transfers at start-up but this is the transfer +rate that the routine is asymptotically approaching as the buffer +gets big (regularly testing to 1460 bytes). + +Here is that metric for several cases: + +Case 1: For gcc's lib c memcpy when buffers are not word aligned +Case 2: For gcc's lib c memcpy when buffers are word aligned +Case 3: For Rev 0.01 of memcpy with Altivec irrespective of alignment +Case 4: For Rev 0.10 of memcpy with Altivec irrespective of alignment + +Numbers are provided for the cold DCache and warm DCache. Code is +assumed to always be resident in the ICache as would be expected here +where the inner loop has run multiple times. 
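The metric in the tables below appears to work out to roughly (bytes moved per
inner-loop iteration) / (clocks per iteration) x core clock; for example, the
warm-DCache Rev 0.10 row for the MPC7410 at 400 MHz gives 128/41 x 400e6, or
about 1250 MB/sec. A quick sanity check, assuming that reading of the table is
correct (the figures plugged in here are taken from that row, nothing more):

    /* inner-loop transfer-rate check (illustrative assumptions only) */
    #include <stdio.h>

    int main(void)
    {
        double bytes_per_loop  = 128.0;   /* Rev 0.10 inner loop */
        double clocks_per_loop = 41.0;    /* warm DCache, MPC7410 */
        double core_hz         = 400e6;   /* 400 MHz part */

        printf("asymptotic rate ~ %.0f MB/sec\n",
               bytes_per_loop / clocks_per_loop * core_hz / 1e6);
        return 0;
    }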
+
+                                  COLD DCACHE           WARM DCACHE
+ FOR THE MPC7410@400/100        Insts Clks MB/Sec    Insts Clks MB/Sec
+Case 1: gcc_NWA (1 byte/loop)      6    6     71        6    3    133
+Case 2: gcc_WA (16 B/loop)        12   62    103       12    8    800
+Case 3: vec_memcpy Rev 0.01       12   60    213       12    7   1961
+Case 4: vec_memcpy Rev 0.10       46  125    410       46   41   1250
+
+
+                                  COLD DCACHE           WARM DCACHE
+ FOR THE MPC7445@1GHz/133       Insts Clks MB/Sec    Insts Clks MB/Sec
+Case 1: gcc_NWA                    6    8    122        6    3    350
+Case 2: gcc_WA                    12  104    153       12   12   1333
+Case 3: vec_memcpy Rev 0.01       12  110    292       12    7   4413
+Case 4: vec_memcpy Rev 0.10       46  247    518       46   35   3666
+
+Perhaps you notice that we are trading off Warm DCache performance to
+improve the Cold DCache case. There are other interesting tradeoffs
+in going from 32 byte inner loop to 128 bytes. And in using the dcba
+instruction - or not. In other words, the numbers for vec_memcpy above
+are not the highest possible in the Warm DCache case but they look like
+a good compromise which most benefits the Cold DCache case. More at SNDF
+(or eventually in the app note) ...
+
+I am releasing the source code to vec_memcpy.S with this release so
+if you don't like the tradeoff above you can make your own selection. It
+successfully assembles for me with Codewarrior, Diab, Green Hills, gcc,
+and Metaware. It is nicely commented but could use more documentation.
+I will specifically be explaining it in SNDF presentation H1109.
+
+*************************************************************************
+
+Rev 0.01 release - 2/17/2003 by Chuck Corley
+
+Fixed a problem at Last_ld_fwd: that caused a load beyond a page
+boundary and resulting segment fault in Linux. Last source load
+of SRC+BK in vec_memcpy could be > SRC+BC-1. Also found and fixed
+an error where the Quick and Dirty (QND) code that was in there for
+dst wasn't completely commented out. Plan to enable dst soon.
+Probably loop unroll to 128 bytes first though.
+
+**********************************************************************
+
+Initial Release - 2/10/2003 by Chuck Corley
+
+Contains the libc functions:
+memcpy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+bcopy.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memmove.o from vec_memcpy.S Rev 0.0 dated 2/09/2003
+memset.o from vec_memset.S Rev 0.0 dated 2/09/2003
+bzero.o from vec_memset.S Rev 0.0 dated 2/09/2003
+
+These functions are implemented in AltiVec but are still not as fast
+as we know how to make them. Watch this site for frequent revisions
+over the next several months.
+
+We are in the process of creating application notes to explain the
+source code and the performance associated with these library functions;
+watch this site for those application notes to be added. A logical
+deadline for completion of this work is the Smart Network Developers
+Forum in Dallas, TX, March 23-26, 2003, where we will be discussing this
+library, its performance, and application.
+
+We will also be adding the following libc functions in the very near future:
+strcpy
+strcmp
+strlen
+memcmp
+memchr
+strncpy
+
+We also have preliminary work completed on the following functions
+found in Linux and have to figure out how to distribute them:
+csum_partial
+csum_partial_generic
+__copy_tofrom_user
+page_copy
+
+We believe that these libraries will improve performance on Motorola G4
+processors for applications that make heavy use of the included functions.
+On non-G4 microprocessors they will cause illegal operation exceptions
+because those processors do not support AltiVec.
+
+To use this library, you must:
+1.
Include it on the linker command line prior to the compiler's libc +library. + +Examples: +For gcc: +powerpc-eabisim-ld -T../../spprt/gcc_dink.script -Qy -dn -Bstatic ../../spprt/gcc_obj/gcc_crt0.o ../../spprt/gcc_obj/dtime.o ../../spprt/gcc_obj/cache.o ../../spprt/gcc_obj/Support.o ../../spprt/gcc_obj/dinkusr.o ../../spprt/gcc_obj/perfmon.o gcc_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a c:/cygwin/Altivec/powerpc-eabisim\lib\libm.a --start-group -lsim -lc --end-group -o gccBM.elf + +For Diab: +dld ../../spprt/diab_dink.dld ../../spprt/diab_obj/diab_crt0.o ../../spprt/diab_obj/dtime.o ../../spprt/diab_obj/cache.o ../../spprt/diab_obj/Support.o ../../spprt/diab_obj/dinkusr.o ../../spprt/diab_obj/perfmon.o diab_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/diab/5.0.3/PPCEH:c:/diab/5.0.3/PPCE/simple:c:/diab/5.0.3/PPCE:c:/diab/5.0.3/PPCEN -lc -lm -o diabBM.elf + +For Green Hills: +elxr -T../../spprt/ghs_dink.lnk ../../spprt/ghs_obj/ghs_crt0.o ../../spprt/ghs_obj/dtime.o ../../spprt/ghs_obj/cache.o ../../spprt/ghs_obj/Support.o ../../spprt/ghs_obj/dinkusr.o ../../spprt/ghs_obj/perfmon.o ghs_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:\GHS\ppc36\ppc -lansi -lsys -larch -lind -o ghsBM.elf + +For CodeWarrior: +mwldeppc -lcf ../../spprt/cw_dink.lcf -nostdlib -fp fmadd -proc 7450 ../../spprt/cw_obj/cw_crt0.o ../../spprt/cw_obj/dtime.o ../../spprt/cw_obj/cache.o ../../spprt/cw_obj/Support.o ../../spprt/cw_obj/dinkusr.o ../../spprt/cw_obj/perfmon.o cw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Runtime/Lib/ -lRuntime.PPCEABI.H.a -Lc:/"Program Files"/Metrowerks/CodeWarrior/PowerPC_EABI_Support/Msl/MSL_C/Ppc_eabi/Lib/ -lMSL_C.PPCEABI.bare.H.a -o cwBM.elf + +For Metaware: +ldppc ../../spprt/mw_link.txt -Bnoheader -Bhardalign -dn -q -Qn ../../spprt/mw_obj/mw_crt0.o ../../spprt/mw_obj/dtime.o ../../spprt/mw_obj/cache.o ../../spprt/mw_obj/Support.o ../../spprt/mw_obj/dinkusr.o ../../spprt/mw_obj/perfmon.o mw_obj/test_memmove.o c:\BMS\vec_lib\libmotovec\libmotovec.a -Y P,c:/hcppc/lib/be/fp -lct -lmwt -o mwBM.elf + + +2. Enable AltiVec in the Machine State Processor (MSR) register of the +target machine. + +Example: +AltiVec_enable: + mfmsr r4 // Get current MSR contents + oris r4,r4,0x0200 // Set the AltiVec enable bit MSR[6] + mtmsr r4 // Write to MSR + isync // Context synchronizing instr after mtmsr + + +3. If the AltiVec vector register set is used in more than one context, +the AltiVec registers must be saved and restored on context switches. The +AltiVec EABI extensions define a register (SPR 256 - the VRSAVE register) +which can be used to reduce the number of vector registers which have to +be saved to only those in use. This library is currently compiled +without that VRSAVE feature enabled, so all 32 vector registers will have +to be saved and restored. We are currently thinking that this is a more +efficient practice anyway and note that Linux and several RTOSes are taking +that approach in saving and restoring the vector registers. We have observed +very little performance difference in Linux for saving all of the AltiVec +registers on a context switch versus saving only 8. And saving all of the +registers is a less than 1% total impact on performance. + +4. There is one worrisome problem with this library when run on the MPC745X +microprocessors in the 60x bus mode. 
The MPC7450 Family User's Manual +(Section 7.3) states that "The 60x bus protocol does not support a 16-byte +bus transaction. Therefore, cache-inhibited AltiVec loads, stores, and +write-through stores take an alignment exception. This requires a re-write +of the alignment exception routines in software that supports AltiVec quad +word access in 60x bus mode on the MPC745X." + +This says that if the user is attempting to use these routines in a +cache-inhibited area of memory on a MPC745X in 60x bus mode, it will require +special alignment exception handling software. We are currently implementing +that software for the Linux OS. Alternatively, the user can restrict this +library's use to areas of memory known to be cacheable. + +This library was built using gcc, but as shown in the examples of step 1 above, +links and executes with Diab5.0, Green Hills 3.6, Codewarrior EPPC 6.1, and +Metaware 4.5. The gcc archiver was used to create it in the following +command lines: + +powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memcpy.o -D__GNUC__ -DLIBMOTOVEC ../vec_memcpy/Source/vec_memcpy.S -o gcc_obj/vec_memcpy.o + +powerpc-eabisim-gcc -c -s -fvec -mcpu=750 -mregnames -I. -I./source -I../../spprt -Ic:/cygwin/Altivec\powerpc-eabisim\include -Ic:/cygwin/Altivec\lib\gcc-lib\powerpc-eabisim\gcc-2.95.2\include -o gcc_obj/vec_memset.o -D__GNUC__ -DLIBMOTOVEC ../vec_memset/source/vec_memset.S -o gcc_obj/vec_memset.o + +powerpc-eabisim-ar -ru libmotovec.a gcc_obj/vec_memcpy.o gcc_obj/vec_memset.o + +Email questions or suggestions to risc10@email.sps.mot.com diff --git a/liboil/motovec/checksum_vec.S b/liboil/motovec/checksum_vec.S new file mode 100644 index 0000000..c5efe25 --- /dev/null +++ b/liboil/motovec/checksum_vec.S @@ -0,0 +1,627 @@ +/* + * AltiVec versions (*_vec) of equivalent Linux library functions + * found in /arch/ppc/lib/checksum.S from Linux 2.4.17. Suggest this + * file be appended to that one when building a Linux kernel that + * will employ these functions. + * + * Copyright (C) Motorola, Inc. 2003 + * + * Revision history: + * Rev 0.0 Original Chuck Corley 5/28/03 + * Contact at risc10@motorola.com + * Commented source code for Altivec version available at + * www.motorola.com/altivec + */ + +#ifndef TEST_OUTSIDE_LINUX +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include "../kernel/ppc_asm.tmpl" +#if 0 +#define v0 vr0 +#define v1 vr1 +#define v2 vr2 +#define v3 vr3 +#define v4 vr4 +#define v5 vr5 +#define v6 vr6 +#define v7 vr7 +#define v8 vr8 +#define v9 vr9 +#define v10 vr10 +#define v11 vr11 +#define v12 vr12 +#define v13 vr13 +#define v14 vr14 +#define v15 vr15 +#endif +#else +#define EFAULT 0 +#endif + + .text + +/* + * AltiVec versions of selected functions for use on AltiVec + * enabled G4 and later microprocessors. + */ +#if defined(__GNUC__) || defined(__MWERKS__) // gcc and codewarrior don't assemble dcba +#define DCBAR4R12 .long 0x7c0465ec +#else +#define DCBAR4R12 dcba r4,r12 +#endif + + .text + .align 4 +#ifndef TEST_OUTSIDE_LINUX +_GLOBAL(csum_partial_copy_generic_vec) +#else +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif + .global csum_partial_copy_generic_vec +csum_partial_copy_generic_vec: +#endif + li r12,32 + rlwinm r0,r5,31,1,31 + cmpi cr7,0,r5,48 + dcbt r3,r12 + cmpi cr6,0,r0,0 + addic r6,r6,0 + addi r11,r3,-2 + add r10,r4 ,r5 + bgt cr7,4f + andi. 
r12,r5,1 + addi r9,r4,-2 + add r12,r3,r5 + beq cr6,2f + mtctr r0 +1: lhzu r0,2(r11) +204: sthu r0,2(r9) + addc r6,r6,r0 + bdnz 1b +2: beq 3f +201: lbz r0,-1(r12 ) +202: stb r0,-1(r10) + rlwinm r0,r0,8,16,23 + addc r6,r6,r0 +3: addze r3,r6 + blr +4: lvsr v5,0,r4 + rlwinm r9,r4,0,28,31 + rlwinm r12,r3,0,28,31 + lvsr v7,r4,r5 + subf. r12,r12,r9 + subf r12,r3,r4 + lvsr v6,0,r12 + li r12,64 + vxor v0,v0,v0 + dcbt r3,r12 + cmpi cr1,0,r9,0 + vnor v1,v0,v0 + addi r9,r4,16 + addi r10,r10,-1 + vperm v5,v1,v0,v5 + bge 5f +401: lvx v2,0,r3 + addi r3,r3,16 +5: lvx v3,0,r3 + rlwinm r9,r9,0,0,27 + vperm v1,v0,v1,v7 + subf r11,r9,r10 + vxor v7,v7,v7 + vxor v11,v11,v11 + rlwinm r11,r11,28,4,31 + rlwinm r0,r10,0,28,31 + li r12,96 + cmpi cr5,0,r0,0xF + subf r0,r4,r9 + mtctr r11 + cmpi cr6,0,r11,4 + mtcrf 0x01,r0 + vperm v4,v2,v3,v6 + vor v2,v3,v3 + dcbt r3,r12 + beq cr1,9f + li r12,0 + vsel v4,v4,v0,v5 + bns cr7,6f +502: stvebx v4,r4,r12 + addi r12,r12,1 +6: bne cr7,7f +602: stvehx v4,r4,r12 + addi r12,r12,2 +7: bng cr7,8f +702: stvewx v4,r4,r12 + addi r12,r12,4 +8: bnl cr7,10f +802: stvewx v4,r4,r12 + addi r12,r12,4 +804: stvewx v4,r4,r12 + b 10f +9: stvx v4,0,r4 +10: vxor v8,v8,v8 + li r12,16 +11: lvx v3,r3,r12 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +112: stvx v4,r4,r12 + vadduwm v11,v9,v11 + addi r12,r12,16 + bdnzf 25,11b + add r9,r4,r12 + addi r11,r11,-1 + bgt cr6,19f +12: add r10,r4,r5 + add r11,r3,r5 + bge 13f + addi r11,r11,-16 +13: mtcrf 0x01,r10 + addi r0,r11,-1 +131: lvx v3,0,r0 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vperm v4,v2,v3,v6 + beq cr5,17f + vsel v4,v4,v0,v1 + rlwinm r10,r10,0,0,27 + li r9,0 + bnl cr7,14f +132: stvewx v4,r10,r9 + addi r9,r9,4 +134: stvewx v4,r10,r9 + addi r9,r9,4 +14: bng cr7,15f +142: stvewx v4,r10,r9 + addi r9,r9,4 +15: bne cr7,16f +152: stvehx v4,r10,r9 + addi r9,r9,2 +16: bns cr7,18f +162: stvebx v4,r10,r9 + b 18f +17: stvx v4,r4,r12 +18: vaddcuw v9,v4,v7 + vadduwm v12,v4,v7 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 +500: vmrglh v2,v0,v8 + vadduwm v11,v9,v11 + vmrghh v3,v0,v8 + rlwinm r10,r1,0,0,27 + vsumsws v0,v11,v0 + vadduwm v8,v2,v3 + li r12,-16 + vsumsws v8,v8,v0 +182: stvx v8,r10,r12 +183: lwz r3,-4(r10) + addc r3,r3,r6 + addze r3,r3 + blr +19: lvx v3,r3,r12 + addi r11,r11,-1 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r11,-2 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +192: stvx v4,r4,r12 + addi r12,r12,16 + vadduwm v11,v9,v11 + bdnzf 27,19b + mtcrf 0x02,r10 + addi r11,r3,96 + addi r9,r12,16 + bns cr6,20f + bdnz 20f +20: lvx v3,r3,r12 + addi r11,r11,32 + vaddcuw v9,v4,v7 +201: lvx v5,r3,r9 + vadduwm v12,v4,v7 + dcbt 0,r11 + vaddcuw v10,v12,v8 + DCBAR4R12 + vadduwm v8,v12,v8 + vperm v7,v2,v3,v6 +202: stvx v7,r4,r12 + vperm v4,v3,v5,v6 + vadduwm v9,v9,v10 + bdz 21f +21: stvx v4,r4,r9 + vor v2,v5,v5 + vadduwm v11,v9,v11 + addi r12,r9,16 + addi r9,r12,16 + bdnz 20b + bso cr6,22f + b 12b +22: lvx v3,r3,r12 + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vperm v4,v2,v3,v6 + vor v2,v3,v3 +222: stvx v4,r4,r12 + addi r12,r12,16 + b 12b + +/* Intent of this exception table is to store -EFAULT to *src_err or + * or *dst_err respectively, and (for an error on src) zero the rest + * of dst. Return checksum for only those bytes stored before error. + * (Can't quite figure out how this return value is used since there + * is no way to restart from the point of error. So I'll only return + * the checksum for actual buffer as stored in memory. 
Doesn't look + * like scalar version adds in bytes loaded but not stored.) + * + * Register useage here: + * r3 = src, return checksum + * r4 = dst + * r5 = (preserve as total byte count til near end) + * r6 = entering partial sum; accumulator for scalar result + * r7 = src_err + * r8 = dst_err + * r9 = bytes not copied + * r10= dst + byte count + * r11= number of quad words (vectors) + * r12= Byte Kount index + */ + +/* read fault, initial half-word copy */ +100: li r0,0 + sthu r0,2(r9) /* Zero rest of buffer */ + cmpi 0,r7,0 + beq 104f /* Go return checksum */ + li r0,-EFAULT + stw r0,0(r7) + b 104f + +/* write fault, initial half-word copy */ +101: cmpi 0,r8,0 + beq 104f + li r0,-EFAULT + stw r0,0(r8) + b 104f + +/* read fault, final single-byte copy */ +102: li r0,0 + stb r0,-1(r10) /* Zero remaining byte */ + cmpi 0,r7,0 + beq 104f + li r0,-EFAULT + stw r0,0(r7) + b 104f + +/* write fault, final single-byte copy */ +103: cmpi 0,r8,0 + beq 104f + li r0,-EFAULT + stw r0,0(r8) +104: addze r3,r6 + blr + +/* read fault, 1st and 2nd vector load */ +105: cmpi 0,r7,0 + beq 155f + li r0,-EFAULT + stw r0,0(r7) +155: rlwinm r0,r5,31,1,31 + andi. r12,r5,1 + mtctr r0 + addi r9,r4,-2 + li r0,0 +106: sthu r0,2(r9) + bdnz 106b + beq 107f + stb r0,2(r9) +107: addze r3,r6 + blr + +/* write fault, initial vector store(s) (Nothing stored yet) */ +108: cmpi 0,r8,0 + beq 109f + li r0,-EFAULT + stw r0,0(r8) +109: addze r3,r6 + blr + +/* read fault, load in 16B loop or final load */ +110: cmpi 0,r7,0 + beq 156f + li r0,-EFAULT + stw r0,0(r7) +156: add r11,r4,r5 /* Last dst byte + 1 */ + add r4,r4,r12 /* Current dst byte */ + rlwinm r4,r4,0,0,27 /* Rounded down */ + subf r5,r4,r11 + rlwinm. r0,r5,31,1,31 + addi r9,r4,-2 + cmpi 1,r0,0 + beq cr1,157f + mtctr r0 + li r0,0 +111: sthu r0,2(r9) + bdnz 111b +157: andi. r12,r5,1 + beq 18b + li r0,0 + stb r0,2(r9) + vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vxor v11,v11,v11 + b 500b /* Go sum across vector checksum */ + +/* write fault, store in 16B loop */ +1120: cmpi 0,r8,0 + beq 113f + li r0,-EFAULT + stw r0,0(r8) +113: b 500b + +/* write fault, final partial store(s) */ + +114: cmpi 0,r8,0 + vxor v11,v11,v11 + beq 115f + li r0,-EFAULT + stw r0,0(r8) +115: b 500b + +/* write fault, 1st store in 32B loop */ +116: cmpi 0,r8,0 + vadduwm v9,v9,v10 + beq 117f + li r0,-EFAULT + stw r0,0(r8) +117: b 500b + +/* write fault, 2nd store in 32B loop */ +118: cmpi 0,r8,0 + vxor v4,v4,v4 + vadduwm v11,v9,v11 + beq 119f + li r0,-EFAULT + stw r0,0(r8) +119: b 18b + +/* read fault, next to final load */ +120: cmpi 0,r7,0 + beq 121f + li r0,-EFAULT + stw r0,0(r7) +121: add r11,r4,r5 + add r4,r4,r12 + rlwinm r4,r4,0,0,27 + subf r5,r4,r11 + rlwinm. r0,r5,31,1,31 + addi r9,r4,-2 + cmpi 1,r0,0 + beq cr1,123f + mtctr r0 + li r0,0 +122: sthu r0,2(r9) + bdnz 122b +123: andi. r12,r5,1 + beq 124f + li r0,0 + stb r0,2(r9) +124: vaddcuw v9,v4,v8 + vadduwm v8,v4,v8 + vadduwm v11,v9,v11 + vxor v4,v4,v4 + b 18b + +/* write fault, 1st store in 32B loop */ +125: cmpi 0,r8,0 + vxor v4,v4,v4 + beq 126f + li r0,-EFAULT + stw r0,0(r8) +126: b 18b + +/* write or read fault in push/pop from stack. csumcpy complete. */ + +127: vxor v0,v0,v0 + vspltisw v2,1 + lis r5,0x8000 + vnor v1,v0,v0 + vmrglh v8,v0,v8 + li r10,17 + vsldoi v3,v0,v1,4 + li r3,0 + mtctr r10 + vsumsws v8,v8,v0 + vand v4,v2,v3 +128: vand v5,v8,v4 + rlwinm r5,r5,1,0,31 + vcmpequw. 
v6,v5,v4 + vsl v4,v4,v2 + bnl cr6,129f + or r3,r3,r5 +129: bdnz 128b + addc r3,r3,r6 + addze r3,r3 + blr + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 1b,100b + .long 204b,101b + .long 201b,102b + .long 202b,103b + .long 401b,105b + .long 5b,105b + .long 502b,108b + .long 602b,108b + .long 702b,108b + .long 802b,108b + .long 804b,108b + .long 9b,108b + .long 11b,110b + .long 112b,1120b + .long 131b,110b + .long 132b,114b + .long 134b,114b + .long 142b,114b + .long 152b,114b + .long 162b,114b + .long 17b,114b + .long 182b,127b + .long 183b,127b + .long 19b,110b + .long 192b,112b + .long 20b,110b + .long 201b,110b + .long 202b,116b + .long 21b,118b + .long 22b,120b + .long 222b,125b +#endif + + .text +#ifndef TEST_OUTSIDE_LINUX +_GLOBAL(csum_partial_vec) +#else +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif + .global csum_partial_vec +csum_partial_vec: +#endif + + li r12,32 + rlwinm r0,r4,31,1,31 + cmpi cr7,0,r4,48 + dcbt r3,r12 + cmpi cr6,0,r0,0 + addic r5,r5,0 + addi r11,r3,-2 + add r10,r3,r4 + bgt cr7,4f + andi. r12,r4,1 + beq cr6,2f + mtctr r0 +1: lhzu r0,2(r11) + addc r5,r5,r0 + bdnz 1b +2: beq 3f + lbz r0,-1(r10) + rlwinm r0,r0,8,16,23 + addc r5,r5,r0 +3: addze r3,r5 + blr +4: lvsr v5,0,r3 + addi r9,r3,16 + li r12,64 + lvsr v7,r3,r4 + rlwinm r9,r9,0,0,27 + addi r10,r10,-1 + lvx v2,0,r3 + subf r11,r9,r10 + vxor v0,v0,v0 + dcbt r3,r12 + rlwinm r11,r11,28,4,31 + vnor v1,v0,v0 + mtctr r11 + vxor v11,v11,v11 + vperm v5,v1,v0,v5 + cmpi cr6,0,r11,4 + vxor v8,v8,v8 + vperm v1,v0,v1,v7 + li r12,16 + vsel v2,v2,v0,v5 +5: lvx v3,r3,r12 + vaddcuw v9,v2,v8 + vadduwm v8,v2,v8 + vadduwm v11,v9,v11 + addi r12,r12,16 + vor v2,v3,v3 + bdnzf 25,5b + add r9,r3,r12 + addi r11,r11,-1 + bgt cr6,8f + vxor v3,v3,v3 +6: lvx v5,0,r10 + vaddcuw v9,v2,v3 + rlwinm r10,r10,0,28,31 + vadduwm v12,v2,v3 + cmpi cr7,0,r10,0xF + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + vadduwm v11,v9,v11 + beq cr7, 7f + vsel v5,v5,v0,v1 +7: vaddcuw v9,v5,v8 + vadduwm v8,v5,v8 + vadduwm v11,v9,v11 + vmrglh v2,v0,v8 + vmrghh v3,v0,v8 + rlwinm r10,r1,0,0,27 + vsumsws v0,v11,v0 + vadduwm v8,v2,v3 + li r12,-16 + vsumsws v8,v8,v0 + stvx v8,r10,r12 + lwz r3,-4(r10 ) + addc r3,r3,r5 + addze r3,r3 + blr + .align 4 +8: lvx v3,r3,r12 + addi r11,r11,-1 + vaddcuw v9,v2,v8 + vadduwm v8,v2,v8 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r11,-2 + vor v2,v3,v3 + addi r12,r12,16 + vadduwm v11,v9,v11 + bdnzf 27,8b + mtcrf 0x02,r10 + addi r11,r3,96 + vxor v3,v3,v3 + bns cr6,9f + bdnz 9f +9: lvx v5,r3,r12 + addi r12,r12,16 + vaddcuw v9,v2,v3 + lvx v6,r3,r12 + addi r11,r11,32 + vadduwm v12,v2,v3 + dcbt 0,r11 + addi r12,r12,16 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + bdz 10f +10: vadduwm v11,v9,v11 + vor v2,v5,v5 + vor v3,v6,v6 + bdnz 9b + bso cr6,11f + b 6b +11: lvx v5,r3,r12 + addi r12,r12,16 + vaddcuw v9,v2,v3 + vadduwm v12,v2,v3 + vaddcuw v10,v12,v8 + vadduwm v8,v12,v8 + vadduwm v9,v9,v10 + vadduwm v11,v9,v11 + vxor v3,v3,v3 + vor v2,v5,v5 + b 6b diff --git a/liboil/motovec/string_vec.S b/liboil/motovec/string_vec.S new file mode 100644 index 0000000..4da4a3e --- /dev/null +++ b/liboil/motovec/string_vec.S @@ -0,0 +1,1375 @@ +/* + * AltiVec versions (*_vec) of equivalent Linux library functions + * found in /arch/ppc/lib/string.S from Linux 2.4.17. Suggest this + * file be appended to that one when building a Linux kernel that + * will employ these functions. + * + * Copyright (C) Motorola, Inc. 
2003 + * + * Revision history: + * Rev 0.0 Original Chuck Corley 5/28/03 + * Contact at risc10@motorola.com + * Commented source code for Altivec version available at + * www.motorola.com/altivec + * + * AltiVec versions will only deal with L1_CACHE_LINE_SIZE=32 + */ + + +#ifndef TEST_OUTSIDE_LINUX +#include "../kernel/ppc_asm.tmpl" +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#if 0 +#define v0 vr0 +#define v1 vr1 +#define v2 vr2 +#define v3 vr3 +#define v4 vr4 +#define v5 vr5 +#define v6 vr6 +#define v7 vr7 +#define v8 vr8 +#define v9 vr9 +#define v10 vr10 +#define v11 vr11 +#define v12 vr12 +#define v13 vr13 +#define v14 vr14 +#define v15 vr15 +#endif +#else +#define EFAULT 0 +#define L1_CACHE_LINE_SIZE 32 +#define LG_L1_CACHE_LINE_SIZE 5 +#define MAX_L1_COPY_PREFETCH 1 +#endif + +/* AltiVec versions of selected functions for use on AltiVec + * enabled G4 and later microprocessors. + */ +#if defined(__GNUC__) || defined(__MWERKS__) /* gcc and codewarrior don't assemble dcba */ +#define DCBA_R3R7 .long 0x7c033dec +#define DCBA_R3R9 .long 0x7c034dec +#define DCBA_R0R8 .long 0x7c0045ec +#else +#define DCBA_R3R7 dcba r4,r7 +#define DCBA_R3R9 dcba r4,r9 +#define DCBA_R0R8 dcba 0,r8 +#endif + + .text + .align 5 + .global backwards_memcpy_vec +backwards_memcpy_vec: + nop + .global memmove_vec +memmove_vec: + nop + .global cacheable_memcpy_vec +cacheable_memcpy_vec: + nop + .global memcpy_vec +memcpy_vec: + subf. r7,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + addi r8,r4,-1 + addi r9,r3,-1 + add r10,r4,r5 + beqlr + add r11,r3,r5 + subf r0,r3,r4 + beqlr cr1 + bgt 2f + cmpi cr5,0,r0,128 + bgt cr7,23f + mtctr r5 +1: lbzu r0,1(r8) + stbu r0,1(r9) + bdnz 1b + blr +2: cmpi cr5,0,r7,128 + cmp cr6,0,r7,r5 + bgt cr7,4f + mtctr r5 +3: lbzu r0,-1(r10) + stbu r0,-1(r11) + bdnz 3b + blr + +4: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 + bge cr6,24f + lis r11,0x010c + subf. r8,r9,r8 + lvsr v2,0,r7 + ori r11,r11,0xffe0 + addi r11,r10,-1 + bgt 5f + addi r8,r8,16 +5: rlwinm r11,r11,0,0,27 + addi r7,r5,-1 + subf r0,r11,r10 + add r11,r3,r7 + addi r10,r3,16 + subf. 
r8,r0,r8 + rlwinm r0,r11,0,28,31 + rlwinm r10,r10,0,0,27 + blt 6f + lvx v1,r4,r7 + addi r4,r4,-16 +6: lvx v0,r4,r7 + subf r10,r10,r11 + cmpi cr7,0,r0,0xF + cmpi cr1,0,r9,0 + rlwinm r10,r10,28,4,31 + add r0,r3,r5 + cmpi cr6,0,r10,0 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + beq cr7,10f + mtcrf 0x01,r0 + rlwinm r11,r11 ,0,0,27 + li r9,0 + bnl cr7,7f + stvewx v3,r11,r9 + addi r9,r9,4 + stvewx v3,r11,r9 + addi r9,r9,4 +7: bng cr7,8f + stvewx v3,r11,r9 + addi r9,r9,4 +8: bne cr7,9f + stvehx v3,r11,r9 + addi r9,r9,2 +9: bns cr7,11f + stvebx v3,r11,r9 + b 11f +10: stvx v3,r3,r7 +11: addi r7,r7,-16 + ble cr6,13f + mtctr r10 + cmpi cr6,0,r10,4 +12: lvx v0,r4,r7 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + stvx v3,r3,r7 + addi r7,r7,-16 + bdnzf 25,12b + add r9,r3,r7 + bgt cr6,19f +13: blt 14f + addi r4,r4,16 +14: lvx v0,0,r4 + vperm v3,v0,v1,v2 + subfic r9,r3,16 + beq cr1,18f + mtcrf 0x01,r9 + li r9,0 + bns cr7,15f + stvebx v3,r3,r9 + addi r9,r9,1 +15: bne cr7,16f + stvehx v3,r3,r9 + addi r9,r9,2 +16: bng cr7,17f + stvewx v3,r3,r9 + addi r9,r9,4 +17: bnllr cr7 + stvewx v3,r3,r9 + addi r9,r9,4 + stvewx v3,r3,r9 + blr +18: stvx v3,0,r3 + blr +19: lvx v0,r4,r7 + mtcrf 0x02,r9 + vperm v3,v0,v1,v2 + vor v1,v0,v0 + addi r9,r9,-16 + stvx v3,r3,r7 + vor v7,v0,v0 + addi r7,r7,-16 + bdnzt 27,19b + lis r8,0x102 + mtcrf 0x02,r3 + addi r9,r7,-16 + ori r8,r8,0xffe0 + addi r11,r4,-64 + bso cr6,20f + bdnz 20f +20: lvx v6,r4,r7 + addi r11,r11,-32 + lvx v1,r4,r9 + vperm v3,v6,v7,v2 + DCBA_R3R9 + vperm v4,v1,v6,v2 + vor v7,v1,v1 + bdz 21f +21: stvx v3,r3,r7 + addi r7,r9,-16 + stvx v4,r3,r9 + addi r9,r7,-16 + bdnz 20b + bns cr6,22f + b 13b +22: lvx v1,r4,r7 + vperm v4,v1,v7,v2 + stvx v4,r3,r7 + b 13b + +23: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 +24: lis r10,0x010c + subf. r8,r8,r9 + lvsr v2,0,r7 + ori r10,r10,32 + dst r4,r10,0 + addi r10,r3,16 + addi r11,r11,-1 + bge 25f + lvx v0,0,r4 + addi r4,r4,16 +25: lvx v1,0,r4 + rlwinm r10,r10,0,0,27 + cmpi cr1,0,r9,0 + subf r0,r3,r10 + subf r10,r10,r11 + li r7,0 + mtcrf 0x01,r0 + rlwinm r10,r10,28,4,31 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + beq cr1,29f + bns cr7,26f + stvebx v3,r3,r7 + addi r7,r7,1 +26: bne cr7,27f + stvehx v3,r3,r7 + addi r7,r7,2 +27: bng cr7,28f + stvewx v3,r3,r7 + addi r7,r7,4 +28: bnl cr7,30f + stvewx v3,r3,r7 + addi r7,r7,4 + stvewx v3,r3,r7 + b 30f +29: stvx v3,0,r3 +30: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + cmpi cr7,0,r10,14 + ble cr6,32f + mtctr r10 + cmpi cr6,0,r10,4 +31: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 25,31b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,38f +32: add r11,r3,r5 + add r10,r4,r5 + bge 33f + addi r10,r10,-16 +33: mtcrf 0x01,r11 + addi r11,r11,-1 + addi r0,r10,-1 + lvx v1,0,r0 + dss 0 + dss 1 + vperm v3,v0,v1,v2 + beq cr1,37f + rlwinm r11,r11,0,0,27 + li r9,0 + bnl cr7,34f + stvewx v3,r11,r9 + addi r9,r9,4 + stvewx v3,r11,r9 + addi r9,r9,4 +34: bng cr7,35f + stvewx v3,r11,r9 + addi r9,r9,4 +35: bne cr7,36f + stvehx v3,r11,r9 + addi r9,r9,2 +36: bnslr cr7 + stvebx v3,r11,r9 + blr +37: stvx v3,r3,r7 + blr + +38: lvx v1,r4,r7 + addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r10,-2 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 27,38b + mtcrf 0x02,r11 + lis r8,0x104 + addi r9,r7,16 + ori r8,r8,32 + rlwinm r11,r0,29,3,31 + rlwinm r0,r0,0,0,28 + bgt cr7,43f +39: addi r11,r4,256 + xoris r8,r8,0x6 + bns cr6,40f + bdnz 40f +40: lvx v1,r4,r7 + addi r11,r11,32 + lvx v6,r4,r9 + vperm v4,v0,v1,v2 + dst r11,r8,1 + DCBA_R3R7 + vperm 
v3,v1,v6,v2 + vor v0,v6,v6 + bdz 41f +41: stvx v4,r3,r7 + addi r7,r9,16 + stvx v3,r3,r9 + addi r9,r7,16 + bdnz 40b + bso cr6,42f + b 32b +42: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + stvx v3,r3,r7 + addi r7,r7,16 + b 32b + +43: subf r10,r0,r10 + blt cr5,39b + mtctr r11 + addi r11,r4,256 +44: lvx v1,r4,r7 + addi r9,r7,32 + addi r11,r11,128 + lvx v7,r4,r9 + addi r9,r9,32 + lvx v9,r4,r9 + addi r9,r9,32 + lvx v11,r4,r9 + addi r9,r7,16 + lvx v6,r4,r9 + addi r9,r9,32 + lvx v8,r4,r9 + addi r9,r9,32 + lvx v10,r4,r9 + addi r9,r9,32 + vperm v3,v0,v1,v2 + lvx v0,r4,r9 + vperm v4,v1,v6,v2 + dst r11,r8,1 + DCBA_R3R7 + stvx v3,r3,r7 + addi r7,r7,16 + vperm v5,v6,v7,v2 + stvx v4,r3,r7 + addi r7,r7,16 + vperm v6,v7,v8,v2 + DCBA_R3R7 + stvx v5,r3,r7 + addi r7,r7,16 + vperm v7,v8,v9,v2 + stvx v6,r3,r7 + addi r7,r7,16 + vperm v8,v9,v10,v2 + DCBA_R3R7 + stvx v7,r3,r7 + addi r7,r7,16 + vperm v9,v10,v11,v2 + stvx v8,r3,r7 + addi r7,r7,16 + vperm v10,v11,v0,v2 + DCBA_R3R7 + stvx v9,r3,r7 + addi r7,r7,16 + stvx v10,r3,r7 + addi r7,r7,16 + bdnz 44b + mtctr r10 + addi r9,r7,16 + bns cr6,40b + bdnz 40b + + .global bcopy_vec +bcopy_vec: + mr r0,r3 + mr r3,r4 + mr r4,r0 + b memcpy_vec + + .text + .align 4 + .globl __clear_user_vec +__clear_user_vec: + mr r5,r4 + li r4,0 + .globl memset_vec +memset_vec: + cmpi cr7,0,r5,16 + cmpi cr1,0,r5,0 + rlwinm. r8,r4,28,28,3 + addi r9,r3,-1 + addi r10,r3,16 + add r6,r3,r5 + bgt cr7,2f + mtctr r5 + beqlr cr1 +1: stbu r4,1(r9) + bdnz 1b + blr +2: rlwinm r10,r10,0,0,27 + addi r11,r6,-1 + subf r9,r3,r10 + li r7,0 + vxor v0,v0,v0 + subf r10,r10 ,r11 + cmpi cr1,0,r9,16 + beq 3f + lvsl v0,0,r8 + vspltisb v1,4 + lvsl v2,0,r4 + vslb v0,v0,v1 + vor v0,v0,v2 + vspltb v0,v0,0 +3: mtcrf 0x01,r9 + rlwinm r10,r10,28,4,31 + beq cr1,7f + bns cr7,4f +32: stvebx v0,r3,r7 + addi r7,r7,1 +4: bne cr7,5f +42: stvehx v0,r3,r7 + addi r7,r7,2 +5: bng cr7,6f +52: stvewx v0,r3,r7 + addi r7,r7,4 +6: bnl cr7,8f +62: stvewx v0,r3,r7 + addi r7,r7,4 +64: stvewx v0,r3,r7 + b 8f +7: stvx v0,0,r3 +8: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + ble cr6,10f + mtctr r10 + cmpi cr6,0,r10,4 +9: stvx v0,r3,r7 + addi r7,r7,16 + bdnzf 25,9b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,16f +10: mtcrf 0x01,r6 + beq cr1,14f + rlwinm r11,r11,0,0,27 + li r9,0 + bnl cr7,11f +102: stvewx v0,r11,r9 + addi r9,r9,4 +104: stvewx v0,r11,r9 + addi r9,r9,4 +11: bng cr7,12f +112: stvewx v0,r11,r9 + addi r9,r9,4 +12: bne cr7,13f +122: stvehx v0,r11,r9 + addi r9 ,r9 ,2 +13: bnslr cr7 +132: stvebx v0,r11,r9 + blr +14: stvx v0,r3,r7 + blr + +16: addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 +162: stvx v0,r3,r7 + addi r7,r7,16 + bdnzf 27,16b + mtcrf 0x02,r11 + bns cr6,17f + bdnz 17f +17: stvx v0,r3,r7 + addi r7,r7,16 + bdz 18f +18: stvx v0,r3,r7 + addi r7,r7,16 + bdnz 17b + bso cr6,19f + b 10b +19: stvx v0,r3,r7 + addi r7,r7,16 + b 10b + +/* Intent of this exception table appears to be to return the byte count */ +/* remaining to be cleared when the current store error occurred. Chuck */ +/* Memset doesn't require it but the code is identical to __clear_user */ +/* FIRST FAILURE CHECKED BY RECOMPILATION WITH BRANCHES SUBSTITUTED + * FOR STORES. 
chuckc 030515 +*/ + +91: mfctr r3 /* Return byte count remaining */ + blr +92: subf r3,r7,r5 /* BC minus bytes already stored */ + blr +93: mr r3,r5 /* Nothing stored yet */ + blr +94: add r11,r3,r5 + rlwinm r6,r11,0,28,31 /* Bytes in last vector */ + b 99f +95: add r11,r3,r5 + rlwinm r6,r11,0,28,31 + subf r3,r9,r6 + blr +96: li r3,16 /* 16 bytes in last vector to be stored. */ + blr +97: add r11,r3,r5 + rlwinm r6,r11,0,27,31 +99: mfctr r3 + rlwinm r3,r3,4,0,27 + add r3,r3,r6 + blr +98: add r11,r3,r5 + rlwinm r3,r11,0,27,31 + blr + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 1b,91b + .long 32b,92b + .long 42b,92b + .long 52b,92b + .long 62b,92b + .long 64b,92b + .long 7b,93b + .long 9b,94b + .long 102b,95b + .long 104b,95b + .long 112b,95b + .long 122b,95b + .long 132b,95b + .long 14b,96b + .long 162b,94b + .long 17b,97b + .long 18b,97b + .long 19b,98b +#endif + .text +/* Scalar __copy_tofrom_user always copies forward and never checks + * for overlap, __copy_tofrom_user_vec will do the same except it will + * check that overlap is > 128B before entering 128B loop when copying + * forward. + * The scalar version always assumes the destination and source + * are word aligned. This routine will assume the same to simplify handling + * exceptions. chuckc + */ + + .globl __copy_tofrom_user_vec +__copy_tofrom_user_vec: + subf. r7,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + addi r8,r4,-1 + addi r9,r3,-1 + add r10,r4,r5 + beqlr + add r11,r3,r5 + subf r0,r3,r4 + beqlr cr1 + bgt 1f + cmpi cr5,0,r0,128 /* Overlap |(DST-SRC)|> 128B? */ + bgt cr7,23f /* b to v_memcpy */ +1: cmpi cr5,0,r7,128 /* Overlap |(DST-SRC)|> 128B? */ + bgt cr7,23f /* b to v_memcpy */ + mtctr r5 +2: lbzu r0,1(r8) +202: stbu r0,1(r9) + bdnz 2b + li r3,0 + blr + +23: rlwinm r8,r4,0,28,31 + rlwinm r9,r3,0,28,31 +24: lis r10,0x010c + subf. 
r8,r8,r9 + lvsr v2,0,r7 + ori r10,r10,32 + dst r4,r10,0 + addi r10,r3,16 + addi r11,r11,-1 + bge 25f +241: lvx v0,0,r4 + addi r4,r4,16 +25: lvx v1,0,r4 + rlwinm r10,r10,0,0,27 + cmpi cr1,0,r9,0 + subf r0,r3,r10 + subf r10,r10,r11 + li r7,0 + mtcrf 0x01,r0 + rlwinm r10,r10,28,4,31 + vperm v3,v0,v1,v2 + vor v0,v1,v1 + beq cr1,29f + bns cr7,26f +252: stvebx v3,r3,r7 + addi r7,r7,1 +26: bne cr7,27f +262: stvehx v3,r3,r7 + addi r7,r7,2 +27: bng cr7,28f +272: stvewx v3,r3,r7 + addi r7,r7,4 +28: bnl cr7,30f +282: stvewx v3,r3,r7 + addi r7,r7,4 +284: stvewx v3,r3,r7 + b 30f +29: stvx v3,0,r3 +30: rlwinm r0,r11,0,28,31 + cmpi cr6,0,r10,0 + li r7,16 + cmpi cr1,0,r0,0xF + cmpi cr7,0,r10,14 + ble cr6,32f + mtctr r10 + cmpi cr6,0,r10,4 +31: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +312: stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 25,31b + add r9,r3,r7 + addi r10,r10,-1 + bgt cr6,38f +32: add r11,r3,r5 + add r10,r4,r5 + bge 33f + addi r10,r10,-16 +33: mtcrf 0x01,r11 + addi r11,r11,-1 + addi r0,r10,-1 +331: lvx v1,0,r0 + dss 0 + dss 1 + vperm v3,v0,v1,v2 + beq cr1,37f + rlwinm r11,r11,0,0,27 + li r9,0 + li r3,0 + bnl cr7,34f +332: stvewx v3,r11,r9 + addi r9,r9,4 +334: stvewx v3,r11,r9 + addi r9,r9,4 +34: bng cr7,35f +342: stvewx v3,r11,r9 + addi r9,r9,4 +35: bne cr7,36f +352: stvehx v3,r11,r9 + addi r9,r9,2 +36: bnslr cr7 +362: stvebx v3,r11,r9 + blr +37: stvx v3,r3,r7 + li r3,0 + blr + + .align 4 +38: lvx v1,r4,r7 + addi r10,r10,-1 + mtcrf 0x02,r9 + addi r9,r9,16 + addi r0,r10,-2 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +382: stvx v3,r3,r7 + addi r7,r7,16 + bdnzf 27,38b + mtcrf 0x02,r11 + lis r8,0x104 + addi r9,r7,16 + ori r8,r8,32 + rlwinm r11,r0,29,3,31 + rlwinm r0,r0,0,0,28 + bgt cr7,43f +39: addi r11,r4,256 + xoris r8,r8,0x6 + bns cr6,40f + bdnz 40f +40: lvx v1,r4,r7 + addi r11,r11,32 +401: lvx v6,r4,r9 + vperm v4,v0,v1,v2 + dst r11,r8,1 + DCBA_R3R7 + vperm v3,v1,v6,v2 + vor v0,v6,v6 +402: stvx v4,r3,r7 + addi r7,r9,16 + bdz 41f +41: stvx v3,r3,r9 + addi r9,r7,16 + bdnz 40b + bso cr6,42f + b 32b +42: lvx v1,r4,r7 + vperm v3,v0,v1,v2 + vor v0,v1,v1 +422: stvx v3,r3,r7 + addi r7,r7,16 + b 32b + +43: subf r10,r0,r10 + blt cr5,39b + mtctr r11 + addi r11,r4,256 +44: lvx v1,r4,r7 + addi r9,r7,32 + addi r11,r11,128 +443: lvx v7,r4,r9 + addi r9,r9,32 +447: lvx v9,r4,r9 + addi r9,r9,32 +451: lvx v11,r4,r9 + addi r9,r7,16 +441: lvx v6,r4,r9 + addi r9,r9,32 +445: lvx v8,r4,r9 + addi r9,r9,32 +449: lvx v10,r4,r9 + addi r9,r9,32 + vperm v3,v0,v1,v2 +453: lvx v0,r4,r9 + vperm v4,v1,v6,v2 + dst r11,r8,1 + DCBA_R3R7 +440: stvx v3,r3,r7 + addi r7,r7,16 + vperm v5,v6,v7,v2 +442: stvx v4,r3,r7 + addi r7,r7,16 + vperm v6,v7,v8,v2 + DCBA_R3R7 +444: stvx v5,r3,r7 + addi r7,r7,16 + vperm v7,v8,v9,v2 +446: stvx v6,r3,r7 + addi r7,r7,16 + vperm v8,v9,v10,v2 + DCBA_R3R7 +448: stvx v7,r3,r7 + addi r7,r7,16 + vperm v9,v10,v11,v2 +450: stvx v8,r3,r7 + addi r7,r7,16 + vperm v10,v11,v0,v2 + DCBA_R3R7 +452: stvx v9,r3,r7 + addi r7,r7,16 +454: stvx v10,r3,r7 + addi r7,r7,16 + bdnz 44b + mtctr r10 + addi r9,r7,16 + bns cr6,40b + bdnz 40b + +/* Intent of this exception table is to return: + * r3 = bytes not copied (but preserve dst address in r3 til end) + * r4 = 0 on read fault; 1 on write fault + * Register useage here: + * r5 = (preserve as total byte count til near end) + * r6 = bytes not copied (move to r3 at end) + * r7 = byte count index from memcpy_vec + * r9 = alternate byte count index in 128B loop + * r10= vectors (QWs remaining) after 128B loop + * r11= next destination address (assume word-aligned) + * For read fault, clear 
out the destination for bytes remaining + * starting at r3(dst) + r5(byte count) - r6 (bytes remaining). + */ + + +/* read fault, initial single-byte copy */ +100: li r4,0 + mfctr r3 +101: stbu r4,1(r9) + bdnz 101b + blr + +/* write fault, initial single-byte copy */ +102: li r4,1 + mfctr r3 + blr + +/* read fault, initial vector(s) load */ +103: li r4,0 + b 91f + +/* write fault, initial partial vector store */ +104: li r4,1 + subf r5,r7,r5 /* BC minus bytes in 1st vector already stored */ + add r3,r3,r7 /* dst plus bytes in 1st vector already stored. */ + b 91f + +/* write fault, initial full vector store */ +105: li r4,1 +91: mr r6,r5 + b 98f + +/* read fault in 16B loop(s) and 32B loop (treat as both loads fail)*/ +106: li r4,0 + b 94f + +/* write fault in 16B loop(s), 128B, and first write fault in 32B loop */ +107: li r4,1 + b 94f + +/* second write fault in 32B loop */ +108: li r4,1 + add r11,r3,r5 /* Last dst byte + 1 */ + add r3,r3,r9 /* Current dst byte */ + b 95f + +/* read fault in 128B loop (treat as all loads fail)*/ +112: li r4,0 + mfctr r0 + slwi r0,r0,7 /* Convert 128B loop ctr to bytes */ + add r11,r3,r5 + slwi r10,r10,4 /* convert QW vectors remaining to bytes */ + add r3,r3,r7 + rlwinm r6,r11,0,28,31 /* Bytes in last vector(s) */ + rlwinm r3,r3,0,0,27 + add r6,r6,r10 + add r6,r6,r0 + b 98f + +/* read fault, final vector(s) load */ +114: li r4,0 +94: add r11,r3,r5 + add r3,r3,r7 +95: rlwinm r3,r3,0,0,27 + subf r6,r3,r11 + b 98f + +/* write fault, final partial vector store */ +115: li r4,1 + add r11,r3,r5 + add r3,r3,r7 + rlwinm r3,r3,0,0,27 + subf r6,r3,r11 + subf r6,r9,r6 /* minus bytes already stored */ + b 98f + +/* write fault, final full vector store */ +116: li r4,1 + add r3,r3,r7 + rlwinm r3,r3,0,0,27 + li r6,16 + b 98f + +/* + * At this stage the number of bytes not copied is in r6 + * and r4 is 0 for read or 1 for write. + * (Like the scalar version, assume dst is word-aligned.) + */ +98: cmpwi 0,r4,0 + bne 120f +/* for read fault, clear out the destination: r6 bytes remaining + */ + srwi. r0,r6,2 + addi r3,r3,-4 + subf r10,r6,r5 + mtctr r0 + beq 118f +117: stwu r4,4(r3) + bdnz 117b +118: andi. r0,r6,3 + mtctr r0 + beq 120f +119: stb r4,4(r3) + addi r3,r3,1 + bdnz 119b +120: mr r3,r6 + blr + +121: li r4,1 + mfctr r3 + rlwinm r3,r3,2,0,29 + andi. r0,r6,3 + add r3,r3,r0 + blr + + +#ifndef TEST_OUTSIDE_LINUX + .section __ex_table,"a" + .align 2 + .long 2b,100b + .long 202b,102b + .long 241b,103b + .long 25b,103b + .long 252b,104b + .long 262b,104b + .long 272b,104b + .long 282b,104b + .long 284b,104b + .long 29b,105b + .long 31b,106b + .long 312b,107b + .long 331b,114b + .long 332b,115b + .long 334b,115b + .long 342b,115b + .long 352b,115b + .long 362b,115b + .long 37b,116b + .long 38b,106b + .long 382b,107b + .long 40b,106b + .long 401b,106b + .long 402b,107b + .long 41b,108b + .long 42b,106b + .long 422b,107b + .long 44b,112b + .long 443b,112b + .long 447b,112b + .long 451b,112b + .long 441b,112b + .long 445b,112b + .long 449b,112b + .long 453b,112b + .long 440b,107b + .long 442b,107b + .long 444b,107b + .long 446b,107b + .long 448b,107b + .long 450b,107b + .long 452b,107b + .long 454b,107b + .long 101b,102b + .long 117b,121b + .long 119b,102b +#endif + + .text + .align 5 + + .global strlen_vec +strlen_vec: + + lvxl v2,0,r3 + vxor v0,v0,v0 + lvsl v5,0,r3 + vnor v1,v0,v0 + rlwinm r5,r3,0,28,31 + vperm v2,v2,v1,v5 + mr r4,r3 + li r3,16 + vcmpequb. v4,v0,v2 + vsldoi v5,v0,v1,8 + bne cr6,2f + subf r3,r5,r3 +1: lvxl v2,r4,r3 + addi r3,r3,16 + vcmpequb. 
v4,v0,v2 + beq cr6,1b +2: vandc v3,v2,v5 + vsldoi v7,v0,v1,4 + vcmpequb. v4,v3,v5 + vsldoi v8,v0,v1,12 + beq cr6,10f + vandc v3,v2,v8 + vsldoi v5,v0,v1,10 + vcmpequb. v4,v3,v8 + vsldoi v9,v0,v1,14 + beq cr6,6f + vandc v3,v2,v9 + vsldoi v8,v0,v1,13 + vcmpequb. v4,v3,v9 + vsldoi v10,v0,v1,15 + beq cr6,4f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,3f + addi r3,r3,-16 + blr +3: addi r3,r3,-15 + blr + +4: vandc v3,v2,v8 + vcmpequb. v4,v3,v8 + beq cr6,5f + addi r3,r3,-14 + blr +5: addi r3,r3,-13 + blr + +6: vandc v3,v2,v5 + vsldoi v9,v0,v1,9 + vcmpequb. v4,v3,v5 + vsldoi v10,v0,v1,11 + beq cr6,8f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,7f + addi r3,r3,-12 + blr +7: addi r3,r3,-11 + blr + +8: vandc v3,v2,v9 + vcmpequb. v4,v3,v9 + beq cr6,9f + addi r3,r3,-10 + blr +9: addi r3,r3,-9 + blr + +10: vandc v3,v2,v7 + vsldoi v5,v0,v1,2 + vcmpequb. v4,v3,v7 + vsldoi v10,v0,v1,6 + beq cr6,14f + vandc v3,v2,v10 + vsldoi v9,v0,v1,5 + vcmpequb. v4,v3,v10 + vsldoi v7,v0,v1,7 + beq cr6,12f + vandc v3,v2,v7 + vcmpequb. v4,v3,v7 + beq cr6,11f + addi r3,r3,-8 + blr +11: addi r3,r3,-7 + blr + +12: vandc v3,v2,v9 + vcmpequb. v4,v3,v9 + beq cr6,13f + addi r3,r3,-6 + blr +13: addi r3,r3,-5 + blr + +14: vandc v3,v2,v5 + vsldoi v8,v0,v1,1 + vcmpequb. v4,v3,v5 + vsldoi v10,v0,v1,3 + beq cr6,16f + vandc v3,v2,v10 + vcmpequb. v4,v3,v10 + beq cr6,15f + addi r3,r3,-4 + blr +15: addi r3,r3,-3 + blr + +16: vandc v3,v2,v8 + vcmpequb. v4,v3,v8 + beq cr6,17f + addi r3,r3,-2 + blr +17: addi r3,r3,-1 + blr + + .text + .align 5 + + .global strcmp_vec +strcmp_vec: + lvxl v2,0,r3 + vxor v0,v0,v0 + addi r7,r4,16 + lvxl v3,0,r4 + vnor v1,v0,v0 + xor r8,r7,r4 + lvsl v6,0,r3 + vspltisb v4,8 + cmpi 2,0,r8,0x1000 + lvsl v10,0,r4 + vspltisb v12,1 + beq 2,8f +1: andi. r8,r3,0xF + lvxl v8,0,r7 + vslb v13,v4,v12 + andi. r9,r4,0xF + vperm v2,v2,v1,v6 + subf. r0,r8,r9 + addi r5,r3,16 + vperm v9,v0,v1,v6 + lvsl v6,0,r0 + vor v7,v3,v3 + vperm v3,v3,v8,v10 + addi r4,r7,16 + vslb v11,v13,v12 + vor v3,v3,v9 + xor r3,r3,r3 + vcmpequb. v10,v2,v3 + vslb v14,v11,v12 + vnor v9,v10,v10 + bc 4,6*4+0,3f + vcmpequb. v5,v0,v2 + bc 4,6*4+2,7f + blt 6f +2: lvxl v7,0,r4 + addi r4,r4,16 + lvxl v2,0,r5 + addi r5,r5,16 + vperm v3,v8,v7,v6 + vcmpequb. v10,v2,v3 + vnor v9,v10,v10 + bc 12,6*4+0,5f +3: vcmpequb v5,v0,v2 + vsum4ubs v7,v4,v14 + vor v9,v9,v5 + vsro v12,v9,v11 + vsrw v11,v9,v4 + vsro v6,v9,v14 + vsrw v14,v9,v13 + vsro v13,v9,v7 + vor v9,v12,v6 + vsro v7,v14,v4 + vor v9,v9,v13 + vcmpgtuw v9,v9,v0 + vor v9,v9,v11 + vor v9,v9,v14 + vor v9,v9,v7 + vandc v11,v10,v9 + vcmpequb. v14,v11,v9 + vcmpgtub v7,v3,v2 + bc 12,6*4+2,4f + vandc v11,v7,v9 + li r3,-1 + vcmpequb. v14,v11,v1 + bc 4,6*4+2,4f + li r3,1 +4: blr + +5: vcmpequb. v5,v0,v2 + bc 4,6*4+2,7f + lvxl v8,0,r4 + addi r4,r4,16 +6: lvxl v2,0,r5 + addi r5,r5,16 + vperm v3,v7,v8,v6 + vcmpequb. v10,v2,v3 + vnor v9,v10,v10 + bc 4,6*4+0,3b + vcmpequb. v5,v0,v2 + bc 12,6*4+2,2b +7: blr + +8: vcmpequb. v5,v0,v2 + bc 13,6*4+2,1b + vcmpequb. v10,v2,v3 + bc 4,6*4+0,3b + blr + + + .text + .align 5 + .global memcmp_vec +memcmp_vec: + subf. r6,r4,r3 + cmpi cr1,0,r5,0 + cmpi cr7,0,r5,16 + add r9,r3,r5 + addi r7,r4,-1 + addi r11,r3,16 + beq 2f + addi r10,r9,-1 + addi r8,r3,-1 + rlwinm r11,r11,0,0,27 + beq cr1,2f + subf r11,r11,r10 + rlwinm r9,r9,0,28,31 + bgt cr7,3f + mtctr r5 +1: lbzu r6,1(r7) + lbzu r10,1(r8) + subf. r3,r6,r10 + bdnzt 2,1b + blr + +2: xor r3,r3,r3 + blr +3: rlwinm r11,r11,28,4,31 + rlwinm r7,r4,0,28,31 + rlwinm r8,r3,0,28,31 + cmpi cr1,0,r11,0 + lvxl v0,0,r3 + subf. 
r7,r7,r8 + li r7,16 + lvxl v1,0,r4 + vor v2,v1,v1 + addi r5,r5,-1 + bge 4f + lvxl v2,r4,r7 + addi r4,r4,16 + addi r5,r5,-16 +4: lvsl v3,0,r3 + vspltisb v4,8 + vxor v5,v5,v5 + lvsl v6,0,r4 + vspltisb v7,1 + vnor v8,v5,v5 + lvsr v10,0,r6 + cmpi cr5,0,r9,0 + vperm v11,v5,v8,v3 + lvsr v12,0,r9 + vperm v0,v0,v8,v3 + vperm v1,v1,v2,v6 + vslb v3,v4,v7 + vor v1,v1,v11 + vslb v6,v3,v7 + vcmpequb. v8,v0,v1 + vslb v7,v6,v7 + vnor v13,v8,v8 + bc 4,6*4+0,8f + ble cr1,6f + mtctr r11 +5: lvxl v9,r4,r7 + lvxl v0,r3,r7 + addi r7,r7,16 + vperm v1,v2,v9,v10 + vor v2,v9,v9 + vcmpequb. v8,v0,v1 + vnor v13,v8,v8 + bdnzt 24,5b + bc 4,6*4+0,8f +6: lvxl v9,r4,r5 + vperm v12,v5,v8,v12 + lvxl v0,r3,r7 + vperm v1,v2,v9,v10 + beq cr5,7f + vor v1,v1,v12 + vor v0,v0,v12 +7: vcmpequb. v8,v0,v1 + vnor v13,v8,v8 + bc 4,6*4+0,8f + xor r3,r3,r3 + blr +8: vsum4ubs v2,v4,v7 + vsro v9,v13,v6 + vsrw v6,v13,v4 + vsro v10,v13,v7 + vsrw v7,v13,v3 + vsro v3,v13,v2 + vor v11,v9,v10 + vsro v2,v7,v4 + vor v11,v11,v3 + vcmpgtuw v11,v11,v5 + vor v11,v11,v6 + vor v11,v11,v7 + vor v11,v11,v2 + vor v1,v1,v11 + vor v0,v0,v11 + li r3,-1 + vcmpgtub. v8,v1,v0 + bclr 4,6*4+2 + li r3,1 + blr + + .text + .align 5 + .global strcpy_vec +strcpy_vec: + addi r5,r3,32 + subf. r6,r4,r3 + subf r7,r3,r4 + rlwinm r5,r5,0,0,26 + mr r8,r3 + beqlr + bgt 1f + mr r6,r7 +1: subf. r9,r3,r5 + addi r5,r8,4096 + cmpi cr7,0,r6,16 + mtctr r9 +2: lbzx r0,0,r4 + addi r4,r4,1 + cmpi cr1,0,r0,0 + stbx r0,0,r8 + addi r8,r8,1 + bdnzf 6,2b + beqlr cr1 + li r11,4096 + rlwinm r5,r5,0,0,19 + mr r10,r4 + ble cr7,2b + subf. r5,r8,r5 + rlwinm r5,r5,28,4,31 + lvsl v4,0,r4 + vxor v0,v0,v0 + ble 9f + mtctr r5 +3: lvx v1,0,r10 + addi r10,r10,16 + bdz 10f +4: lvx v2,0,r10 + addi r10,r10,16 + bdz 11f +5: lvx v3,0,r10 + addi r10,r10,16 + bdz 12f +6: vperm v5,v1,v2,v4 + vperm v6,v2,v3,v4 + vor v1,v3,v3 + vcmpequb. v7,v0,v5 + bne cr6,8f + addi r4,r4,16 + vcmpequb. v7,v0,v6 + bne cr6,7f + DCBA_R0R8 + addi r4,r4,16 + stvx v5,0,r8 + addi r8,r8,16 + stvx v6,0,r8 + addi r8,r8,16 + b 4b +7: stvx v5,0,r8 + addi r8,r8,16 +8: lbzx r0,0,r4 + addi r4,r4,1 + cmpi cr1,0,r0,0 + stbx r0,0,r8 + addi r8,r8,1 + bne cr1,8b + blr + +9: mtctr r11 + b 3b +10: vcmpequb. v7,v0,v1 + bnl cr6,8b + mtctr r11 + b 4b +11: vcmpequb. v7,v0,v2 + bnl cr6,8b + mtctr r11 + b 5b +12: vcmpequb. v7,v0,v3 + bnl cr6,8b + mtctr r11 + b 6b diff --git a/liboil/motovec/vec_csum.S b/liboil/motovec/vec_csum.S new file mode 100644 index 0000000..29ddd11 --- /dev/null +++ b/liboil/motovec/vec_csum.S @@ -0,0 +1,724 @@ +//------------------------------------------------------------------ +// file: vec_csum.S +// AltiVec enabled version of linux' checksum routines +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. 
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern unsigned long csum_partial_copy_generic(src, dst, len, sum, +// src_err, dst_err); +// Computes the checksum of a memory block at src, length len, +// and adds in "sum" (32-bit), while copying the block to dst. +// Returns: +// unsigned long sum +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern unsigned long csum_partial(buff, len, sum); +// +// computes the checksum of a memory block at buff, length len, +// and adds in "sum" (32-bit unsigned long) +// Returns: +// unsigned long sum +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Assumptions from studying the original linux code: +// Copying forward is always safe +// src and dst are always half-word aligned +// len may be odd or even 0-n; +// there is no test to see if src and dst are equal. +// returns unsigned int checksum +// +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 04/19/03 +// +// This is alpha quality code; users are encouraged to make it faster. +// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) + +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 32 bytes. 
+#define MIN_VEC 48 // Experimentally chosen on 7455@1GHz/133 to beat scalar + + // Register useage +#define Rt r0 // r0 when used as a temporary register + +#define SRC r3 // entering: src ptr; exiting: unsigned long checksum + +#define DST r4 // entering: dst pointer; exiting: + +#define BC r5 // entering: Byte_Count + +#define SUM r6 // entering: Partial checksum + +#define SER r7 // entering: src_err address + +#define DER r8 // entering: dst_err address + +#define DM2 r9// dst -2 for hw-by-hw forwards initially +#define D r9 // dst[28:31] +#define DR r9 // dst[0:27] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define DBC r10// dst + byte count initially +#define DBK r10// (dst+byte_count-1) then (dst+byte_count-1)[28:31] + +#define SM2 r11// src -2 for hw-by-hw forwards initially +#define QW r11 // number of quad words (vectors) +#define SP8 r11 // data stream touch block & stride info for Big_loop +#define SBC r11// src + byte count initially then src[28:31] + +#define BK r12 // Byte Kount index +#define BLK r12 // temporary data stream touch block & stride info +#define S r12// src[28:31] +#define DMS r12 // dst - src initially + +#define V0 v0 // all zeros +#define VCARS v0 // sum of carries + +#define V1 v1 // all ones +#define VMM v1 // mask for final dst right + +#define VS0 v2 // src vector for permuting +#define VL v2 // low data + +#define VS1 v3 // src vector for permuting +#define VH v3 // high data + +#define VPS0 v4 // permuted source vector to store + +#define VP2 v5 // dst permute register +#define VM v5 // mask for first dst left +#define VS2 v5 // src vector for permuting + +#define VP3 v6 // d - s permute register +#define VS3 v6 // 4th src vector in csum_partial + +#define VP4 v7 // Byte_Count permute register +#define VPS1 v7 // 2nd permuted source vector to store + +#define VSUM v8 // Updated sum +#define VFIN v8 // final sum + +#define VCAR1 v9 // temp register for carries +#define VCAR3 v9 // temp register for carries + +#define VCAR2 v10 // temp register for carries + +#define VCARF v11 // temp register for carries + +#define VTEMP v12 // Temp register + + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcba is a performance boost. +#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c0465ec +// dcba r4,r12 or dcba DST,BK +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c0465ec +.endm +#else +#define DCBK dcba DST,BK +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#endif // NO_DCBA + +// Conditionalize the use of dst (data stream touch). It will help +// if the data is not in cache and hurt if it is (though not as badly +// as dcbz). Generally, except for small benchmarks repeated many times, +// we assume data is not in cache (data streaming) and using dst is a +// performance boost. 
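For reference, a rough scalar model in C of the checksum csum_partial is documented to compute — an editorial sketch with an invented name, not part of the Motorola source. It assumes big-endian halfwords (as on PowerPC) and stands in for the lhzu/addc loop and the rlwinm/addze epilogue that appear further below; it is a statement of intent, not a line-by-line equivalent of the vector code.

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch only: 32-bit accumulation of 16-bit big-endian halfwords,
     * odd trailing byte contributing as the high byte of a final halfword. */
    static uint32_t csum_partial_model(const uint8_t *buf, size_t len, uint32_t sum)
    {
        uint64_t acc = sum;                    /* wide accumulator stands in for the CA bit */

        while (len >= 2) {                     /* halfword-by-halfword, like the lhzu/addc loop */
            acc += (uint32_t)((buf[0] << 8) | buf[1]);
            buf += 2;
            len -= 2;
        }
        if (len)                               /* odd trailing byte, like rlwinm Rt,Rt,8,16,23 */
            acc += (uint32_t)buf[0] << 8;

        acc = (acc & 0xffffffffu) + (acc >> 32);   /* fold carries back in, like addze */
        acc = (acc & 0xffffffffu) + (acc >> 32);
        return (uint32_t)acc;
    }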
+#ifndef NO_DST +#define STRM_F dst SRC,BLK,0 +#define STRM_1 dst SP8,Rt,1 + +#else +#define STRM_F nop +#define STRM_1 nop +#endif + .text +#if __MWERKS__ + .align 16 +#define SP r1 +#else + .align 4 +#endif + +#ifdef LIBMOTOVEC + .global csum_partial_copy_generic_vec +csum_partial_copy_generic: +#else + .global vec_csum_partial_copy_generic +vec_csum_partial_copy_generic: +#endif + + li BK,32 // IU1 + rlwinm Rt,BC,31,1,31 // IU1 BC/2 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + dcbt SRC,BK // LSU prefetch next cacheline + cmpi cr6,0,Rt,0 // IU1 BC/2 == 0? + addic SUM,SUM,0 // IU1 Zero carry bit + + addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,v_csumcpy // b if BC>MIN_VEC (will copy vectors fwd) + andi. BK,BC,1 // IU1 BC[31]==0? + + addi DM2,DST,-2 // IU1 Pre-bias and duplicate destination + add S,SRC,BC // IU1 Last src byte + 1 (temp use of S) + beq cr6,No_HWs // b if BC/2==0 + mtctr Rt // i=BC/2; do ...;i--; while (i>0) +HW_cpy: + lhzu Rt,2(SM2) // LSU + sthu Rt,2(DM2) // LSU + addc SUM,SUM,Rt // IU1 + bdnz HW_cpy +No_HWs: + beq BC_even // b if BC[31]==0 (or DBC[31]==0 when aligned) + lbz Rt,-1(S) // LSU Get last src address byte + + stb Rt,-1(DBC) // LSU Store to last dst address byte + rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left + + addc SUM,SUM,Rt // IU1 +BC_even: + addze SRC,SUM + blr + +v_csumcpy: + lvsr VP2,0,DST // LSU Permute vector for initial byte mask + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + + lvsr VP4,DST,BC // LSU Permute vector for final byte mask + subf. S,S,D // IU1 if D-S<0 essentially shifting left + subf DMS,SRC,DST // IU1 Compute dst-src difference + + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + li BK,64 // IU1 Index of next cache line + vxor V0,V0,V0 // VIU Clear v0 + + dcbt SRC,BK // LSU Prefetch next cache line at src+64 + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + addi DR,DST,16 // IU1 Address of second dst vector + addi DBK,DBC,-1 // IU1 Address of last dst byte + vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right + bge Ld_bytes_rt // b if shifting right (D-S>=0) + + lvx VS0,0,SRC // LSU Get S0 load started + addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful) + +Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is + lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + + vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + vxor VPS1,VPS1,VPS1 // VIU Clear VPS1 + + vxor VCARF,VCARF,VCARF //VIU1 clear VCARF + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + + li BK,96 // IU1 Index of next cache line + cmpi cr5,0,Rt,0xF // IU1 Is DN right justified? + subf Rt,DST,DR // IU1 How many bytes in first destination? 
+ + mtctr QW // IU2 + cmpi cr6,0,QW,4 // IU1 Check QW>4 + mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7 + + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + dcbt SRC,BK // LSU Prefetch next cache line at src+96 + beq cr1,Left_just // b if D0 is left justified + + li BK,0 // IU1 Initialize byte kount index + vsel VPS0,VPS0,V0,VM // VIU1 Select zeroes left | S0 bytes right + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx VPS0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx VPS0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx VPS0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx VPS0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM + li BK,16 // IU1 Re-initialize byte kount index + +QW_fwd_loop: + lvx VS1,SRC,BK // LSU Get S2 (or S1) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4) + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_ld_fwd: // Next 16 bytes is the last; we're done. 
+ add DBC,DST,BC // IU1 Recompute address of last dst byte + 1 + add SBC,SRC,BC // IU1 Recompute address of last src byte + 1 + bge No_ld_fwd // b if shifting right (D-S>=0) + + addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src +No_ld_fwd: + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + addi Rt,SBC,-1 // IU1 Recompute address of last src byte + + lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14 + beq cr5,Rt_just_fwd // b if last destination is right justified + vsel VPS0,VPS0,V0,VMM // VIU1 Select src bytes left | zeroes right + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li D,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + stvx VPS0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: + vaddcuw VCAR1,VPS0,VPS1 //VIU1 add data and store carries + + vadduwm VTEMP,VPS0,VPS1 //VIU1 add data (no carries) + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + vmrglh VL,V0,VSUM // VPU separate low shorts of sum + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + vmrghh VH,V0,VSUM //VPU separate high shorts of sum + rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW + + vsumsws VCARS,VCARF,V0 //VIU2 sum all carries + vadduwm VSUM,VL,VH //VIU1 add low and high data + li BK,-16 // IU1 Index 0x10 less than SP + + vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries + + stvx VFIN,DBK,BK // LSU Store partial checksum from VR + + lwz SRC,-4(DBK) // LSU Load partial checksum to GPR + + addc SRC,SRC,SUM + + addze SRC,SRC + + blr // Return destination address from entry + + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of 
CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + addi SP8,SRC,96 // IU1 Starting address for dcbt + addi BL,BK,16 // IU1 Create an alternate byte kount + 32 + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS1,SRC,BK // LSU Get S4 + addi SP8,SP8,32 // IU1 Next starting address for dcbt + vaddcuw VCAR1,VPS0,VPS1 // VIU1 add data and store carries + + lvx VS2,SRC,BL // LSU Get S5 + vadduwm VTEMP,VPS0,VPS1 // VIU1 add data (no carries) + + dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + DCBK // LSU Kill instead of RWITM + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11 + + stvx VPS1,DST,BK // LSU Store 16 bytes at D11 + vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12 + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx VPS0,DST,BL // LSU Store 16 bytes at D12 + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + addi BK,BL,16 // IU1 Increment byte count + addi BL,BK,16 // IU1 Increment alternate byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + + b Last_ld_fwd // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + lvx VS1,SRC,BK // LSU Get S6 (or S5 if if D-S>=0) + vaddcuw VCAR1,VPS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VPS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + b Last_ld_fwd + +// End of vec_csum_partial_copy_generic in AltiVec + +// Modified from above Register useage +// Don't use vectors for BC <= MIN_VEC_CS. Works only if MIN_VEC >= 32 bytes. +#define MIN_VEC_CS 48 // Chosen experimentally on MPC7455@1GHz/133MHz bus +#undef DST // will not be using here +#undef BC +#define BC r4 // entering: Byte_Count + +#undef SUM +#define SUM r5 // entering: Partial checksum + +#if __MWERKS__ + .align 16 +#else + .align 4 +#endif +#ifdef LIBMOTOVEC + .global csum_partial_vec +csum_partial: +#else + .global vec_csum_partial +vec_csum_partial: +#endif + li BK,32 // IU1 + rlwinm Rt,BC,31,1,31 // IU1 BC/2 + cmpi cr7,0,BC,MIN_VEC_CS // IU1 Check for minimum byte count + + dcbt SRC,BK // LSU prefetch next cacheline + cmpi cr6,0,Rt,0 // IU1 BC/2 == 0? + addic SUM,SUM,0 // IU1 Zero carry bit + + addi SM2,SRC,-2 // IU1 Pre-bias and duplicate src + add DBC,SRC,BC // IU1 Compute address of last src byte + 1 + bgt cr7,v_csum // b if BC>MIN_VEC_CS + andi. BK,BC,1 // IU1 BC[31]==0? 
+ + beq cr6,No_HWs_cs // b if BC/2==0 + mtctr Rt // i=BC/2; do ...;i--; while (i>0) +HW_cs: + lhzu Rt,2(SM2) // LSU + + addc SUM,SUM,Rt // IU1 + bdnz HW_cs +No_HWs_cs: + beq BC_even_cs // b if BC[31]==0 (or DBC[31]==0 when aligned) + lbz Rt,-1(DBC) // LSU Get last src address byte + + rlwinm Rt,Rt,8,16,23 // IU1 Shift odd byte left + + addc SUM,SUM,Rt // IU1 +BC_even_cs: + addze SRC,SUM + blr + +v_csum: + lvsr VP2,0,SRC // LSU Permute vector for initial byte mask + addi DR,SRC,16 // IU1 Address of second src vector + li BK,64 // IU1 Index of next cache line + + lvsr VP4,SRC,BC // LSU Permute vector for final byte mask + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last src byte + + lvx VS0,0,SRC // LSU Get S0 load started + subf QW,DR,DBK // IU1 Bytes of full vectors to test (-16) + vxor V0,V0,V0 // VIU Clear v0 + + dcbt SRC,BK // LSU Prefetch next cache line at src+64 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + mtctr QW // IU2 + vxor VCARF,VCARF,VCARF //VIU1 clear VCARF + vperm VM,V1,V0,VP2 // VPU D0 select vector for dst left; src right + + cmpi cr6,0,QW,4 // IU1 Check QW>4 + vxor VSUM,VSUM,VSUM // VIU1 Clear VSUM + vperm VMM,V0,V1,VP4 // VPU DN select vector for src left; dst right + + li BK,16 // IU1 Initialize byte kount index + vsel VS0,VS0,V0,VM // VIU1 Select zeroes left | S0 bytes right +vp_fwd_loop: + lvx VS1,SRC,BK // LSU Get S1 + vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + addi BK,BK,16 // IU1 Increment byte kount index + + vor VS0,VS1,VS1 // VIU1 Swap vectors for next loop + bdnzf 25,vp_fwd_loop // b if 4 or less quad words to do + + add DNX,SRC,BK // IU1 address of next load (SRC+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW summed by now + bgt cr6,GT_4QW_cs // b if >4 quad words left + vxor VS1,VS1,VS1 // VIU1 Zero before adding below + +// Next 16 bytes is the last; we're done. +Last_ld_cs: + lvx VS2,0,DBK // LSU Get last source (guaranteed SN) + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + rlwinm DBK,DBK,0,28,31 // IU1 (dst + BC -1)[28:31] + + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + cmpi cr7,0,DBK,0xF // IU1 Is last byte right justified? + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + beq cr7, Rt_just // b if right justified. 
+ vsel VS2,VS2,V0,VMM // VIU1 Select src bytes left | zeroes right + +Rt_just: + vaddcuw VCAR1,VS2,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS2,VSUM // VIU1 data + previous sum (no carries) + + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + vmrglh VL,V0,VSUM // VPU separate low shorts of sum + + vmrghh VH,V0,VSUM //VPU separate high shorts of sum + rlwinm DBK,SP,0,0,27 // IU1 Align stack pointer to QW + + vsumsws VCARS,VCARF,V0 //VIU2 sum all carries + vadduwm VSUM,VL,VH //VIU1 add low and high data + li BK,-16 // IU1 Index 0x10 less than SP + + vsumsws VFIN,VSUM,VCARS //VIU2 sum all data including carries + + stvx VFIN,DBK,BK // LSU Store partial checksum from VR + + lwz SRC,-4(DBK) // LSU Load partial checksum to GPR + + addc SRC,SRC,SUM + + addze SRC,SRC + + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_cs: // Do once if nxt ld is from odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + vaddcuw VCAR1,VS0,VSUM // VIU1 data + previous sum ->store carries + + vadduwm VSUM,VS0,VSUM // VIU1 data + previous sum (no carries) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + addi BK,BK,16 // IU1 Increment byte count by 16 + vadduwm VCARF,VCAR1,VCARF // VIU1 Update VCARF + bdnzf 27,GT_4QW_cs // b if next store is to lower (even) half of CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last load)[27]==1)?1:0; (odd?) + addi SP8,SRC,96 // IU1 Starting address for dcbt + vxor VS1,VS1,VS1 // VIU1 Zero before adding below + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. 
+ + bns cr6,B32_cs // b if DST[27] == 0; i.e, final load is even + + bdnz B32_cs // decrement counter for last QW load odd + +B32_cs: // Should be at least 2 loads remaining and next 2 are cache aligned + lvx VS2,SRC,BK // LSU Get S4 + addi BK,BK,16 // IU1 Increment byte count by 16 + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + + lvx VS3,SRC,BK // LSU Get S5 + addi SP8,SP8,32 // IU1 Next starting address for dcbt + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + + dcbt 0,SP8 // LSU Prefetch cache line 64 bytes ahead + addi BK,BK,16 // IU1 Increment byte count + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + bdz Nxt_loc_cs // always decrement and branch to next instr + +Nxt_loc_cs: + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + + vor VS1,VS3,VS3 // VIU1 Move upper vector to lower + bdnz B32_cs // b if there are at least two more QWs to do + + bso cr6,One_even_QW_cs // b if there is one even and one odd QW to store + + b Last_ld_cs // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW_cs: + lvx VS2,SRC,BK // LSU Get S6 (or S5 if if D-S>=0) + addi BK,BK,16 // IU1 Increment byte count + vaddcuw VCAR1,VS0,VS1 // VIU1 add data and store carries + + vadduwm VTEMP,VS0,VS1 // VIU1 add data (no carries) + + vaddcuw VCAR2,VTEMP,VSUM //VIU1 data + previous sum ->store carries + + vadduwm VSUM,VTEMP,VSUM //VIU1 data + previous sum + + vadduwm VCAR3,VCAR1,VCAR2 //VIU1 add carries from previous adds + + vadduwm VCARF,VCAR3,VCARF //VIU1 update VCARF + + vxor VS1,VS1,VS1 // VIU1 Zero before next add + + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + b Last_ld_cs + +// End of vec_csum_partial in AltiVec
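Both entry points in this file return a 32-bit partial sum, as their header comments state ("Returns: unsigned long sum"). Producing the final 16-bit Internet checksum is left to the caller; a minimal csum_fold-style sketch of that last step, with an invented name and not part of the Motorola source, is:

    #include <stdint.h>

    static uint16_t fold_partial_sum(uint32_t sum)
    {
        sum = (sum & 0xffffu) + (sum >> 16);   /* add high halfword into low halfword */
        sum = (sum & 0xffffu) + (sum >> 16);   /* second pass absorbs any carry */
        return (uint16_t)~sum;                 /* ones' complement is the checksum */
    }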
\ No newline at end of file diff --git a/liboil/motovec/vec_memcmp.S b/liboil/motovec/vec_memcmp.S new file mode 100644 index 0000000..d0117fa --- /dev/null +++ b/liboil/motovec/vec_memcmp.S @@ -0,0 +1,340 @@ +//#define __MWERKS__ +//------------------------------------------------------------------ +// file: vec_memcmp.S +// AltiVec enabled version of memcmp +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern int vec_memcmp(const void *ptr1, const void *ptr2, size_t len); +// Returns: +// value < 0 if ptr1[0:len] < ptr2[0:len] +// value = 0 if ptr1[0:len] == ptr2[0:len] +// value > 0 if ptr1[0:len] > ptr2[0:len] +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 05/27/03 + + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 + + // Macros for bits in CR6 +#define _all 6*4+0 +#define _none 6*4+2 + // Macros for condition to be true/false and unlikely/likely to be taken +#define _F_u 4 +#define _T_u 12 +#define _T_l 13 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define PT1 r3 // entering: ptr1; exiting: return value + +#define SRC r4 // entering: ptr2; then ptr2+16 if ptr1[28:31]<ptr2[28:31] + +#define BC r5 // entering: Byte_Count +#define BCM1 r5 // then Byte_Count -1 + +#define DMS r6 // ptr1 - ptr2 initially +#define S2 r6 // ptr2 bytes initially + +// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )" +// if you don't put the comment right after the r7. 
CJC 030314 +#define SM1 r7// ptr2 -1 for byte-by-byte forwards initially +#define S r7 // ptr2[28:31] +#define BK r7 // byte index + +#define DM1 r8// ptr1 -1 for byte-by-byte forwards initially +#define D r8 // ptr1[28:31] + +#define PBC r9 // ptr1 + byte count initially + +#define S1 r10 // ptr1 bytes initially +#define PBK r10 // (ptr1+byte_count-1) + +#define DR r11 // (ptr1+16)[0:27] +#define QW r11 // number of quad words (vectors) + +#define RSV r12 // storage for VRSAVE register if used + +#define VS1 v0 // source 1 as a vector of 16 bytes + +#define VS2 v1 // source 2 as a vector of 16 bytes + +#define VS2b v2 // second source 2 vector for permuting +#define VS12B v2 // octet shift count of 12 +#define VMB3 v2 // mismatch shifted right 3 bytes + +#define VP1 v3 // source 1 permute register +#define VSH16 v3 // octet shift count of 16 bits/2 octets +#define VMW3 v3 // mismatch shifted right 3 words + +#define VS1B v4 // octet shift count of 1 + +#define V0 v5 // all zeros + +#define VP2 v6 // source 2 permute register +#define VS4B v6 // octet shift count of 4 +#define VMB1 v6 // mismatch shifted right one byte + +#define VSH1 v7 // shift count of 1 bit +#define VS8B v7 // octet shift count of 8 octets +#define VMB2 v7 // mismatch shifted right 2 bytes + +#define V1 v8 // all ones +#define VCE v8 // equality compare destination register + +#define VS2a v9 // first source 2 vector for permuting +#define VMW1 v9 // mismatch shifted right one word + +#define VP3 v10 // ptr1-ptr2 permute register +#define VMW2 v10 // mismatch shifted right 2 words + +#define VM v11 // mask for right end of 1st S1 vector + +#define VP4 v12 // last mask permute vector +#define VLM v12 // last mask register + +#define VMM v13 // vector of zeroes with ones at mismatch(es) and DN + +// Condition register use +// cr0[0:2] = (ptr1-ptr2==0)? return +// then cr0[0:2] = (ptr1[28:31]-ptr2[28:31]<0)? "Need more S2?"; +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (QW == 0)? 1 : 0; (Any full vectors to move?) +// cr5[2] = ((PBK = PT1+BC)[28:31] = 0)? 1 : 0; (S1N right justified) +// cr6[0] = (S1 == S2)?1:0; (By vector) +// then cr6[2] = (S2 > S1)? 1 : 0; (At mismatched byte) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memcmp +memcmp: +#else + .global vec_memcmp +vec_memcmp: +#endif + subf. DMS,SRC,PT1 // IU1 Compute ptr1-ptr2 difference + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + add PBC,PT1,BC // IU1 Address of last byte + 1 + addi SM1,SRC,-1 // IU1 Pre-bias and duplicate ptr2 + addi DR,PT1,16 // IU1 Duplicate s1 pointer + beq Dumb_exit // return if PT1 = SRC + + addi PBK,PBC,-1 // IU1 Address of last ptr1 byte + addi DM1,PT1,-1 // IU1 Pre-bias and duplicate ptr1 + rlwinm DR,DR,0,0,27 // IU1 (PT1+16)[0:27] + beq cr1,Dumb_exit // return if BC = 0 + + subf QW,DR,PBK // IU1 Bytes of full vectors to move (-16) + rlwinm PBC,PBC,0,28,31 + bgt cr7,v_memcmp // do as vectors if BC>MIN_VEC + +// Compare byte-by-byte if BC<=MIN_VEC + mtctr BC // i=BC; do if...;i--; while (i>0) +Cmp_nxt_byte: + lbzu S2,1(SM1) // LSU + lbzu S1,1(DM1) // LSU + subf. 
PT1,S2,S1 // IU1 if (*s1++ == *s2++) + bdnzt 2,Cmp_nxt_byte // b while equal and bytes left + blr +Dumb_exit: + xor PT1,PT1,PT1 // IU1 return zero + blr + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +v_memcmp: +// Byte count < MIN_VEC bytes will have been compared by scalar code above, +// so this will not deal with small block compares < MIN_VEC. + +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + rlwinm S,SRC,0,28,31 // IU1 Save ptr2 address bits s[28:31] + +#ifdef VRSAVE + oris Rt,RSV,0xfff8 // IU1 Or in registers used by this routine +#endif + rlwinm D,PT1,0,28,31 // IU1 D = ptr1[28:31] + cmpi cr1,0,QW,0 // IU1 Any full vectors to move? + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvxl VS1,0,PT1 // LSU Get source1 load started (load as LRU) + subf. S,S,D // IU1 Is s2 longer than s1? (28:31 greater?) + li BK,16 // IU1 Byte index pointer + + lvxl VS2,0,SRC // LSU Get source2 load started (load as LRU) + vor VS2b,VS2,VS2 // VIU1 Preset second s2 vector if not loaded + addi BCM1,BC,-1 // IU1 Index to last s2 byte +// Decide if second vector of S2 is needed to compare to first vector of S1 + bge Around // b if initial S1 is shorter than or equal S2 + + lvxl VS2b,SRC,BK // LSU Otherwise, we need more of s2 + addi SRC,SRC,16 // IU1 Increment s2 pointer + addi BCM1,BCM1,-16 // IU1 Correction for last byte +Around: + + lvsl VP1,0,PT1 // LSU Set permute vector for s1 shift left + vspltisb VS1B,8 // VPU Create a shift count for 1 octet/8 bits + vxor V0,V0,V0 // VIU1 Create a vector of all zeroes + + lvsl VP2,0,SRC // LSU Set permute vector for s2 shift left + vspltisb VSH1,1 // VPU Create a shift count of 1 bit + vnor V1,V0,V0 // VIU1 Create a vector of all ones + + lvsr VP3,0,DMS // LSU Set permute vector for S2-S1 difference + cmpi cr5,0,PBC,0 // IU1 Will last byte of S2 be rt justified? + vperm VM,V0,V1,VP1 // VPU Mask as long as our subset of 1. + + + lvsr VP4,0,PBC // VIU1 Permute vector for bytes rt of end +// Dealing with first S1 Vector - Permute S1 and S2 (possibly + S2b) to left edge + vperm VS1,VS1,V1,VP1 // VPU Left align s1 with ones as pad + + vperm VS2,VS2,VS2b,VP2 // VPU Left align s2 and s2+ + + vslb VSH16,VS1B,VSH1 // VPU Shift count for 16 bits/2 octets + vor VS2,VS2,VM // VIU1 s2 now has identical ones padding to s1 + + vslb VS4B,VSH16,VSH1 // VPU Create a shift count for 4 octets + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 = s2? + + vslb VS8B,VS4B,VSH1 // VPU Create a shift count for 8 octets + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2 + + ble cr1,Last_ld // b if there are no QW to do + mtctr QW // IU2 i=QW; do ...; while (i-- > 0) + +// Dealing with middle vectors +memcmp_NA_next_v: + lvxl VS2a,SRC,BK // LSU Get next 16 bytes of s2 + + lvxl VS1,PT1,BK // LSU Get next 16 bytes of s1 + addi BK,BK,16 // IU1 Increment byte index + + vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2 + vor VS2b,VS2a,VS2a // VIU1 Move upper vector to lower + + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ? 
+ + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bdnzt 24,memcmp_NA_next_v // b if more whole QWs to do and s1==s2 + + bc _F_u,_all,memcmp_final_v_NE // b if s1 != s2 + +// Dealing with last vector +Last_ld: + lvxl VS2a,SRC,BCM1 // LSU Last load of s2 (perhaps redundant) + vperm VLM,V0,V1,VP4 // VPU Ones mask for bytes rt of end + + lvxl VS1,PT1,BK // LSU Last load of s1 + + vperm VS2,VS2b,VS2a,VP3 // VPU Combine into left justified s2 + beq cr5,Rt_just // b if final S1 byte is rt justified + + vor VS2,VS2,VLM // VIU1 Set uninvolved bytes at end + + vor VS1,VS1,VLM // VIU1 Set bytes at end of s1 +Rt_just: + vcmpequb. VCE,VS1,VS2 // VIU1 Does s1 == s2 ? + + vnor VMM,VCE,VCE // VIU1 Not equals become ones + bc _F_u,_all,memcmp_final_v_NE // b if s1!=s2 + + xor PT1,PT1,PT1 // IU1 Will return zero if strings are equal +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return 0 if s1 == s2 + +memcmp_final_v_NE: + // s1 != s2, We're going to create a mask to mask off everything to + // the right of the first mismatching byte so we know we are just + // looking at the string up to the mismatch. + + vsum4ubs VS12B,VS1B,VS8B // VIU2 Create a shift count for 12 octets + + vsro VMW1,VMM,VS4B // VPU Shift the compare result one word right + vsrw VMB1,VMM,VS1B // VIU1 Shift compare result 8 bits right + + vsro VMW2,VMM,VS8B // VPU Shift the compare result 2 words right + vsrw VMB2,VMM,VSH16 // VIU1 Shift compare result 16 bits right + + vsro VMW3,VMM,VS12B // VPU Shift the compare result 3 words right + vor VM,VMW1,VMW2 // VIU1 Mask of words one and 2 to the right + + vsro VMB3,VMB2,VS1B // VPU Shift compare result 3 bytes right + vor VM,VM,VMW3 // VIU1 Mask of MM 1,2,&3 words to the right + + vcmpgtuw VM,VM,V0 // VIU1 Mask of all ones in words to the right + + vor VM,VM,VMB1 // VIU1 Or in first byte to right + + vor VM,VM,VMB2 // VIU1 Or in second byte to right + + vor VM,VM,VMB3 // VIU1 Or in third byte to right + + vor VS2,VS2,VM // VIU1 Set bytes right of mismatch + + vor VS1,VS1,VM // VIU1 Set bytes right of mismatch + li r3,-1 // IU1 Return -1 if s1 < s2 + + vcmpgtub. VCE,VS2,VS1 // VIU1 Compute s2 > s1 for all bytes +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + bclr _F_u,_none // s1 < s2 in first byte with a mismatch + +S2_lt_S1: li r3,1 // IU1 Return +1 if s1 > s2 + blr // s1 > s2 in first byte with a mismatch + +// End of memcmp in AltiVec + diff --git a/liboil/motovec/vec_memcpy.S b/liboil/motovec/vec_memcpy.S new file mode 100644 index 0000000..f280393 --- /dev/null +++ b/liboil/motovec/vec_memcpy.S @@ -0,0 +1,876 @@ +//------------------------------------------------------------------ +// file: vec_memcpy.S +// AltiVec enabled version of memcpy and bcopy +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. 
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * memcpy(void *dst, const void *src, size_t len); +// Returns: +// void *dst +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * memmove( void *dst, const void *src, size_t len ); +// Copies len characters from src to dst and returns the value of +// dst. Works correctly for overlapping memory regions. +// - Harbison&Steele 4th ed (corrected as to return) +// Returns: +// void *dst +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * bcopy(const void *src, void *dst, size_t len); +// Returns: +// void *dst +//------------------------------------------------------------------ + +// memcpy and memmove are combined into one entry point here because of +// the similarity of operation and need to create fool-proof code. +// The following conditions determine what is "fool proof": +// +// if: then single entry: +// (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memcpy +// (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC must b to v_memcpy +// (DST-SRC)<0 && BC<MIN_VEC copy fwd byte-by-byte +// (DST-SRC)==0 || BC==0 will just return +// (DST-SRC)>0 && BC<MIN_VEC copy bkwd byte-by-byte +// (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC must b to v_memmove +// (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC will b to v_memmove + +// If you call memmove (or vec_memmove) and |DST-SRC|>=BC, +// this code will branch to v_memcpy anyway for maximum performance. + +// Revision History: +// Rev 0.0 Original Chuck Corley 02/03/03 +// Can still add dst, 128B loop, and aligned option +// Rev 0.01 Fixed JY's seg-fault violation CJC 02/17/03 +// Rev 0.1 Added 128B loop and dst; cndtnlzd dcbz CJC 02/18/03 +// (Creating separate path for QW aligned didn't help much) +// Rev 0.11 Small code schdling; chngd dst for memmove CJC 02/23/03 +// Rev 0.20 Eliminated alternate entry and cleanup CJC 02/27/03 +// Rev 0.21 Inproved loop branch targets for v_mempcy CJC 03/01/03 +// Rev 0.22 Experimented with dst (sent to H.) CJC 03/02/03 +// Rev 0.23 Substituted dcba for dcbz (sent to JY) CJC 03/08/03 +// Rev 0.24 Use two dst streams CJC 03/12/03 +// Rev 0.25 Fix for all compilers, cleanup, and release with +// libmotovec.a rev 0.10 CJC 03/14/03 +// Rev 0.30 Fix for pre-empted destination (SNDF-DS) CJC 04/02/03 +// +// Between Rev 0.25 and 0.30 the code was revised to store elements of +// source at destination when first and/or last vector are less than 16 +// bytes. 
Areviewer at SNDF observed that loading the destination vector +// for merging exposed the "uninvolved" destination bytes to incoherency +// if an interrupt pre-empted this routine and modified the "uninvolved" +// destination vector(s) while held in register for merging. It seems +// like a low possibility but this revision is no longer subject to that +// possibility. (It is also slightly faster than Rev 0.25.) +// This is beta quality code; users are encouraged to make it faster. +// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 +// Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap. +#define MIN_OVL 128 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dst pointer; exiting: same dst pointer + +#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove + +#define BC r5 // entering: Byte_Count + +#define PCS r6 // save for partial checksum entering + +#define DMS r7 // dst - src initially +#define BK r7 // BC - 1 +/- (n*16) + +// Codewarrior will put an unwelcome space as "lbzu r0,1(r7 )" +// if you don't put the comment right after the r7. CJC 030314 +#define SM1 r8// src -1 for byte-by-byte forwards initially +#define S r8 // src[28:31] +#define SMD r8 // src[0:27]-dst[0:27] +#define STR r8 // data stream touch block & stride info for Big_loop + +#define DM1 r9// dst -1 for byte-by-byte forwards initially +#define D r9 // dst[28:31] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define SBC r10// src + byte count initially then src[28:31] +#define BLK r10 // temporary data stream touch block & stride info +#define DR r10 // (dst+16)[0:27] +#define QW r10 // number of quad words (vectors) + +#define DBC r11// dst + byte count initially +#define BLL r11 // temporary data stream touch block & stride info +#define SBK r11 // (src+byte_count-1) +#define SBR r11 // (src+byte_count-1)[0:27] +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31] +#define BIG r11 // QW/8 or 128 byte loop count +#define SP8 r11 // SRC + n*128 (8 QWs) for data streaming after first call + +#define RSV r12 // storage for VRSAVE register if used + +#define VS0 v0 // src vector for permuting + +#define VS1 v1 // src vector for permuting + +#define VP3 v2 // d - s permute register + +#define VPS0 v3 // permuted source vector to store + +#define VPS1 v4 // 2nd permuted source vector to store + +#define VPS2 v5 // additional permuted src in Big loop + +#define VS2 v6 // src vector for permuting +#define VPS3 v6 // additional permuted src in Big loop + +#define VS3 v7 // additional src load in Big loop +#define VPS4 v7 // additional permuted src in Big loop + +#define VS4 v8 // additional src load in Big loop +#define VPS5 v8 // additional permuted src in Big loop + +#define VS5 v9 // additional src load in Big loop +#define VPS6 v9 // additional permuted src in Big loop + +#define VS6 v10 // additional src load in Big loop +#define VPS7 v10 // additional permuted src in Big loop + +#define VS7 v11 // additional src load in Big loop + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcbz is a performance boost. 
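Because several toolchains will not assemble dcba directly, the macros that follow splice the instruction in as a raw .long. Those opcode words can be cross-checked by rebuilding them from the X-form fields (primary opcode 31, extended opcode 758, rA/rB = the two address registers). The helper below is a self-contained editorial sketch with an invented name, not part of the original source:

    #include <stdio.h>
    #include <stdint.h>

    /* Assemble a PowerPC X-form instruction word from its fields. */
    static uint32_t ppc_xform(uint32_t opcd, uint32_t rt, uint32_t ra,
                              uint32_t rb, uint32_t xo)
    {
        return (opcd << 26) | (rt << 21) | (ra << 16) | (rb << 11) | (xo << 1);
    }

    int main(void)
    {
        /* dcba DST,BK = dcba r3,r7 -> prints 0x7c033dec, the DCBK word below */
        printf("0x%08x\n", ppc_xform(31, 0, 3, 7, 758));
        /* dcba DST,BL = dcba r3,r9 -> prints 0x7c034dec, the DCBL word below */
        printf("0x%08x\n", ppc_xform(31, 0, 3, 9, 758));
        return 0;
    }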
+#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c033dec +// dcba r3,r7 or dcba DST,BK +#define DCBL .long 0x7c034dec +// dcba r3,r9 or dcba DST,BL +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c033dec +.endm +.macro DCBL +.long 0x7c034dec +.endm +#else +#define DCBK dcba DST,BK +#define DCBL dcba DST,BL +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#define DCBL nop +#endif // NO_DCBA + +// Conditionalize the use of dst (data stream touch). It will help +// if the data is not in cache and hurt if it is (though not as badly +// as dcbz). Generally, except for small benchmarks repeated many times, +// we assume data is not in cache (data streaming) and using dst is a +// performance boost. +#ifndef NO_DST +#define STRM_B dst SBC,BLL,0 +#define STRM_F dst SRC,BLK,0 +#define STRM_1 dst SP8,STR,1 + +#else +#define STRM_B nop +#define STRM_F nop +#define STRM_1 nop +#endif + +// Condition register use +// cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;); +// then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right"; +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified) +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified) +// cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0; (Overlap too small for Big loop?) +// cr6[1,2] = (DST-SRC>=BC)?1:0; (Okay for v_memmove to copy forward?) +// then cr6[2] = (QW == 0)? 1 : 0; (Any full vectors to move?) +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?) +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment) +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?) +// then cr7[1] = (QW > 14)? 1 : 0; (>14 vectors to move?) +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?) + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memmove +memmove: + nop // IU1 Compilers forget first label + .global memcpy +memcpy: +#else + .global vec_memmove +vec_memmove: + nop // IU1 Only way I know to preserve both labels + .global vec_memcpy +vec_memcpy: +#endif + subf. 
DMS,SRC,DST // IU1 Compute dst-src difference + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count moves + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + addi SM1,SRC,-1 // IU1 Pre-bias and duplicate src for fwd + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + add SBC,SRC,BC // IU1 Pre-bias and duplicate src for bkwd + beqlr // return if DST = SRC + + add DBC,DST,BC // IU1 Pre-bias and duplicate destination + subf Rt,DST,SRC // IU1 Form |DST-SRC| if DST-SRC<0 + beqlr cr1 // return if BC = 0 + + bgt Cpy_bkwd // b if DST-SRC>0 (have to copy backward) + cmpi cr5,0,Rt,MIN_OVL // IU1 (|DST-SRC|>128)?1:0; for v_memcpy + bgt cr7,v_memcpy // b if BC>MIN_VEC (okay to copy vectors fwd) + +// Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC + mtctr BC // i=BC; do ...;i--; while (i>0) +Byte_cpy_fwd: + lbzu Rt,1(SM1) // LSU * ++(DST-1) = * ++(SRC-1) + stbu Rt,1(DM1) // LSU + bdnz Byte_cpy_fwd + + blr + nop // IU1 Improve next label as branch target +Cpy_bkwd: + cmpi cr5,0,DMS,MIN_OVL // IU1 ((DST-SRC)>128)?1:0; for v_memcpy + cmp cr6,0,DMS,BC // IU1 cr6[1,2]=(DST-SRC>=BC)?1:0; + bgt cr7,v_memmove // b if BC>MIN_VEC (copy vectors bkwd) +// Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC + mtctr BC // i=BC; do ...;i--; while (i>0) +Byte_cpy_bwd: + lbzu Rt,-1(SBC) // LSU * --(DST+BC) = * --(SRC+BC) + stbu Rt,-1(DBC) // LSU Store it + bdnz Byte_cpy_bwd + blr + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif + +v_memmove: +// Byte count < MIN_VEC bytes will have been copied by scalar code above, +// so this will not deal with small block moves < MIN_VEC. + +// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + bge cr6,MC_entry // b to v_memcpy if DST-SRC>=BC (fwd copy OK) + +#ifdef VRSAVE + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine +#endif + lis BLL,0x010c // IU1 Stream 12 blocks of 16 bytes + subf. SMD,D,S // IU1 if S-D<0 essentially shifting right + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + ori BLL,BLL,0xffe0 // IU1 Stream stride -32B + + STRM_B // LSU Start data stream at SRC+BC + addi SBK,SBC,-1 // IU1 Address of last src byte + bgt Rt_shft // Bytes from upper vector = (s-d>0)?s-d:16+s-d; + addi SMD,SMD,16 // IU1 Save 16-(d-s) +Rt_shft: + + rlwinm SBR,SBK,0,0,27 // IU1 (SRC+BC-1)[0:27] + addi BK,BC,-1 // IU1 Initialize byte index + + subf Rt,SBR,SBC // IU1 How many bytes in first source? + add DBK,DST,BK // IU1 Address of last dst byte + addi DR,DST,16 // IU1 Address of second dst vector + + subf. SMD,Rt,SMD // IU1 if bytes in 1st src>Bytes in 1st permute + rlwinm Rt,DBK,0,28,31 // IU1 (DST+BC-1)[28:31] + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + +// If there are more useful bytes in the upper vector of a permute pair than we +// will get in the first permute, the first loaded vector needs to be in the +// lower half of the permute pair. The upper half is a don't care then. + blt Get_bytes_rt // b if shifting left (D-S>=0) + + lvx VS1,SRC,BK // LSU Get SN load started +// Comments numbering source and destination assume single path through the +// code executing each instruction once. For vec_memmove, an example would +// be the call memmove(BASE+0x0F, BASE+0x2F, 82). 
N = 6 in that case. + addi SRC,SRC,-16 // IU1 Decrement src base (to keep BK useful) + +Get_bytes_rt: // Come here to get VS0 & Don't care what VS1 is + lvx VS0,SRC,BK // LSU Get SN-1 (SN if D-S<0) in lower vector + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr7,0,Rt,0xF // IU1 Is Dn right justified? + + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + add Rt,DST,BC // IU1 Refresh the value of DST+BC + + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-1 and SN to DN + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + beq cr7,Rt_just // b if DN is right justified + + mtcrf 0x01,Rt // IU2 Put final vector byte count in cr7 + rlwinm DBK,DBK,0,0,27 // IU1 Address of first byte of final vector + li D,0 // IU1 Initialize an index pointer + bnl cr7,Only_1W_bkwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_bkwd: + bng cr7,Only_2W_bkwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_bkwd: + bne cr7,Only_B_bkwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_bkwd: + bns cr7,All_done_bkwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_bkwd + +Rt_just: + stvx VPS0,DST,BK // LSU Store 16 bytes at DN +All_done_bkwd: + addi BK,BK,-16 // IU1 Decrement destination byte count + + ble cr6,Last_load // b if no Quad words to do + mtctr QW // IU2 for (i=0;i<=QW;i++)-execution serializng + cmpi cr6,0,QW,4 // IU1 Check QW>4 +QW_loop: + lvx VS0,SRC,BK // LSU Get SN-2 (or SN-1 if ADJ==0) + + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-2 and SN-1 to DN-1 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-1 + addi BK,BK,-16 // IU1 Decrement byte kount + bdnzf 25,QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+BC-1-16) + bgt cr6,GT_4QW // b if >4 quad words left + +Last_load: // if D-S>=0, next load will be from same address as last + blt No_ld_bkwd // b if shifting right (S-D>=0) + addi SRC,SRC,16 // IU1 recorrect source if it was decremented +No_ld_bkwd: + lvx VS0,0,SRC // LSU Get last source SN-6 (guaranteed S0) +// Current 16 bytes is the last; we're done. + dss 0 // Data stream stop + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-6 and SN-5 to DN-6 + subfic D,DST,16 // IU1 How many bytes in first destination? 
+ beq cr1,Lt_just // b if last destination is left justified + + mtcrf 0x01,D // IU2 Put byte count remaining in cr7 + li D,0 // IU1 Initialize index pointer + bns cr7,No_B_bkwd // b if only even number of bytes to store + + stvebx VPS0,DST,D // LSU store first byte at DST+0 + addi D,D,1 // IU1 increment index +No_B_bkwd: + bne cr7,No_H_bkwd // b if only words to store + stvehx VPS0,DST,D // LSU store halfword at DST+0/1 + addi D,D,2 // IU1 increment index + +No_H_bkwd: + bng cr7,No_W1_bkwd // b if exactly zero or two words to store + stvewx VPS0,DST,D // LSU store word 1 of one or three + addi D,D,4 // IU1 increment index + +No_W1_bkwd: + bnl cr7,No_W2_bkwd // b if there was only one word to store + stvewx VPS0,DST,D // LSU store word 1 of two or 2 of three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DST,D // LSU store word 2 of two or 3 of three + b No_W2_bkwd + +Lt_just: + stvx VPS0,0,DST // LSU Store 16 bytes at final dst addr D0 +No_W2_bkwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +GT_4QW: // Do once if next store is to even half of cache line, else twice + + lvx VS0,SRC,BK // LSU Get SN-3 (or SN-2) + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0; + + vperm VPS0,VS0,VS1,VP3 // VPU Align SN-3 and SN-2 to Dn-2 + vor VS1,VS0,VS0 // VIU1 Move lower vector to upper + addi DNX,DNX,-16 // IU1 Prepare to update cr6 next loop + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-2 + vor VS3,VS0,VS0 // VIU Make a copy of lower vector + addi BK,BK,-16 // IU1 Decrement byte count by 16 + bdnzt 27,GT_4QW // b if next store is to upper (odd) half of CL +// At this point next store will be to even address. + + lis STR,0x102 // IU1 Stream 2 blocks of 16 bytes + mtcrf 0x02,DST // IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?) + addi BL,BK,-16 // IU1 Create an alternate byte count - 16 + + ori STR,STR,0xffe0 // IU1 Stream stride -32B + addi SP8,SRC,-64 // IU1 Starting address for data stream touch + bso cr6,B32_bkwd // b if DST[27] == 1; i.e, final store is odd + + bdnz B32_bkwd // decrement counter for last odd QW store +B32_bkwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS2,SRC,BK // LSU Get SN-4 (or SN-3) + addi SP8,SP8,-32 // IU1 Next starting address for data stream touch + + lvx VS1,SRC,BL // LSU Get SN-5 (or SN-4) + vperm VPS0,VS2,VS3,VP3 // VPU Align SN-4 and SN-3 to DN-3 + + STRM_1 // LSU Stream 64 byte blocks ahead of loads + + DCBL // LSU allocate next cache line + + vperm VPS1,VS1,VS2,VP3 // VPU Align SN-5 and SN-4 to DN-4 + vor VS3,VS1,VS1 // VIU1 Move SN-5 to SN-3 + + stvx VPS0,DST,BK // LSU Store 16 bytes at DN-3 + addi BK,BL,-16 // IU1 Decrement byte count + bdz Nxt_loc_bkwd // always decrement and branch to next instr + +Nxt_loc_bkwd: + stvx VPS1,DST,BL // LSU Store 16 bytes at DN-4 + addi BL,BK,-16 // IU1 Decrement alternate byte count + bdnz B32_bkwd // b if there are at least two more QWs to do + + bns cr6,One_odd_QW // b if there was one more odd QW to store + b Last_load + +// Come here with two more loads and two stores to do +One_odd_QW: + lvx VS1,SRC,BK // LSU Get SN-6 (or SN-5) + + vperm VPS1,VS1,VS3,VP3 // VPU Align SN-6 and SN-5 to DN-5 + + stvx VPS1,DST,BK // LSU Store 16 bytes at DN-5 + + b Last_load + +// End of memmove in AltiVec + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +v_memcpy: +// Byte count < MIN_VEC bytes will have been copied by scalar code above, +// so this will not deal with small block moves < MIN_VEC. 
+ +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm S,SRC,0,28,31 // IU1 Save src address bits s[28:31] + rlwinm D,DST,0,28,31 // IU1 D = dst[28:31] + +MC_entry: // enter here from memmove if DST-SRC>=BC; this should be faster +#ifdef VRSAVE + oris Rt,RSV,0xfff0 // IU1 Or in registers used by this routine +#endif + lis BLK,0x010c // IU1 Stream 12 blocks of 16 bytes + + subf. S,S,D // IU1 if D-S<0 essentially shifting left + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + lvsr VP3,0,DMS // LSU Permute vector for dst - src shft right + ori BLK,BLK,32 // IU1 Stream stride 32B + + STRM_F // LSU Start data stream 0 at SRC + addi DR,DST,16 // IU1 Address of second dst vector + addi DBK,DBC,-1 // IU1 Address of last dst byte + +// If D-S<0 we are "kinda" shifting left with the right shift permute vector +// loaded to VP3 and we need both S0 and S1 to permute. If D-S>=0 then the +// first loaded vector needs to be in the upper half of the permute pair and +// the lower half is a don't care then. + bge Ld_bytes_rt // b if shifting right (D-S>=0) + + lvx VS0,0,SRC // LSU Get S0 load started +// Comments numbering source and destination assume single path through the +// code executing each instruction once. For vec_memcpy, an example would +// be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case. + addi SRC,SRC,16 // IU1 Increment src base (to keep BK useful) + +Ld_bytes_rt: // Come here to get VS1 & Don't care what VS0 is + lvx VS1,0,SRC // LSU Get S1 (or S0 if D-S>=0) in upper vector + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + cmpi cr1,0,D,0 // IU1 Is D0 left justified? + + subf Rt,DST,DR // IU1 How many bytes in first destination? + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + li BK,0 // IU1 Initialize byte kount index + + mtcrf 0x01,Rt // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + beq cr1,Left_just // b if D0 is left justified + + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx VPS0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx VPS0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx VPS0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx VPS0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx VPS0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx VPS0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? 
+ cmpi cr7,0,QW,14 // IU1 Check QW>14 + ble cr6,Last_ld_fwd // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 +QW_fwd_loop: + lvx VS1,SRC,BK // LSU Get S2 (or S1) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D1(+n*16 where n<4) + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_fwd_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_ld_fwd: // Next 16 bytes is the last; we're done. + add DBC,DST,BC // IU1 Recompute address of last dst byte + 1 + add SBC,SRC,BC // IU1 Recompute address of last src byte + 1 + bge No_ld_fwd // b if shifting right (D-S>=0) + + addi SBC,SBC,-16 // IU1 if D-S>=0 we didn't add 16 to src +No_ld_fwd: + mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + addi DBK,DBC,-1 // IU1 Recompute address of last dst byte + addi Rt,SBC,-1 // IU1 Recompute address of last src byte + +// If D-S<0 we have already loaded all the source vectors. +// If D-S>=0 then the first loaded vector went to the upper half of the permute +// pair and we need one more vector. (This may be a duplicate.) + + lvx VS1,0,Rt // LSU Get last source S14 (guaranteed SN) + +#ifndef NO_DST + dss 0 // Data stream 0 stop + + dss 1 // Data stream 1 stop +#endif + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D14 + beq cr1,Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li D,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx VPS0,DBK,D // LSU store word 1 of two or three + addi D,D,4 // IU1 increment index + + stvewx VPS0,DBK,D // LSU store word 2 of two or three + addi D,D,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx VPS0,DBK,D // LSU store word 3 of three if necessary + addi D,D,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx VPS0,DBK,D // LSU store one halfword if necessary + addi D,D,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx VPS0,DBK,D // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + + stvx VPS0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + lvx VS1,SRC,BK // LSU Get S3 (or S2) + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + + addi DNX,DNX,16 // IU1 Update cr6 for next loop + addi Rt,QW,-2 // IU1 Insure at least 2 QW left after big loop + + vperm VPS0,VS0,VS1,VP3 // VPU Align S2 and S3 to D2 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL +// At this point next store will be to even address. + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) 
+ lis STR,0x104 // IU1 Stream 4 blocks of 16 bytes + addi BL,BK,16 // IU1 Create an alternate byte kount + 32 + + ori STR,STR,32 // IU1 Stream stride 32B +#ifndef NO_BIG_LOOP + rlwinm BIG,Rt,29,3,31 // IU1 QW/8 big loops to do + + rlwinm Rt,Rt,0,0,28 // IU1 How many QWs will be done in big loop + bgt cr7,Big_loop // b if QW > 14 +#endif +No_big_loop: +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + + addi SP8,SRC,256 // IU1 Starting address for data stream touch + xoris STR,STR,0x6 // IU1 Reset stream to 2 blocks of 16 bytes + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + lvx VS1,SRC,BK // LSU Get S12 + addi SP8,SP8,32 // IU1 Next starting address for data stream touch + + lvx VS2,SRC,BL // LSU Get S13 + vperm VPS1,VS0,VS1,VP3 // VPU Align S11 and S12 to D11 + + STRM_1 // LSU Stream 64 byte blocks ahead of loads + + DCBK // LSU then Kill instead of RWITM + + vperm VPS0,VS1,VS2,VP3 // VPU Align S12 and S13 to D12 + vor VS0,VS2,VS2 // VIU1 Move S13 to S11 + + stvx VPS1,DST,BK // LSU Store 16 bytes at D11 + addi BK,BL,16 // IU1 Increment byte count + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx VPS0,DST,BL // LSU Store 16 bytes at D12 + addi BL,BK,16 // IU1 Increment alternate byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + b Last_ld_fwd // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + lvx VS1,SRC,BK // LSU Get S14 (or S13 if if D-S>=0) + + vperm VPS0,VS0,VS1,VP3 // VPU Align S13 and S14 to D13 + vor VS0,VS1,VS1 // VIU1 Move upper vector to lower + + stvx VPS0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b Last_ld_fwd + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +Big_loop: + subf QW,Rt,QW // IU1 Should be 2-7 QWs left after big loop + blt cr5,No_big_loop // b back if |DST-SRC|<128; Big_loop won't work. 
+ mtctr BIG // IU2 loop for as many 128B loops as possible + addi SP8,SRC,256 // IU1 Starting address for data stream touch + +Loop_of_128B: // Come here with QW>=10 and next store even; VS0 last load + lvx VS1,SRC,BK // LSU Get S4 (or S3 if D-S>=0) + addi BL,BK,32 // IU1 Increment Byte_Kount+16 by 32 + addi SP8,SP8,128 // IU1 increment address for data stream touch + + lvx VS3,SRC,BL // LSU Get S6 (or S5) + addi BL,BL,32 // IU1 Increment Byte_Kount+48 by 32 + + lvx VS5,SRC,BL // LSU Get S8 (or S7) + addi BL,BL,32 // IU1 Increment Byte_Kount+80 by 32 + + lvx VS7,SRC,BL // LSU Get S10 (or S9) + addi BL,BK,16 // IU1 Increment Byte_Kount+16 by 16 + + lvx VS2,SRC,BL // LSU Get S5 (or S4) + addi BL,BL,32 // IU1 Increment Byte_Kount+32 by 32 + + lvx VS4,SRC,BL // LSU Get S7 (or S6) + addi BL,BL,32 // IU1 Increment Byte_Kount+64 by 32 + + lvx VS6,SRC,BL // LSU Get S9 (or S8) + addi BL,BL,32 // IU1 Increment Byte_Kount+96 by 32 + vperm VPS0,VS0,VS1,VP3 // VPU + + lvx VS0,SRC,BL // LSU Get S11 (or S10) + vperm VPS1,VS1,VS2,VP3 // VPU + + STRM_1 // LSU Stream 4 32B blocks, stride 32B + + DCBK // LSU then Kill instead of RWITM + + stvx VPS0,DST,BK // LSU Store D3 + addi BK,BK,16 // IU1 Increment Byte_Kount+16 by 16 + vperm VPS2,VS2,VS3,VP3 // VPU + + stvx VPS1,DST,BK // LSU Store D4 + addi BK,BK,16 // IU1 Increment Byte_Kount+32 by 16 + vperm VPS3,VS3,VS4,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS2,DST,BK // LSU Store D5 + addi BK,BK,16 // IU1 Increment Byte_Kount+48 by 16 + vperm VPS4,VS4,VS5,VP3 // VPU + + stvx VPS3,DST,BK // LSU Store D6 + addi BK,BK,16 // IU1 Increment Byte_Kount+64 by 16 + vperm VPS5,VS5,VS6,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS4,DST,BK // LSU Store D7 + addi BK,BK,16 // IU1 Increment Byte_Kount+80 by 16 + vperm VPS6,VS6,VS7,VP3 // VPU + + stvx VPS5,DST,BK // LSU Store D8 + addi BK,BK,16 // IU1 Increment Byte_Kount+96 by 16 + vperm VPS7,VS7,VS0,VP3 // VPU + + DCBK // LSU then Kill instead of RWITM + + stvx VPS6,DST,BK // LSU Store D9 + addi BK,BK,16 // IU1 Increment Byte_Kount+112 by 16 + + stvx VPS7,DST,BK // LSU Store D10 + addi BK,BK,16 // IU1 Increment Byte_Kount+128 by 16 + bdnz Loop_of_128B // b if ctr > 0 (QW/8 still > 0) + + mtctr QW // IU1 Restore QW remaining to counter + addi BL,BK,16 // IU1 Create an alternate byte kount + 16 + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + + bdnz B32_fwd // b and decrement counter for last QW store odd + // One of the above branches should have taken + +// End of memcpy in AltiVec + +// bcopy works like memcpy, but the source and destination operands are reversed. +// Following will just reverse the operands and branch to memcpy. 
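For reference, the argument swap implemented below is equivalent to the following C; bcopy_ref is an illustrative name, and the assembly simply exchanges r3 and r4 before branching to memcpy.

#include <string.h>

/* Illustrative equivalent of the wrapper below: bcopy takes (src, dst, len),
 * memcpy takes (dst, src, len). */
void bcopy_ref(const void *src, void *dst, size_t len)
{
    memcpy(dst, src, len);
}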
+ +#ifdef LIBMOTOVEC + .global bcopy +bcopy: +#else + .global vec_bcopy +vec_bcopy: +#endif + mr Rt,DST // temp storage for what is really source address (r3) + mr DST,SRC // swap destination address to r3 to match memcpy dst + mr SRC,Rt // Complete swap of destination and source for memcpy +#ifdef LIBMOTOVEC + b memcpy // b to memcpy with correct args in r3 and r4 +#else + b vec_memcpy // b to vec_memcpy with correct args in r3 and r4 +#endif +// End of bcopy in AltiVec diff --git a/liboil/motovec/vec_memset.S b/liboil/motovec/vec_memset.S new file mode 100644 index 0000000..2b00e80 --- /dev/null +++ b/liboil/motovec/vec_memset.S @@ -0,0 +1,553 @@ +//------------------------------------------------------------------ +// file: vec_memset.S +// AltiVec enabled version of memset and bzero and cacheable_memzero +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2002 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. +// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void *memset( void *ptr, int val, size_t len ); +// Copies val into each of len characters beginning at ptr. +// - Harbison&Steele 4th ed +// (despite val being an int, this memset assumes it is never +// more than a byte. That seems to be correct from all the +// memset functions I've seen but I don't know if ANSI allows +// anthing longer. Chuck Corley 12/21/02) +// Returns: +// void * ptr +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern void * bzero( char *ptr, int len); +// Copies 0 into each of len characters at ptr. +// - Harbison&Steele 4th ed +// Returns: +// void * ptr +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 02/09/03 +// Could benefit from changes added to memcpy +// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03 +// +// This is beta quality code; users are encouraged to make it faster. 
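On the prototype question raised in the header comment above: ISO C specifies that memset converts its int argument to unsigned char before filling, so using only the low byte is standard-conforming behavior rather than just a common-case assumption. A reference version in C (memset_ref is an illustrative name):

#include <stddef.h>

/* Reference semantics: the int fill value is reduced to its low byte. */
void *memset_ref(void *ptr, int val, size_t len)
{
    unsigned char *p = ptr;
    unsigned char b = (unsigned char)val;   /* only the low byte is used */
    size_t i;

    for (i = 0; i < len; i++)
        p[i] = b;
    return ptr;
}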
+// ASSUMPTIONS: +// Code is highly likely to be in the cache; data is not (streaming data) +// Zero fill could be quite likely. +// Moving fill byte from GPR to VR as below faster than stw->lvebx via stack + +#define VRSV 256 // VRSAVE spr +// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 + +// Register useage +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dest pointer; exiting: same dest pointer + +#define FILL r4 // entering: fill char then fill word + +#define BC r5 // entering: Byte_Count then remaining Byte_Count + +#define DBC r6// dst + byte count + +#define BK r7 // BC - 1 +/- (n*16) + +#define Fsh r8 // fill byte shifted right one nibble + +#define DM1 r9// dst -1 for byte-by-byte backwards initially +#define D r9 // (dst+16)[0:27] - dst[28:31] +#define DNX r9 // (dst+n*16)[28:31] +#define BL r9 // second byte_kount index pointer + +#define DR r10 // (dst+16)[0:27] +#define QW r10 // number of cache lines + +#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31] + +#define RSV r12 // storage for VRSAVE register if used + +// Condition register use (not including temporary cr0) +// cr0[2] = (FILL==0)? +// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move) +// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified) +// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified) +// cr6[2] = (QW == 0)? 1 : 0; +// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?) +// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment) +// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?) +// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors) +// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?) +// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?) + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcba is a performance boost. +// We use dcba which will noop to non-cacheable memory rather than +// dcbz which will cause an aligment exception. +#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBK .long 0x7c033dec +// dcba r3,r7 or dcba DST,BK +#else +#ifdef __ghs__ +.macro DCBK +.long 0x7c033dec +.endm +#else +#define DCBK dcba DST,BK +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBK nop +#endif // NO_DCBA + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global memset +memset: +#else + .global vec_memset +vec_memset: +#endif + + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count + rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift + + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + addi DR,DST,16 // IU1 Address of second dst vector + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,v_memset // b if BC>MIN_VEC + + mtctr BC // for (i=1;i<=BC;i++) + beqlr cr1 // return if BC = 0 +Byte_set: + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL + bdnz Byte_set + + blr + +v_memset: +// Byte count < MIN_VEC bytes will have been set by scalar code above, +// so this will not deal with small block sets < MIN_VEC. + +// For systems using VRSAVE, define VRSAV=1 when compiling. 
For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last dst byte + +#ifdef VRSAVE + oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine +#endif + subf D,DST,DR // IU1 How many bytes in first destination? + li BK,0 // IU1 Initialize byte kount index + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + vxor v0,v0,v0 // VIU Clear v0 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr1,0,D,16 // IU1 Is D0 left justified? + beq+ enter_bzero // b if FILL==0 + + lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR + vspltisb v1,4 // VPU Splat 0x4 to every byte + + lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR + + vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3] + + vor v0,v0,v2 // VIU Form FILL byte in VR[0:7] + + vspltb v0,v0,0 // VPU Splat the fill byte to all bytes +enter_bzero: + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + beq cr1,Left_just // b if D0 is left justified + + bns cr7,No_B_fwd // b if only even number of bytes to store + + stvebx v0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +No_B_fwd: + bne cr7,No_H_fwd // b if only words to store + + stvehx v0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +No_H_fwd: + bng cr7,No_W1_fwd // b if exactly zero or two words to store + + stvewx v0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +No_W1_fwd: + bnl cr7,No_W2_fwd // b if there was only one word to store + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three + b No_W2_fwd + +Left_just: + stvx v0,0,DST // LSU Store 16 bytes at D0 +No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? + ble cr6,Last_QW // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 + +QW_loop: + stvx v0,DST,BK // LSU Store 16 fill bytes + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,GT_4QW_fwd // b if >4 quad words left + +Last_QW: // Next vector is the last; we're done. 
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + + beq cr1,Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li BL,0 // IU1 Initialize index pointer + bnl cr7,Only_1W_fwd // b if there was only one or zero words to store + + stvewx v0,DBK,BL // LSU store word 1 of two or three + addi BL,BL,4 // IU1 increment index + + stvewx v0,DBK,BL // LSU store word 2 of two or three + addi BL,BL,4 // IU1 increment index +Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx v0,DBK,BL // LSU store word 3 of three if necessary + addi BL,BL,4 // IU1 increment index +Only_2W_fwd: + bne cr7,Only_B_fwd // b if there are no half words to store + + stvehx v0,DBK,BL // LSU store one halfword if necessary + addi BL,BL,2 // IU1 increment index +Only_B_fwd: + bns cr7,All_done_fwd // b if there are no bytes to store + + stvebx v0,DBK,BL // LSU store one byte if necessary + b All_done_fwd + +Rt_just_fwd: + + stvx v0,DST,BK // LSU Store 16 bytes at D14 +All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + stvx v0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + + bns cr6,B32_fwd // b if DST[27] == 0; i.e, final store is even + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + bdnz B32_fwd // decrement counter for last QW store odd + +B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + DCBK // LSU then Kill instead of RWITM + + stvx v0,DST,BK // LSU Store 16 bytes at D11 + addi BK,BK,16 // IU1 Increment byte count + bdz Nxt_loc_fwd // always decrement and branch to next instr + +Nxt_loc_fwd: + stvx v0,DST,BK // LSU Store 16 bytes at D12 + addi BK,BK,16 // IU1 Increment byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,One_even_QW // b if there is one even and one odd QW to store + b Last_QW // b if last store is to even address + +// Come here with two more loads and two stores to do +One_even_QW: + stvx v0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b Last_QW + +// End of memset in AltiVec + +#define BCz r4 // in bzero r4 enters with byte count + +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global bzero +bzero: +#else + .global vec_bzero +vec_bzero: +#endif + + mr BC,BCz // IU1 arg[2] is BC here, not FILL + li FILL,0 // IU1 for bzero FILL=0 +#ifdef LIBMOTOVEC + b memset +#else + b vec_memset +#endif + +// cacheable_memzero will employ dcbz to clear 32 bytes at a time +// of cacheable memory. Like bzero, second entering argument will be BC. +// Using this for non-cacheable memory will generate an alignment exception. 
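A usage sketch of that distinction follows; the prototypes mirror the header comments in this file and are assumptions, since the assembly defines only the entry points.

/* Assumed prototypes, following the header comments in this file. */
extern void *bzero(char *ptr, int len);
extern void *cacheable_memzero(char *ptr, int len);

/* cacheable_memzero relies on dcbz, which generates an alignment exception
 * on non-cacheable memory, so fall back to bzero when cacheability is not
 * guaranteed. */
void clear_region(char *ptr, int len, int is_cacheable)
{
    if (is_cacheable)
        cacheable_memzero(ptr, len);   /* dcbz path, 32 bytes per cache line */
    else
        bzero(ptr, len);
}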
+ + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global cacheable_memzero +cacheable_memzero: +#else + .global vec_cacheable_memzero +vec_cacheable_memzero: +#endif + + mr BC,BCz // IU1 arg[2] is BC here, not FILL + li FILL,0 // IU1 for bzero FILL=0 + cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count + + cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count + + addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination + addi DR,DST,16 // IU1 Address of second dst vector + add DBC,DST,BC // IU1 Address of last dst byte + 1 + bgt cr7,c_v_memset // b if BC>MIN_VEC + + mtctr BC // for (i=1;i<=BC;i++) + beqlr cr1 // return if BC = 0 +c_Byte_set: + stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL + bdnz c_Byte_set + + blr + +c_v_memset: +// Byte count < MIN_VEC bytes will have been set by scalar code above, +// so this will not deal with small block sets < MIN_VEC. + +// For systems using VRSAVE, define VRSAV=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27] + addi DBK,DBC,-1 // IU1 Address of last dst byte + +#ifdef VRSAVE + oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine +#endif + subf D,DST,DR // IU1 How many bytes in first destination? + li BK,0 // IU1 Initialize byte kount index + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif + vxor v0,v0,v0 // VIU Clear v0 + subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16) + cmpi cr1,0,D,16 // IU1 Is D0 left justified? + + mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7 + rlwinm QW,QW,28,4,31 // IU1 Quad words remaining + beq cr1,c_Left_just // b if D0 is left justified + + bns cr7,c_No_B_fwd // b if only even number of bytes to store + + stvebx v0,DST,BK // LSU store first byte at DST+0 + addi BK,BK,1 // IU1 increment index +c_No_B_fwd: + bne cr7,c_No_H_fwd // b if only words to store + + stvehx v0,DST,BK // LSU store halfword at DST+0/1 + addi BK,BK,2 // IU1 increment index +c_No_H_fwd: + bng cr7,c_No_W1_fwd // b if exactly zero or two words to store + + stvewx v0,DST,BK // LSU store word 1 of one or three + addi BK,BK,4 // IU1 increment index + +c_No_W1_fwd: + bnl cr7,c_No_W2_fwd // b if there was only one word to store + stvewx v0,DST,BK // LSU store word 1 of two or 2 of three + addi BK,BK,4 // IU1 increment index + + stvewx v0,DST,BK // LSU store word 2 of two or 3 of three + b c_No_W2_fwd + +c_Left_just: + stvx v0,0,DST // LSU Store 16 bytes at D0 +c_No_W2_fwd: + rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31] + cmpi cr6,0,QW,0 // IU1 Any full vectors to move? + + li BK,16 // IU1 Re-initialize byte kount index + cmpi cr1,0,Rt,0xF // IU1 Is DN right justified? + ble cr6,c_Last_QW // b if no Quad words to do + + mtctr QW // IU2 for (i=0;i<=QW;i++) + cmpi cr6,0,QW,4 // IU1 Check QW>4 + +c_QW_loop: + stvx v0,DST,BK // LSU Store 16 fill bytes + addi BK,BK,16 // IU1 Increment byte kount index + bdnzf 25,c_QW_loop // b if 4 or less quad words to do + + add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4) + addi QW,QW,-1 // IU1 One more QW stored by now + bgt cr6,c_GT_4QW_fwd // b if >4 quad words left + +c_Last_QW: // Next vector is the last; we're done. 
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7 + + beq cr1,c_Rt_just_fwd // b if last destination is right justified + + rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte + li BL,0 // IU1 Initialize index pointer + bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store + + stvewx v0,DBK,BL // LSU store word 1 of two or three + addi BL,BL,4 // IU1 increment index + + stvewx v0,DBK,BL // LSU store word 2 of two or three + addi BL,BL,4 // IU1 increment index +c_Only_1W_fwd: + bng cr7,Only_2W_fwd // b if there were only two or zero words to store + + stvewx v0,DBK,BL // LSU store word 3 of three if necessary + addi BL,BL,4 // IU1 increment index +c_Only_2W_fwd: + bne cr7,c_Only_B_fwd // b if there are no half words to store + + stvehx v0,DBK,BL // LSU store one halfword if necessary + addi BL,BL,2 // IU1 increment index +c_Only_B_fwd: + bns cr7,c_All_done_fwd // b if there are no bytes to store + + stvebx v0,DBK,BL // LSU store one byte if necessary + b c_All_done_fwd + +c_Rt_just_fwd: + + stvx v0,DST,BK // LSU Store 16 bytes at D14 +c_All_done_fwd: +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr // Return destination address from entry + +#ifdef __MWERKS__ + .align 16 +#else + .align 4 +#endif +c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice + + addi QW,QW,-1 // IU1 Keeping track of QWs stored + mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0; + addi DNX,DNX,16 // IU1 Update cr6 for next loop + + stvx v0,DST,BK // LSU Store 16 bytes at D2 + addi BK,BK,16 // IU1 Increment byte count by 16 + bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL + + mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?) + + bns cr6,c_B32_fwd // b if DST[27] == 0; i.e, final store is even + +// We need the ctr register to reflect an even byte count before entering +// the next block - faster to decrement than to reload. + bdnz B32_fwd // decrement counter for last QW store odd + +c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned + dcbz DST,BK // LSU zero whole cache line + bdz c_Nxt_loc_fwd // always decrement and branch to next instr + +c_Nxt_loc_fwd: + addi BK,BK,32 // IU1 Increment byte count + bdnz B32_fwd // b if there are at least two more QWs to do + + bso cr6,c_One_even_QW // b if there is one even and one odd QW to store + b c_Last_QW // b if last store is to even address + +// Come here with two more loads and two stores to do +c_One_even_QW: + stvx v0,DST,BK // LSU Store 16 bytes at D13 + addi BK,BK,16 // IU1 Increment byte count + + b c_Last_QW + +// End of cacheable_memzero in AltiVec diff --git a/liboil/motovec/vec_strcpy.S b/liboil/motovec/vec_strcpy.S new file mode 100644 index 0000000..c31beaa --- /dev/null +++ b/liboil/motovec/vec_strcpy.S @@ -0,0 +1,273 @@ +//------------------------------------------------------------------ +// file: vec_strcpy.S +// AltiVec enabled version of strcpy and strncpy +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// Copyright Motorola, Inc. 2003 +// ALL RIGHTS RESERVED +// +// You are hereby granted a copyright license to use, modify, and +// distribute the SOFTWARE so long as this entire notice is retained +// without alteration in any modified and/or redistributed versions, +// and that such modified versions are clearly identified as such. 
+// No licenses are granted by implication, estoppel or otherwise under +// any patents or trademarks of Motorola, Inc. +// +// The SOFTWARE is provided on an "AS IS" basis and without warranty. +// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS +// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED +// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR +// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH +// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS +// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. +// +// To the maximum extent permitted by applicable law, IN NO EVENT SHALL +// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER +// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF +// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS +// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR +// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility +// for the maintenance and support of the SOFTWARE. +//------------------------------------------------------------------ + +//------------------------------------------------------------------ +// extern char *vec_strcpy(char *dest, const char *src); +// +// Returns: +// char *dest +//------------------------------------------------------------------ + +// Revision History: +// Rev 0.0 Original Chuck Corley 03/22/02 +// Rev 0.1 Modified per vec_memcpy rev 0.30 Chuck Corley 05/24/03 +// + +// Harbison and Steele says "the results of both strcpy, strncpy, ... are +// unpredictable if the two string arguments overlap in memory." +// Since we do not know the address of the end of the string, copying +// from back to front is not an option. Therefore we always "copy forward." + +#define VRSV 256 // VRSAVE spr +// Use scalar for first MIN_SCALAR bytes. Overhead for vector is too great to win. +#define MIN_SCALAR 32 +// Also don't use vectors if |DST-SRC| <= MIN_VEC. Works only if MIN_VEC >= 16 bytes. +#define MIN_VEC 16 +#define PAGE_SIZE 4096 // True for G4 with AltiVec + +// Register useage: +#define Rt r0 // r0 when used as a temporary register + +#define DST r3 // entering: dst pointer; exiting: same dst pointer + +#define SRC r4 // entering: src ptr; then end of src range index (SRC+BC) in memmove + +#define ADD r5 // Temporary future dst address +#define PBC r5 // Computed Byte_Count to next 4K page src boundary + +#define DMS r6 // dst - src initially + +#define SMD r7 // src - dst initially + +#define DD r8 // duplicate of dst register for incementing + +#define QBC r9 // Computed Byte_Count to next QW dst boundary + +#define DS r10 // duplicate of src register for speculative incementing + +#define PSZ r11 // storage for page size constant + +#define RSV r12 // storage for VRSAVE register if used + +#define V0 v0 // all zeros + +#define VS0 v1 // src vector for permuting + +#define VS1 v2 // src vector for permuting + +#define VS2 v3 // src vector for permuting + +#define VP3 v4 // alignment permute register + +#define VPS0 v5 // permuted source vector to store + +#define VPS1 v6 // 2nd permuted source vector to store + +#define VCN v7 // null comparison result register + +// Conditionalize the use of dcba. It will help if the data is +// not in cache and hurt if it is. Generally, except for small +// benchmarks repeated many times, we assume data is not in cache +// (data streaming) and using dcbz is a performance boost. 
+#ifndef NO_DCBA +#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL) + // gcc and codewarrior and diab don't assemble dcba +#define DCBA .long 0x7c0045ec +// dcba 0,r8 or dcba 0,DD +#else +#ifdef __ghs__ +.macro DCBA +.long 7c0045ec +.endm +#else +#define DCBA dcba 0,DD +#endif // __ghs__ +#endif // __GNUC__ or __MWERKS__ +#else +#define DCBA nop +#endif // NO_DCBA + + .text +#ifdef __MWERKS__ + .align 32 +#else + .align 5 +#endif + +#ifdef LIBMOTOVEC + .global strcpy +strcpy: +#else + .global vec_strcpy +vec_strcpy: +#endif + + + addi ADD,DST,32 // IU1 Next dst cacheline + subf. DMS,SRC,DST // IU1 Compute dst-src difference + subf SMD,DST,SRC // IU1 src-dst for use if dst-src<0 + + rlwinm ADD,ADD,0,0,26 // IU1 Round down to even QW + mr DD,DST // IU1 Duplicate dest + beqlr // return if DST = SRC + + bgt Pos_value // b if DST-SRC>0 + mr DMS,SMD // IU1 |dst - src| = src - dst +Pos_value: + subf. QBC,DST,ADD // IU1 Bytes to even QW start of vect (min 32) + addi ADD,DD,PAGE_SIZE // IU1 dst addr in next 4K page + cmpi cr7,0,DMS,MIN_VEC // IU1 Check for min byte count separation + + mtctr QBC // IU2 Init counter +Byte_loop: + lbzx Rt,0,SRC // LSU Get a byte + addi SRC,SRC,1 // IU1 Increment src + + cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null? + stbx Rt,0,DD // LSU Store it + addi DD,DD,1 // IU1 Increment dest + bdnzf 6,Byte_loop // b to get another if this one wasn't null + + beqlr cr1 // return if found a null + + li PSZ,PAGE_SIZE // IU1 Constant for potential use in vector + rlwinm ADD,ADD,0,0,19 // IU1 First address in next 4K page + mr DS,SRC // IU1 Get current src addr + ble cr7,Byte_loop // do by bytes forever if < MIN_VEC separation + +v_strcpy: +// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems +// that don't, make sure VRSAVE is undefined. +#ifdef VRSAVE + mfspr RSV,VRSV // IU2 Get current VRSAVE contents +#endif + subf. PBC,DD,ADD // IU1 Now bytes to next 4K page + +#ifdef VRSAVE + oris Rt,RSV,0xff00 // IU1 Or in registers used by this routine +#endif + rlwinm PBC,PBC,28,4,31 // IU1 Now QWs to next 4K page + +#ifdef VRSAVE + mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op +#endif +// Since DD has to be QW aligned at this point, we need three (or two +// if SRC[28:31]==0) source vectors to permute into two dest vectors. +// Loading beyond the end of the string should be okay as long as we don't +// cross a page boundary. + + lvsl VP3,0,SRC // LSU Create left permute vector + vxor V0,V0,V0 // VIU Clear v0 + ble New_page_0 // b if next load will cross page boundary + mtctr PBC // IU2 Okay to load up to next page +Page_0: + + lvx VS0,0,DS // LSU Get first src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_1 // b if next load will cross page boundary +Page_1: + + lvx VS1,0,DS // LSU Get second src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_2 // b if next load will cross page boundary +Page_2: + + lvx VS2,0,DS // LSU Get third src vector + addi DS,DS,16 // IU1 Increment vector src pointer + bdz New_page_3 // b if next load will cross page boundary +Page_3: + + vperm VPS0,VS0,VS1,VP3 // VPU Align S0 and S1 to D0 + + vperm VPS1,VS1,VS2,VP3 // VPU Align S1 and S2 to D1 + vor VS0,VS2,VS2 // VIU1 Move upper vector to lower + + vcmpequb. VCN,V0,VPS0 // VIU1 Check for null + bne cr6,Final_0 // b if found a null in this permuted source vector + addi SRC,SRC,16 // IU1 Increment byte src pointer + + vcmpequb. 
VCN,V0,VPS1 // VIU1 Check for null + bne cr6,Final_1 // b if found a null in this permuted source vector + DCBA // LSU Conditionally dcba 0,DST + addi SRC,SRC,16 // IU1 Increment byte src pointer + + stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + + stvx VPS1,0,DD // LSU Store 16 bytes at dst addr D1 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + + b Page_1 + +Final_1: // Found a null in 2nd vector, store 1st vector then do bytes + stvx VPS0,0,DD // LSU Store 16 bytes at dst addr D0 + addi DD,DD,16 // IU1 Increment duplicate dst pointer + +Final_0: // Found a null in vector, load and store bytes to null instead + lbzx Rt,0,SRC // LSU Get a byte + addi SRC,SRC,1 // IU1 Increment src + + cmpi cr1,0,Rt,0 // IU1 Is the byte loaded null? + stbx Rt,0,DD // LSU Store it + addi DD,DD,1 // IU1 Increment dest + + bne cr1,Final_0 // b to get another if this one wasn't null + +#ifdef VRSAVE + mtspr VRSV,RSV // IU1 Restore VRSAVE +#endif + blr + +New_page_0: // Next load will be from new page; (ctr would have been <= zero) + mtctr PSZ // reinitialize counter + b Page_0 + +New_page_1: // Did VS0 contain any nulls? + vcmpequb. VCN,V0,VS0 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_1 + +New_page_2: // Did VS1 contain any nulls? + vcmpequb. VCN,V0,VS1 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_2 + +New_page_3: // Did VS2 contain any nulls? + vcmpequb. VCN,V0,VS2 // VIU1 Check for null + bnl cr6,Final_0 // b if found a null in this source vector + mtctr PSZ // reinitialize counter + b Page_3 + +// End of strcpy in AltiVec |
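One property the vector loop above depends on is worth spelling out: a 16-byte lvx may read past the string's terminator, which is only safe while the load cannot stray into the next, possibly unmapped, 4 KiB page; the counter therefore limits vector loads to the current page, and the terminator is re-checked at each page boundary before loading continues. A small sketch of that bookkeeping (quadwords_before_page_end is an illustrative helper, with PAGE_SIZE as defined above):

#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096   /* matches the constant defined above (true for G4) */

/* Number of 16-byte loads that can start at p before crossing into the
 * next 4 KiB page (illustrative only, not part of the assembly). */
static size_t quadwords_before_page_end(const void *p)
{
    uintptr_t a = (uintptr_t)p;
    uintptr_t next_page = (a + PAGE_SIZE) & ~(uintptr_t)(PAGE_SIZE - 1);
    return (next_page - a) / 16;
}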