Diffstat (limited to 'powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S')
-rw-r--r-- | powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S | 981 |
1 file changed, 0 insertions, 981 deletions
diff --git a/powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S b/powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S
deleted file mode 100644
index 8f74ca7044..0000000000
--- a/powerpc-cpu/sysdeps/powerpc/powerpc64/power4/memcmp.S
+++ /dev/null
@@ -1,981 +0,0 @@
-/* Optimized memcmp implementation for PowerPC64.
-   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
-   02110-1301 USA.  */
-
-#include <sysdep.h>
-#include <bp-sym.h>
-#include <bp-asm.h>
-
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
-
-EALIGN (BP_SYM(memcmp), 4, 0)
-	CALL_MCOUNT 3
-
-#define rTMP	r0
-#define rRTN	r3
-#define rSTR1	r3	/* first string arg */
-#define rSTR2	r4	/* second string arg */
-#define rN	r5	/* max string length */
-/* Note: The Bounded pointer support in this code is broken.  This code
-   was inherited from PPC32 and that support was never completed.
-   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
-#define rWORD1	r6	/* current word in s1 */
-#define rWORD2	r7	/* current word in s2 */
-#define rWORD3	r8	/* next word in s1 */
-#define rWORD4	r9	/* next word in s2 */
-#define rWORD5	r10	/* next word in s1 */
-#define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
-#define rWORD7	r30	/* next word in s1 */
-#define rWORD8	r31	/* next word in s2 */
-
-	xor	rTMP, rSTR2, rSTR1
-	cmpldi	cr6, rN, 0
-	cmpldi	cr1, rN, 12
-	clrldi.	rTMP, rTMP, 61
-	clrldi	rBITDIF, rSTR1, 61
-	cmpldi	cr5, rBITDIF, 0
-	beq-	cr6, L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
-/* If less than 8 bytes or not aligned, use the unaligned
-   byte loop.  */
-	blt	cr1, L(bytealigned)
-	std	rWORD8,-8(r1)
-	cfi_offset(rWORD8,-8)
-	std	rWORD7,-16(r1)
-	cfi_offset(rWORD7,-16)
-	bne	L(unaligned)
-/* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
-   3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
-   aligned and can perform the DWaligned loop.
-
-   Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we can force the string addresses to the next lower DW
-   boundary and special case this first DW using shift left to
-   eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
-   we need to adjust the length (rN) and special case the loop
-   versioning for the first DW.  This ensures that the loop count is
-   correct and the first DW (shifted) is in the expected register pair.  */
-	.align 4
-L(samealignment):
-	clrrdi	rSTR1, rSTR1, 3
-	clrrdi	rSTR2, rSTR2, 3
-	beq	cr5, L(DWaligned)
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	beq	L(dPs4)
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	bgt	cr1, L(dPs3)
-	beq	cr1, L(dPs2)
-
-/* Remainder is 8 */
-	.align 3
-L(dsP1):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
-	cmpld	cr5, rWORD5, rWORD6
-	blt	cr7, L(dP1x)
-/* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	b	L(dP1e)
-/* Remainder is 16 */
-	.align 4
-L(dPs2):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP2x)
-/* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	b	L(dP2e)
-/* Remainder is 24 */
-	.align 4
-L(dPs3):
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD2, r11
-	cmpld	cr1, rWORD3, rWORD4
-	b	L(dP3e)
-/* Count is a multiple of 32, remainder is 0 */
-	.align 4
-L(dPs4):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD2, r11
-	cmpld	cr0, rWORD1, rWORD2
-	b	L(dP4e)
-
-/* At this point we know both strings are double word aligned and the
-   compare length is at least 8 bytes.  */
-	.align 4
-L(DWaligned):
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	cmpldi	cr1, rBITDIF, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	beq	L(dP4)
-	bgt	cr1, L(dP3)
-	beq	cr1, L(dP2)
-
-/* Remainder is 8 */
-	.align 4
-L(dP1):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
-   (8-15 byte compare), we want to use only volatile registers.  This
-   means we can avoid restoring non-volatile registers since we did not
-   change any on the early exit path.  The key here is the non-early
-   exit path only cares about the condition code (cr5), not about which
-   register pair was used.  */
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-	cmpld	cr5, rWORD5, rWORD6
-	blt	cr7, L(dP1x)
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-L(dP1e):
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	bne	cr0, L(dLcr0)
-
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-	bne	cr1, L(dLcr1)
-	cmpld	cr5, rWORD7, rWORD8
-	bdnz	L(dLoop)
-	bne	cr6, L(dLcr6)
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	.align 3
-L(dP1x):
-	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	L(d00)
-	li	rRTN, 0
-	blr
-
-/* Remainder is 16 */
-	.align 4
-L(dP2):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP2x)
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-L(dP2e):
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(dLcr6)
-	bne	cr5, L(dLcr5)
-	b	L(dLoop2)
-/* Again we are on an early exit path (16-23 byte compare), we want to
-   only use volatile registers and avoid restoring non-volatile
-   registers.  */
-	.align 4
-L(dP2x):
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-	cmpld	cr5, rWORD3, rWORD4
-	sldi.	r12, rN, 3
-	bne	cr6, L(dLcr6)
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr5, L(dLcr5)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	L(d00)
-	li	rRTN, 0
-	blr
-
-/* Remainder is 24 */
-	.align 4
-L(dP3):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	ld	rWORD3, 0(rSTR1)
-	ld	rWORD4, 0(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-L(dP3e):
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	blt	cr7, L(dP3x)
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	bne	cr1, L(dLcr1)
-	bne	cr6, L(dLcr6)
-	b	L(dLoop1)
-/* Again we are on an early exit path (24-31 byte compare), we want to
-   only use volatile registers and avoid restoring non-volatile
-   registers.  */
-	.align 4
-L(dP3x):
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-	cmpld	cr5, rWORD1, rWORD2
-	sldi.	r12, rN, 3
-	bne	cr1, L(dLcr1)
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	bne	cr6, L(dLcr6)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr5, L(dLcr5)
-	bne	L(d00)
-	li	rRTN, 0
-	blr
-
-/* Count is a multiple of 32, remainder is 0 */
-	.align 4
-L(dP4):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-L(dP4e):
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
-	bne	cr1, L(dLcr1)
-	bdz-	L(d24)	/* Adjust CTR as we start with +4 */
-/* This is the primary loop */
-	.align 4
-L(dLoop):
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(dLcr6)
-L(dLoop1):
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-L(dLoop2):
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
-L(dLoop3):
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-	bne-	cr1, L(dLcr1)
-	cmpld	cr0, rWORD1, rWORD2
-	bdnz+	L(dLoop)
-
-L(dL4):
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(dLcr6)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	cmpld	cr5, rWORD7, rWORD8
-L(d44):
-	bne	cr0, L(dLcr0)
-L(d34):
-	bne	cr1, L(dLcr1)
-L(d24):
-	bne	cr6, L(dLcr6)
-L(d14):
-	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
-L(d04):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	beq	L(zeroLength)
-/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
-   we are aligned it is safe to load the whole double word, and use
-   shift right double to eliminate bits beyond the compare length.  */
-L(d00):
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	srd	rWORD1, rWORD1, rN
-	srd	rWORD2, rWORD2, rN
-	cmpld	cr5, rWORD1, rWORD2
-	bne	cr5, L(dLcr5x)
-	li	rRTN, 0
-	blr
-	.align 4
-L(dLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgtlr	cr0
-	li	rRTN, -1
-	blr
-	.align 4
-L(dLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgtlr	cr1
-	li	rRTN, -1
-	blr
-	.align 4
-L(dLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgtlr	cr6
-	li	rRTN, -1
-	blr
-	.align 4
-L(dLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-L(dLcr5x):
-	li	rRTN, 1
-	bgtlr	cr5
-	li	rRTN, -1
-	blr
-
-	.align 4
-L(bytealigned):
-	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
-	beq-	cr6, L(zeroLength)
-
-/* We need to prime this loop.  This loop is swing modulo scheduled
-   to avoid pipe delays.  The dependent instruction latency (load to
-   compare to conditional branch) is 2 to 3 cycles.  In this loop each
-   dispatch group ends in a branch and takes 1 cycle.  Effectively
-   the first iteration of the loop only serves to load operands, and
-   branches based on compares are delayed until the next iteration.
-
-   So we must precondition some registers and condition codes so that
-   we don't exit the loop early on the first iteration.  */
-
-	lbz	rWORD1, 0(rSTR1)
-	lbz	rWORD2, 0(rSTR2)
-	bdz-	L(b11)
-	cmpld	cr0, rWORD1, rWORD2
-	lbz	rWORD3, 1(rSTR1)
-	lbz	rWORD4, 1(rSTR2)
-	bdz-	L(b12)
-	cmpld	cr1, rWORD3, rWORD4
-	lbzu	rWORD5, 2(rSTR1)
-	lbzu	rWORD6, 2(rSTR2)
-	bdz-	L(b13)
-	.align 4
-L(bLoop):
-	lbzu	rWORD1, 1(rSTR1)
-	lbzu	rWORD2, 1(rSTR2)
-	bne-	cr0, L(bLcr0)
-
-	cmpld	cr6, rWORD5, rWORD6
-	bdz-	L(b3i)
-
-	lbzu	rWORD3, 1(rSTR1)
-	lbzu	rWORD4, 1(rSTR2)
-	bne-	cr1, L(bLcr1)
-
-	cmpld	cr0, rWORD1, rWORD2
-	bdz-	L(b2i)
-
-	lbzu	rWORD5, 1(rSTR1)
-	lbzu	rWORD6, 1(rSTR2)
-	bne-	cr6, L(bLcr6)
-
-	cmpld	cr1, rWORD3, rWORD4
-	bdnz+	L(bLoop)
-
-/* We speculatively load bytes before we have tested the previous
-   bytes.  But we must avoid overrunning the length (in the ctr) to
-   prevent these speculative loads from causing a segfault.  In this
-   case the loop will exit early (before all pending bytes are
-   tested), and we must complete the pending operations before
-   returning.  */
-L(b1i):
-	bne-	cr0, L(bLcr0)
-	bne-	cr1, L(bLcr1)
-	b	L(bx56)
-	.align 4
-L(b2i):
-	bne-	cr6, L(bLcr6)
-	bne-	cr0, L(bLcr0)
-	b	L(bx34)
-	.align 4
-L(b3i):
-	bne-	cr1, L(bLcr1)
-	bne-	cr6, L(bLcr6)
-	b	L(bx12)
-	.align 4
-L(bLcr0):
-	li	rRTN, 1
-	bgtlr	cr0
-	li	rRTN, -1
-	blr
-L(bLcr1):
-	li	rRTN, 1
-	bgtlr	cr1
-	li	rRTN, -1
-	blr
-L(bLcr6):
-	li	rRTN, 1
-	bgtlr	cr6
-	li	rRTN, -1
-	blr
-
-L(b13):
-	bne-	cr0, L(bx12)
-	bne-	cr1, L(bx34)
-L(bx56):
-	sub	rRTN, rWORD5, rWORD6
-	blr
-	nop
-L(b12):
-	bne-	cr0, L(bx12)
-L(bx34):
-	sub	rRTN, rWORD3, rWORD4
-	blr
-L(b11):
-L(bx12):
-	sub	rRTN, rWORD1, rWORD2
-	blr
-	.align 4
-L(zeroLengthReturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-L(zeroLength):
-	li	rRTN, 0
-	blr
-
-	.align 4
-/* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
-   3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is double word
-   aligned and we can perform the DWunaligned loop.
-
-   Otherwise we know that rSTR1 is not yet DW aligned.
-   So we can force the string addresses to the next lower DW
-   boundary and special case this first DW using shift left to
-   eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
-   we need to adjust the length (rN) and special case the loop
-   versioning for the first DW.  This ensures that the loop count is
-   correct and the first DW (shifted) is in the expected register pair.  */
-#define rSHL	r29	/* Unaligned shift left count. */
-#define rSHR	r28	/* Unaligned shift right count. */
-#define rB	r27	/* Left rotation temp for rWORD2. */
-#define rD	r26	/* Left rotation temp for rWORD4. */
-#define rF	r25	/* Left rotation temp for rWORD6. */
-#define rH	r24	/* Left rotation temp for rWORD8. */
-#define rA	r0	/* Right rotation temp for rWORD2. */
-#define rC	r12	/* Right rotation temp for rWORD4. */
-#define rE	r0	/* Right rotation temp for rWORD6. */
-#define rG	r12	/* Right rotation temp for rWORD8. */
-L(unaligned):
-	std	r29,-24(r1)
-	cfi_offset(r29,-24)
-	clrldi	rSHL, rSTR2, 61
-	beq-	cr6, L(duzeroLength)
-	std	r28,-32(r1)
-	cfi_offset(r28,-32)
-	beq	cr5, L(DWunaligned)
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-/* Adjust the logical start of rSTR2 to compensate for the extra bits
-   in the 1st rSTR1 DW.  */
-	sub	r27, rSTR2, rBITDIF
-/* But do not attempt to address the DW before the DW that contains
-   the actual start of rSTR2.  */
-	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-/* Compute the left/right shift counts for the unaligned rSTR2,
-   compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL, r27, 61
-	clrrdi	rSTR1, rSTR1, 3
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	sldi	rSHL, rSHL, 3
-	cmpld	cr5, r27, rSTR2
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	subfic	rSHR, rSHL, 64
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
-   this special case those bits may be discarded anyway.  Also we
-   must avoid loading a DW where none of the bits are part of rSTR2 as
-   this may cross a page boundary and cause a page fault.  */
-	li	rWORD8, 0
-	blt	cr5, L(dus0)
-	ld	rWORD8, 0(rSTR2)
-	la	rSTR2, 8(rSTR2)
-	sld	rWORD8, rWORD8, rSHL
-
-L(dus0):
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
-	cmpldi	cr7, rN, 32
-	srd	rG, rWORD2, rSHR
-	clrldi	rN, rN, 61
-	beq	L(duPs4)
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	bgt	cr1, L(duPs3)
-	beq	cr1, L(duPs2)
-
-/* Remainder is 8 */
-	.align 4
-L(dusP1):
-	sld	rB, rWORD2, rSHL
-	sld	rWORD7, rWORD1, r11
-	sld	rWORD8, rWORD8, r11
-	bge	cr7, L(duP1e)
-/* At this point we exit early with the first double word compare
-   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
-   how we handle the remaining bytes.  */
-	cmpld	cr5, rWORD7, rWORD8
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	rA, 0
-	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	b	L(dutrim)
-/* Remainder is 16 */
-	.align 4
-L(duPs2):
-	sld	rH, rWORD2, rSHL
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD8, r11
-	b	L(duP2e)
-/* Remainder is 24 */
-	.align 4
-L(duPs3):
-	sld	rF, rWORD2, rSHL
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD8, r11
-	b	L(duP3e)
-/* Count is a multiple of 32, remainder is 0 */
-	.align 4
-L(duPs4):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	sld	rD, rWORD2, rSHL
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD8, r11
-	b	L(duP4e)
-
-/* At this point we know rSTR1 is double word aligned and the
-   compare length is at least 8 bytes.  */
-	.align 4
-L(DWunaligned):
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	sldi	rSHL, rSHL, 3
-	ld	rWORD6, 0(rSTR2)
-	ldu	rWORD8, 8(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
-	cmpldi	cr7, rN, 32
-	clrldi	rN, rN, 61
-	subfic	rSHR, rSHL, 64
-	sld	rH, rWORD6, rSHL
-	beq	L(duP4)
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	bgt	cr1, L(duP3)
-	beq	cr1, L(duP2)
-
-/* Remainder is 8 */
-	.align 4
-L(duP1):
-	srd	rG, rWORD8, rSHR
-	ld	rWORD7, 0(rSTR1)
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
-	blt	cr7, L(duP1x)
-L(duP1e):
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	bne	cr5, L(duLcr5)
-	or	rWORD4, rC, rD
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	bne	cr0, L(duLcr0)
-	or	rWORD6, rE, rF
-	cmpld	cr6, rWORD5, rWORD6
-	b	L(duLoop3)
-	.align 4
-/* At this point we exit early with the first double word compare
-   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
-   how we handle the remaining bytes.  */
-L(duP1x):
-	cmpld	cr5, rWORD7, rWORD8
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	rA, 0
-	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	b	L(dutrim)
-/* Remainder is 16 */
-	.align 4
-L(duP2):
-	srd	rE, rWORD8, rSHR
-	ld	rWORD5, 0(rSTR1)
-	or	rWORD6, rE, rH
-	sld	rH, rWORD8, rSHL
-L(duP2e):
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
-	blt	cr7, L(duP2x)
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	cmpld	cr1, rWORD3, rWORD4
-	b	L(duLoop2)
-	.align 4
-L(duP2x):
-	cmpld	cr5, rWORD7, rWORD8
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(duLcr6)
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	rA, 0
-	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	b	L(dutrim)
-
-/* Remainder is 24 */
-	.align 4
-L(duP3):
-	srd	rC, rWORD8, rSHR
-	ld	rWORD3, 0(rSTR1)
-	sld	rF, rWORD8, rSHL
-	or	rWORD4, rC, rH
-L(duP3e):
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
-	blt	cr7, L(duP3x)
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	cmpld	cr0, rWORD1, rWORD2
-	b	L(duLoop1)
-	.align 4
-L(duP3x):
-	addi	rSTR1, rSTR1, 16
-	addi	rSTR2, rSTR2, 16
-	bne	cr1, L(duLcr1)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr6, L(duLcr6)
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	rA, 0
-	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	b	L(dutrim)
-
-/* Count is a multiple of 32, remainder is 0 */
-	.align 4
-L(duP4):
-	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	srd	rA, rWORD8, rSHR
-	ld	rWORD1, 0(rSTR1)
-	sld	rD, rWORD8, rSHL
-	or	rWORD2, rA, rH
-L(duP4e):
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
-	cmpld	cr5, rWORD7, rWORD8
-	bdz-	L(du24)	/* Adjust CTR as we start with +4 */
-/* This is the primary loop */
-	.align 4
-L(duLoop):
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
-L(duLoop1):
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
-L(duLoop2):
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
-L(duLoop3):
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	bne-	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
-	bdnz+	L(duLoop)
-
-L(duL4):
-	bne	cr1, L(duLcr1)
-	cmpld	cr1, rWORD3, rWORD4
-	bne	cr6, L(duLcr6)
-	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(duLcr5)
-	cmpld	cr5, rWORD7, rWORD8
-L(du44):
-	bne	cr0, L(duLcr0)
-L(du34):
-	bne	cr1, L(duLcr1)
-L(du24):
-	bne	cr6, L(duLcr6)
-L(du14):
-	sldi.	rN, rN, 3
-	bne	cr5, L(duLcr5)
-/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
-   shift right double to eliminate bits beyond the compare length.
-   This allows the use of double word subtract to compute the final
-   result.
-
-   However it may not be safe to load rWORD2 which may be beyond the
-   string length.  So we compare the bit length of the remainder to
-   the right shift count (rSHR).  If the bit count is less than or equal
-   we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
-	cmpld	cr7, rN, rSHR
-	beq	L(duZeroReturn)
-	li	rA, 0
-	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	.align 4
-L(dutrim):
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD8,-8(r1)
-	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2, rA, rB
-	ld	rWORD7,-16(r1)
-	ld	r29,-24(r1)
-	srd	rWORD1, rWORD1, rN
-	srd	rWORD2, rWORD2, rN
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	li	rRTN, 0
-	cmpld	cr0, rWORD1, rWORD2
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-	beq	cr0, L(dureturn24)
-	li	rRTN, 1
-	ld	r24,-64(r1)
-	bgtlr	cr0
-	li	rRTN, -1
-	blr
-	.align 4
-L(duLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgt	cr0, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align 4
-L(duLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgt	cr1, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align 4
-L(duLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgt	cr6, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align 4
-L(duLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN, 1
-	bgt	cr5, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN, -1
-	b	L(dureturn27)
-	.align 3
-L(duZeroReturn):
-	li	rRTN,0
-	.align 4
-L(dureturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-L(dureturn29):
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-L(dureturn27):
-	ld	r27,-40(r1)
-L(dureturn26):
-	ld	r26,-48(r1)
-L(dureturn25):
-	ld	r25,-56(r1)
-L(dureturn24):
-	ld	r24,-64(r1)
-	blr
-L(duzeroLength):
-	li	rRTN,0
-	blr
-
-END (BP_SYM (memcmp))
-libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp, bcmp)
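
For readers who do not follow PowerPC assembly, the core of the deleted routine is an aligned doubleword compare loop: both buffers are read 8 bytes at a time and the first doubleword that differs decides the result. The following is a minimal C sketch of that idea only, not a rendering of the unrolled, condition-register-pipelined loop above; the function name, the memcpy-based loads, and the plain byte tail are illustrative, and it assumes a big-endian target, where unsigned doubleword order matches bytewise memcmp order.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the aligned doubleword loop (big-endian,
   both pointers 8-byte aligned).  The first differing doubleword
   decides the result, as the cmpld/bne sequences do above.  */
static int
memcmp_dw_aligned (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t w1, w2;
  while (n >= 8)
    {
      memcpy (&w1, s1, 8);           /* ld rWORDx, 0(rSTR1) */
      memcpy (&w2, s2, 8);           /* ld rWORDy, 0(rSTR2) */
      if (w1 != w2)                  /* cmpld crN, rWORDx, rWORDy */
        return w1 > w2 ? 1 : -1;     /* big-endian: unsigned order == byte order */
      s1 += 8;
      s2 += 8;
      n -= 8;
    }
  while (n-- > 0)                    /* simple byte tail, cf. L(bytealigned) */
    {
      int d = *s1++ - *s2++;
      if (d != 0)
        return d;
    }
  return 0;
}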
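The L(d00) and L(dutrim) tails avoid a byte loop for the final 1 to 7 bytes: because those bytes sit inside one aligned doubleword, the whole doubleword is loaded and the excess low-order bytes are shifted out. Below is a hedged sketch of that trick with an invented helper name; the over-long load is only justified by the same alignment/page argument the assembly makes, so this is not portable ISO C.

#include <stdint.h>
#include <string.h>

/* rem is in [1,7]; p1 and p2 point at the final, doubleword-aligned
   chunk.  The full doubleword is loaded (it cannot cross a page
   boundary because it is aligned) and the bytes beyond the compare
   length, which are the low-order bytes on big-endian, are shifted
   out, mirroring the subfic/srd pair in the assembly.  */
static int
compare_tail_dw (const unsigned char *p1, const unsigned char *p2,
                 unsigned rem)
{
  uint64_t w1, w2;
  unsigned shift = (8 - rem) * 8;   /* subfic rN, r12, 64 */
  memcpy (&w1, p1, 8);              /* ld rWORD1, 8(rSTR1) */
  memcpy (&w2, p2, 8);              /* ld rWORD2, 8(rSTR2) */
  w1 >>= shift;                     /* srd rWORD1, rWORD1, rN */
  w2 >>= shift;                     /* srd rWORD2, rWORD2, rN */
  if (w1 != w2)
    return w1 > w2 ? 1 : -1;
  return 0;
}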
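When the two pointers are mutually misaligned (the L(DWunaligned) path), each logical doubleword of s2 is assembled from two aligned loads with a shift-left/shift-right/or merge, which is the job of the rSHL/rSHR counts and the rA..rH temporaries. Here is a rough C sketch under the same assumptions (big-endian, s1 aligned, s2 misaligned so off is 1 to 7); the names are invented, and the reads just outside the buffer rely on the alignment argument above rather than portable C.

#include <stdint.h>
#include <string.h>

/* Sketch of the unaligned merge: s1 is 8-byte aligned, s2 is not
   (off != 0).  Each logical doubleword of s2 is
   (prev << shl) | (next >> shr), built from two aligned loads, as the
   sld/srd/or sequences do.  The aligned loads may touch a few bytes
   outside the buffer but stay within its 8-byte blocks.  */
static int
memcmp_dw_unaligned (const unsigned char *s1, const unsigned char *s2,
                     size_t n)
{
  unsigned off = (uintptr_t) s2 & 7;        /* clrldi rSHL, rSTR2, 61 */
  unsigned shl = off * 8;                   /* sldi  rSHL, rSHL, 3 */
  unsigned shr = 64 - shl;                  /* subfic rSHR, rSHL, 64 */
  const unsigned char *a2 = s2 - off;       /* clrrdi rSTR2, rSTR2, 3 */
  uint64_t prev, next, w1, w2;

  memcpy (&prev, a2, 8);                    /* ld rWORD6, 0(rSTR2) */
  a2 += 8;
  while (n >= 8)
    {
      memcpy (&next, a2, 8);                /* ldu rWORD8, 8(rSTR2) */
      memcpy (&w1, s1, 8);                  /* ld  rWORD7, 0(rSTR1) */
      w2 = (prev << shl) | (next >> shr);   /* sld/srd/or merge */
      if (w1 != w2)                         /* cmpld cr5, rWORD7, rWORD8 */
        return w1 > w2 ? 1 : -1;
      prev = next;
      s1 += 8;
      s2 += 8;
      a2 += 8;
      n -= 8;
    }
  /* Remaining 0-7 bytes; a plain byte loop stands in for the
     shift-based trim the assembly performs at L(dutrim).  */
  while (n-- > 0)
    {
      int d = *s1++ - *s2++;
      if (d != 0)
        return d;
    }
  return 0;
}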