author      joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>   2013-10-18 21:33:25 +0000
committer   joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>   2013-10-18 21:33:25 +0000
commit      fe2ed5aaa408e1ab996a9fe1595a05634208a79c (patch)
tree        e1027fbc9d8a4a8c33f8149b2b42e8cde89c74f6 /libc/sysdeps/powerpc/powerpc64/power7
parent      571c782b982d888565e7d06bfc2f3d47582fe829 (diff)
download    eglibc2-fe2ed5aaa408e1ab996a9fe1595a05634208a79c.tar.gz
Merge changes between r23946 and r24305 from /fsf/trunk.
git-svn-id: svn://svn.eglibc.org/trunk@24306 7b3dc134-2b1b-0410-93df-9e9f96275f8d
Diffstat (limited to 'libc/sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S      6
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S       5
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/memchr.S          188
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/memcmp.S         1613
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/memcpy.S          704
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/mempcpy.S          26
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/memrchr.S         192
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/memset.S            6
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/rawmemchr.S        17
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/strchr.S           43
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/strchrnul.S        19
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/strlen.S           17
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/strncmp.S          61
-rw-r--r--  libc/sysdeps/powerpc/powerpc64/power7/strnlen.S         111
14 files changed, 1725 insertions, 1283 deletions
diff --git a/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S index d0071c765..ebec0e0ba 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S @@ -39,10 +39,8 @@ EALIGN (__finite, 4, 0) stfd fp1,-16(r1) /* Transfer FP to GPR's. */ ori 2,2,0 /* Force a new dispatch group. */ - - lhz r4,-16(r1) /* Fetch the upper portion of the high word of - the FP value (where the exponent and sign bits - are). */ + lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value + (biased exponent and sign bit). */ clrlwi r4,r4,17 /* r4 = abs(r4). */ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ bltlr cr7 /* LT means finite, other non-finite. */ diff --git a/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S index 1aea12383..8d088db5a 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S @@ -38,9 +38,8 @@ EALIGN (__isinf, 4, 0) stfd fp1,-16(r1) /* Transfer FP to GPR's. */ ori 2,2,0 /* Force a new dispatch group. */ - lhz r4,-16(r1) /* Fetch the upper portion of the high word of - the FP value (where the exponent and sign bits - are). */ + lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value + (biased exponent and sign bit). */ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */ li r3,1 beqlr cr7 /* EQ means INF, otherwise -INF. */ diff --git a/libc/sysdeps/powerpc/powerpc64/power7/memchr.S b/libc/sysdeps/powerpc/powerpc64/power7/memchr.S index 3416897f5..5076dd0c1 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/memchr.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/memchr.S @@ -25,109 +25,112 @@ ENTRY (__memchr) CALL_MCOUNT 2 dcbt 0,r3 clrrdi r8,r3,3 - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 + insrdi r4,r4,8,48 add r7,r3,r5 /* Calculate the last acceptable address. */ + insrdi r4,r4,16,32 cmpldi r5,32 + li r9, -1 + rlwinm r6,r3,3,26,28 /* Calculate padding. */ insrdi r4,r4,32,0 + addi r7,r7,-1 +#ifdef __LITTLE_ENDIAN__ + sld r9,r9,r6 +#else + srd r9,r9,r6 +#endif ble L(small_range) - cmpld cr7,r3,r7 /* Compare the starting address (r3) with the - ending address (r7). If (r3 >= r7), - the size passed in was zero or negative. */ - ble cr7,L(proceed) - - li r7,-1 /* Artificially set our ending address (r7) - such that we will exit early. */ - -L(proceed): - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - cmpldi cr6,r6,0 /* cr6 == Do we have padding? */ ld r12,0(r8) /* Load doubleword from memory. */ - cmpb r10,r12,r4 /* Check for BYTEs in DWORD1. */ - beq cr6,L(proceed_no_padding) - sld r10,r10,r6 - srd r10,r10,r6 -L(proceed_no_padding): - cmpldi cr7,r10,0 /* Does r10 indicate we got a hit? */ + cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */ + and r3,r3,r9 + clrldi r5,r7,61 /* Byte count - 1 in last dword. */ + clrrdi r7,r7,3 /* Address of last doubleword. */ + cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? */ bne cr7,L(done) - /* See if we are at the last acceptable address yet. */ - addi r9,r8,8 - cmpld cr6,r9,r7 - bge cr6,L(null) - mtcrf 0x01,r8 /* Are we now aligned to a quadword boundary? If so, skip to the main loop. Otherwise, go through the alignment code. */ - bt 28,L(loop_setup) /* Handle DWORD2 of pair. */ ldu r12,8(r8) - cmpb r10,r12,r4 - cmpldi cr7,r10,0 + cmpb r3,r12,r4 + cmpldi cr7,r3,0 bne cr7,L(done) - /* Are we done already? 
*/ - addi r9,r8,8 - cmpld cr6,r9,r7 - bge cr6,L(null) - L(loop_setup): - sub r5,r7,r9 - srdi r6,r5,4 /* Number of loop iterations. */ + /* The last dword we want to read in the loop below is the one + containing the last byte of the string, ie. the dword at + (s + size - 1) & ~7, or r7. The first dword read is at + r8 + 8, we read 2 * cnt dwords, so the last dword read will + be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives + cnt = (r7 - r8) / 16 */ + sub r6,r7,r8 + srdi r6,r6,4 /* Number of loop iterations. */ mtctr r6 /* Setup the counter. */ - b L(loop) - /* Main loop to look for BYTE backwards in the string. Since - it's a small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 + + /* Main loop to look for BYTE in the string. Since + it's a small loop (8 instructions), align it to 32-bytes. */ + .align 5 L(loop): /* Load two doublewords, compare and merge in a single register for speed. This is an attempt to speed up the byte-checking process for bigger strings. */ ld r12,8(r8) ldu r11,16(r8) - cmpb r10,r12,r4 + cmpb r3,r12,r4 cmpb r9,r11,r4 - or r5,r9,r10 /* Merge everything in one doubleword. */ - cmpldi cr7,r5,0 + or r6,r9,r3 /* Merge everything in one doubleword. */ + cmpldi cr7,r6,0 bne cr7,L(found) bdnz L(loop) - /* We're here because the counter reached 0, and that means we - didn't have any matches for BYTE in the whole range. */ - subi r11,r7,8 - cmpld cr6,r8,r11 - blt cr6,L(loop_small) - b L(null) + /* We may have one more dword to read. */ + cmpld r8,r7 + beqlr + ldu r12,8(r8) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + bne cr6,L(done) + blr + + .align 4 +L(found): /* OK, one (or both) of the doublewords contains BYTE. Check the first doubleword and decrement the address in case the first doubleword really contains BYTE. */ - .align 4 -L(found): - cmpldi cr6,r10,0 + cmpldi cr6,r3,0 addi r8,r8,-8 bne cr6,L(done) /* BYTE must be in the second doubleword. Adjust the address - again and move the result of cmpb to r10 so we can calculate the + again and move the result of cmpb to r3 so we can calculate the pointer. */ - mr r10,r9 + mr r3,r9 addi r8,r8,8 - /* r10 has the output of the cmpb instruction, that is, it contains + /* r3 has the output of the cmpb instruction, that is, it contains 0xff in the same position as BYTE in the original doubleword from the string. Use that to calculate the pointer. We need to make sure BYTE is *before* the end of the range. */ L(done): - cntlzd r0,r10 /* Count leading zeroes before the match. */ - srdi r0,r0,3 /* Convert leading zeroes to bytes. */ +#ifdef __LITTLE_ENDIAN__ + addi r0,r3,-1 + andc r0,r0,r3 + popcntd r0,r0 /* Count trailing zeros. */ +#else + cntlzd r0,r3 /* Count leading zeros before the match. */ +#endif + cmpld r8,r7 /* Are we on the last dword? */ + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ add r3,r8,r0 - cmpld r3,r7 - bge L(null) + cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */ + bnelr + blelr cr7 + li r3,0 blr .align 4 @@ -139,67 +142,44 @@ L(null): .align 4 L(small_range): cmpldi r5,0 - rlwinm r6,r3,3,26,28 /* Calculate padding. */ - beq L(null) /* This branch is for the cmpldi r5,0 above. */ + beq L(null) ld r12,0(r8) /* Load word from memory. */ - cmpldi cr6,r6,0 /* cr6 == Do we have padding? */ - cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */ - /* If no padding, skip the shifts. */ - beq cr6,L(small_no_padding) - sld r10,r10,r6 - srd r10,r10,r6 -L(small_no_padding): - cmpldi cr7,r10,0 + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. 
*/ + and r3,r3,r9 + cmpldi cr7,r3,0 + clrldi r5,r7,61 /* Byte count - 1 in last dword. */ + clrrdi r7,r7,3 /* Address of last doubleword. */ + cmpld r8,r7 /* Are we done already? */ bne cr7,L(done) + beqlr - /* Are we done already? */ - addi r9,r8,8 - cmpld r9,r7 - bge L(null) - /* If we're not done, drop through into loop_small. */ - -L(loop_small): /* loop_small has been unrolled. */ ldu r12,8(r8) - cmpb r10,r12,r4 - addi r9,r8,8 - cmpldi cr6,r10,0 - cmpld r9,r7 + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 bne cr6,L(done) /* Found something. */ - bge L(null) /* Hit end of string (length). */ + beqlr /* Hit end of string (length). */ ldu r12,8(r8) - cmpb r10,r12,r4 - addi r9,r8,8 - cmpldi cr6,r10,0 - cmpld r9,r7 - bne cr6,L(done) /* Found something. */ - bge L(null) + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 + bne cr6,L(done) + beqlr ldu r12,8(r8) - subi r11,r7,8 - cmpb r10,r12,r4 - cmpldi cr6,r10,0 - ori r2,r2,0 /* Force a dispatch group. */ + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + cmpld r8,r7 bne cr6,L(done) + beqlr - cmpld r8,r11 /* At end of range? */ - bge L(null) - - /* For most cases we will never get here. Under some combinations of - padding + length there is a leftover double that still needs to be - checked. */ ldu r12,8(r8) - cmpb r10,r12,r4 - addi r9,r8,8 - cmpldi cr6,r10,0 - cmpld r9,r7 - bne cr6,L(done) /* Found something. */ - - /* Save a branch and exit directly. */ - li r3,0 + cmpb r3,r12,r4 + cmpldi cr6,r3,0 + bne cr6,L(done) blr - END (__memchr) weak_alias (__memchr, memchr) libc_hidden_builtin_def (memchr) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/memcmp.S b/libc/sysdeps/powerpc/powerpc64/power7/memcmp.S index f190c6461..6851cdc75 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/memcmp.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/memcmp.S @@ -23,10 +23,9 @@ size_t size [r5]) */ .machine power7 -EALIGN (memcmp,4,0) +EALIGN (memcmp, 4, 0) CALL_MCOUNT 3 -#define rTMP r0 #define rRTN r3 #define rSTR1 r3 /* first string arg */ #define rSTR2 r4 /* second string arg */ @@ -37,354 +36,557 @@ EALIGN (memcmp,4,0) #define rWORD4 r9 /* next word in s2 */ #define rWORD5 r10 /* next word in s1 */ #define rWORD6 r11 /* next word in s2 */ -#define rBITDIF r12 /* bits that differ in s1 & s2 words */ #define rWORD7 r30 /* next word in s1 */ #define rWORD8 r31 /* next word in s2 */ - xor rTMP,rSTR2,rSTR1 - cmpldi cr6,rN,0 - cmpldi cr1,rN,12 - clrldi. rTMP,rTMP,61 - clrldi rBITDIF,rSTR1,61 - cmpldi cr5,rBITDIF,0 - beq- cr6,L(zeroLength) - dcbt 0,rSTR1 - dcbt 0,rSTR2 + xor r0, rSTR2, rSTR1 + cmpldi cr6, rN, 0 + cmpldi cr1, rN, 12 + clrldi. r0, r0, 61 + clrldi r12, rSTR1, 61 + cmpldi cr5, r12, 0 + beq- cr6, L(zeroLength) + dcbt 0, rSTR1 + dcbt 0, rSTR2 /* If less than 8 bytes or not aligned, use the unaligned byte loop. */ - blt cr1,L(bytealigned) - std rWORD8,-8(r1) - cfi_offset(rWORD8,-8) - std rWORD7,-16(r1) - cfi_offset(rWORD7,-16) + blt cr1, L(bytealigned) + std rWORD8, -8(r1) + cfi_offset(rWORD8, -8) + std rWORD7, -16(r1) + cfi_offset(rWORD7, -16) bne L(unaligned) /* At this point we know both strings have the same alignment and the - compare length is at least 8 bytes. rBITDIF contains the low order + compare length is at least 8 bytes. r12 contains the low order 3 bits of rSTR1 and cr5 contains the result of the logical compare - of rBITDIF to 0. If rBITDIF == 0 then we are already double word - aligned and can perform the DWaligned loop. + of r12 to 0. If r12 == 0 then we are already double word + aligned and can perform the DW aligned loop. 
Otherwise we know the two strings have the same alignment (but not - yet DW). So we can force the string addresses to the next lower DW - boundary and special case this first DW word using shift left to + yet DW). So we force the string addresses to the next lower DW + boundary and special case this first DW using shift left to eliminate bits preceding the first byte. Since we want to join the - normal (DWaligned) compare loop, starting at the second double word, + normal (DW aligned) compare loop, starting at the second double word, we need to adjust the length (rN) and special case the loop - versioning for the first DW. This insures that the loop count is - correct and the first DW (shifted) is in the expected resister pair. */ + versioning for the first DW. This ensures that the loop count is + correct and the first DW (shifted) is in the expected register pair. */ .align 4 L(samealignment): - clrrdi rSTR1,rSTR1,3 - clrrdi rSTR2,rSTR2,3 - beq cr5,L(DWaligned) - add rN,rN,rBITDIF - sldi r11,rBITDIF,3 - srdi rTMP,rN,5 /* Divide by 32 */ - andi. rBITDIF,rN,24 /* Get the DW remainder */ - ld rWORD1,0(rSTR1) - ld rWORD2,0(rSTR2) - cmpldi cr1,rBITDIF,16 - cmpldi cr7,rN,32 - clrldi rN,rN,61 + clrrdi rSTR1, rSTR1, 3 + clrrdi rSTR2, rSTR2, 3 + beq cr5, L(DWaligned) + add rN, rN, r12 + sldi rWORD6, r12, 3 + srdi r0, rN, 5 /* Divide by 32 */ + andi. r12, rN, 24 /* Get the DW remainder */ +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 0(rSTR1) + ld rWORD2, 0(rSTR2) +#endif + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 beq L(dPs4) - mtctr rTMP - bgt cr1,L(dPs3) - beq cr1,L(dPs2) + mtctr r0 + bgt cr1, L(dPs3) + beq cr1, L(dPs2) /* Remainder is 8 */ .align 3 L(dsP1): - sld rWORD5,rWORD1,r11 - sld rWORD6,rWORD2,r11 - cmpld cr5,rWORD5,rWORD6 - blt cr7,L(dP1x) + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) /* Do something useful in this cycle since we have to branch anyway. */ - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - cmpld cr0,rWORD1,rWORD2 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 b L(dP1e) /* Remainder is 16 */ .align 4 L(dPs2): - sld rWORD5,rWORD1,r11 - sld rWORD6,rWORD2,r11 - cmpld cr6,rWORD5,rWORD6 - blt cr7,L(dP2x) + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD2, rWORD6 + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) /* Do something useful in this cycle since we have to branch anyway. */ - ld rWORD7,8(rSTR1) - ld rWORD8,8(rSTR2) - cmpld cr5,rWORD7,rWORD8 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD7, 8(rSTR1) + ld rWORD8, 8(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 b L(dP2e) /* Remainder is 24 */ .align 4 L(dPs3): - sld rWORD3,rWORD1,r11 - sld rWORD4,rWORD2,r11 - cmpld cr1,rWORD3,rWORD4 + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD2, rWORD6 + cmpld cr1, rWORD3, rWORD4 b L(dP3e) /* Count is a multiple of 32, remainder is 0 */ .align 4 L(dPs4): - mtctr rTMP - sld rWORD1,rWORD1,r11 - sld rWORD2,rWORD2,r11 - cmpld cr0,rWORD1,rWORD2 + mtctr r0 + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD2, rWORD6 + cmpld cr7, rWORD1, rWORD2 b L(dP4e) /* At this point we know both strings are double word aligned and the compare length is at least 8 bytes. 
*/ .align 4 L(DWaligned): - andi. rBITDIF,rN,24 /* Get the DW remainder */ - srdi rTMP,rN,5 /* Divide by 32 */ - cmpldi cr1,rBITDIF,16 - cmpldi cr7,rN,32 - clrldi rN,rN,61 + andi. r12, rN, 24 /* Get the DW remainder */ + srdi r0, rN, 5 /* Divide by 32 */ + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 beq L(dP4) - bgt cr1,L(dP3) - beq cr1,L(dP2) + bgt cr1, L(dP3) + beq cr1, L(dP2) /* Remainder is 8 */ .align 4 L(dP1): - mtctr rTMP + mtctr r0 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early (8-15 byte compare), we want to use only volatile registers. This means we can avoid restoring non-volatile registers since we did not change any on the early exit path. The key here is the non-early exit path only cares about the condition code (cr5), not about which register pair was used. */ - ld rWORD5,0(rSTR1) - ld rWORD6,0(rSTR2) - cmpld cr5,rWORD5,rWORD6 - blt cr7,L(dP1x) - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - cmpld cr0,rWORD1,rWORD2 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 0(rSTR1) + ld rWORD6, 0(rSTR2) +#endif + cmpld cr5, rWORD5, rWORD6 + blt cr7, L(dP1x) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 L(dP1e): - ld rWORD3,16(rSTR1) - ld rWORD4,16(rSTR2) - cmpld cr1,rWORD3,rWORD4 - ld rWORD5,24(rSTR1) - ld rWORD6,24(rSTR2) - cmpld cr6,rWORD5,rWORD6 - bne cr5,L(dLcr5) - bne cr0,L(dLcr0) - - ldu rWORD7,32(rSTR1) - ldu rWORD8,32(rSTR2) - bne cr1,L(dLcr1) - cmpld cr5,rWORD7,rWORD8 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 16(rSTR1) + ld rWORD4, 16(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 24(rSTR1) + ld rWORD6, 24(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5x) + bne cr7, L(dLcr7x) + +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ldu rWORD7, 32(rSTR1) + ldu rWORD8, 32(rSTR2) +#endif + bne cr1, L(dLcr1) + cmpld cr5, rWORD7, rWORD8 bdnz L(dLoop) - bne cr6,L(dLcr6) - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) + bne cr6, L(dLcr6) + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) .align 3 L(dP1x): - sldi. r12,rN,3 - bne cr5,L(dLcr5) - subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + sldi. r12, rN, 3 + bne cr5, L(dLcr5x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ bne L(d00) - li rRTN,0 + li rRTN, 0 blr /* Remainder is 16 */ .align 4 L(dP2): - mtctr rTMP - ld rWORD5,0(rSTR1) - ld rWORD6,0(rSTR2) - cmpld cr6,rWORD5,rWORD6 - blt cr7,L(dP2x) - ld rWORD7,8(rSTR1) - ld rWORD8,8(rSTR2) - cmpld cr5,rWORD7,rWORD8 + mtctr r0 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 0(rSTR1) + ld rWORD6, 0(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP2x) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD7, 8(rSTR1) + ld rWORD8, 8(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 L(dP2e): - ld rWORD1,16(rSTR1) - ld rWORD2,16(rSTR2) - cmpld cr0,rWORD1,rWORD2 - ld rWORD3,24(rSTR1) - ld rWORD4,24(rSTR2) - cmpld cr1,rWORD3,rWORD4 - addi rSTR1,rSTR1,8 - addi rSTR2,rSTR2,8 - bne cr6,L(dLcr6) - bne cr5,L(dLcr5) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 16(rSTR1) + ld rWORD2, 16(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 24(rSTR1) + ld rWORD4, 24(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#endif + bne cr6, L(dLcr6) + bne cr5, L(dLcr5) b L(dLoop2) /* Again we are on a early exit path (16-23 byte compare), we want to only use volatile registers and avoid restoring non-volatile registers. */ .align 4 L(dP2x): - ld rWORD3,8(rSTR1) - ld rWORD4,8(rSTR2) - cmpld cr5,rWORD3,rWORD4 - sldi. r12,rN,3 - bne cr6,L(dLcr6) - addi rSTR1,rSTR1,8 - addi rSTR2,rSTR2,8 - bne cr5,L(dLcr5) - subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 8(rSTR1) + ld rWORD4, 8(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + sldi. r12, rN, 3 + bne cr6, L(dLcr6x) +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#endif + bne cr1, L(dLcr1x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ bne L(d00) - li rRTN,0 + li rRTN, 0 blr /* Remainder is 24 */ .align 4 L(dP3): - mtctr rTMP - ld rWORD3,0(rSTR1) - ld rWORD4,0(rSTR2) - cmpld cr1,rWORD3,rWORD4 + mtctr r0 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 0(rSTR1) + ld rWORD4, 0(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 L(dP3e): - ld rWORD5,8(rSTR1) - ld rWORD6,8(rSTR2) - cmpld cr6,rWORD5,rWORD6 - blt cr7,L(dP3x) - ld rWORD7,16(rSTR1) - ld rWORD8,16(rSTR2) - cmpld cr5,rWORD7,rWORD8 - ld rWORD1,24(rSTR1) - ld rWORD2,24(rSTR2) - cmpld cr0,rWORD1,rWORD2 - addi rSTR1,rSTR1,16 - addi rSTR2,rSTR2,16 - bne cr1,L(dLcr1) - bne cr6,L(dLcr6) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 8(rSTR1) + ld rWORD6, 8(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + blt cr7, L(dP3x) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD7, 16(rSTR1) + ld rWORD8, 16(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 24(rSTR1) + ld rWORD2, 24(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#endif + bne cr1, L(dLcr1) + bne cr6, L(dLcr6) b L(dLoop1) /* Again we are on a early exit path (24-31 byte compare), we want to only use volatile registers and avoid restoring non-volatile registers. */ .align 4 L(dP3x): - ld rWORD1,16(rSTR1) - ld rWORD2,16(rSTR2) - cmpld cr5,rWORD1,rWORD2 - sldi. r12,rN,3 - bne cr1,L(dLcr1) - addi rSTR1,rSTR1,16 - addi rSTR2,rSTR2,16 - bne cr6,L(dLcr6) - subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ - bne cr5,L(dLcr5) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 16(rSTR1) + ld rWORD2, 16(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 + sldi. r12, rN, 3 + bne cr1, L(dLcr1x) +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#endif + bne cr6, L(dLcr6x) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ + bne cr7, L(dLcr7x) bne L(d00) - li rRTN,0 + li rRTN, 0 blr /* Count is a multiple of 32, remainder is 0 */ .align 4 L(dP4): - mtctr rTMP - ld rWORD1,0(rSTR1) - ld rWORD2,0(rSTR2) - cmpld cr0,rWORD1,rWORD2 + mtctr r0 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 0(rSTR1) + ld rWORD2, 0(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 L(dP4e): - ld rWORD3,8(rSTR1) - ld rWORD4,8(rSTR2) - cmpld cr1,rWORD3,rWORD4 - ld rWORD5,16(rSTR1) - ld rWORD6,16(rSTR2) - cmpld cr6,rWORD5,rWORD6 - ldu rWORD7,24(rSTR1) - ldu rWORD8,24(rSTR2) - cmpld cr5,rWORD7,rWORD8 - bne cr0,L(dLcr0) - bne cr1,L(dLcr1) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 8(rSTR1) + ld rWORD4, 8(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 16(rSTR1) + ld rWORD6, 16(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ldu rWORD7, 24(rSTR1) + ldu rWORD8, 24(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) + bne cr1, L(dLcr1) bdz- L(d24) /* Adjust CTR as we start with +4 */ /* This is the primary loop */ .align 4 L(dLoop): - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - cmpld cr1,rWORD3,rWORD4 - bne cr6,L(dLcr6) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) L(dLoop1): - ld rWORD3,16(rSTR1) - ld rWORD4,16(rSTR2) - cmpld cr6,rWORD5,rWORD6 - bne cr5,L(dLcr5) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 16(rSTR1) + ld rWORD4, 16(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) L(dLoop2): - ld rWORD5,24(rSTR1) - ld rWORD6,24(rSTR2) - cmpld cr5,rWORD7,rWORD8 - bne cr0,L(dLcr0) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 24(rSTR1) + ld rWORD6, 24(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr7, L(dLcr7) L(dLoop3): - ldu rWORD7,32(rSTR1) - ldu rWORD8,32(rSTR2) - bne cr1,L(dLcr1) - cmpld cr0,rWORD1,rWORD2 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ldu rWORD7, 32(rSTR1) + ldu rWORD8, 32(rSTR2) +#endif + bne cr1, L(dLcr1) + cmpld cr7, rWORD1, rWORD2 bdnz L(dLoop) L(dL4): - cmpld cr1,rWORD3,rWORD4 - bne cr6,L(dLcr6) - cmpld cr6,rWORD5,rWORD6 - bne cr5,L(dLcr5) - cmpld cr5,rWORD7,rWORD8 + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(dLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(dLcr5) + cmpld cr5, rWORD7, rWORD8 L(d44): - bne cr0,L(dLcr0) + bne cr7, L(dLcr7) L(d34): - bne cr1,L(dLcr1) + bne cr1, L(dLcr1) L(d24): - bne cr6,L(dLcr6) + bne cr6, L(dLcr6) L(d14): - sldi. r12,rN,3 - bne cr5,L(dLcr5) + sldi. r12, rN, 3 + bne cr5, L(dLcr5) L(d04): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */ + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) + subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ beq L(zeroLength) /* At this point we have a remainder of 1 to 7 bytes to compare. 
Since we are aligned it is safe to load the whole double word, and use shift right double to eliminate bits beyond the compare length. */ L(d00): - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - srd rWORD1,rWORD1,rN - srd rWORD2,rWORD2,rN - cmpld cr5,rWORD1,rWORD2 - bne cr5,L(dLcr5x) - li rRTN,0 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + cmpld cr7, rWORD1, rWORD2 + bne cr7, L(dLcr7x) + li rRTN, 0 blr + .align 4 -L(dLcr0): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 - bgtlr cr0 - li rRTN,-1 +L(dLcr7): + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) +L(dLcr7x): + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 blr .align 4 L(dLcr1): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) +L(dLcr1x): + li rRTN, 1 bgtlr cr1 - li rRTN,-1 + li rRTN, -1 blr .align 4 L(dLcr6): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) +L(dLcr6x): + li rRTN, 1 bgtlr cr6 - li rRTN,-1 + li rRTN, -1 blr .align 4 L(dLcr5): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) L(dLcr5x): - li rRTN,1 + li rRTN, 1 bgtlr cr5 - li rRTN,-1 + li rRTN, -1 blr .align 4 L(bytealigned): mtctr rN - beq cr6,L(zeroLength) +#if 0 +/* Huh? We've already branched on cr6! */ + beq cr6, L(zeroLength) +#endif /* We need to prime this loop. This loop is swing modulo scheduled to avoid pipe delays. The dependent instruction latencies (load to @@ -396,38 +598,38 @@ L(bytealigned): So we must precondition some registers and condition codes so that we don't exit the loop early on the first iteration. */ - lbz rWORD1,0(rSTR1) - lbz rWORD2,0(rSTR2) + lbz rWORD1, 0(rSTR1) + lbz rWORD2, 0(rSTR2) bdz L(b11) - cmpld cr0,rWORD1,rWORD2 - lbz rWORD3,1(rSTR1) - lbz rWORD4,1(rSTR2) + cmpld cr7, rWORD1, rWORD2 + lbz rWORD3, 1(rSTR1) + lbz rWORD4, 1(rSTR2) bdz L(b12) - cmpld cr1,rWORD3,rWORD4 - lbzu rWORD5,2(rSTR1) - lbzu rWORD6,2(rSTR2) + cmpld cr1, rWORD3, rWORD4 + lbzu rWORD5, 2(rSTR1) + lbzu rWORD6, 2(rSTR2) bdz L(b13) .align 4 L(bLoop): - lbzu rWORD1,1(rSTR1) - lbzu rWORD2,1(rSTR2) - bne cr0,L(bLcr0) + lbzu rWORD1, 1(rSTR1) + lbzu rWORD2, 1(rSTR2) + bne cr7, L(bLcr7) - cmpld cr6,rWORD5,rWORD6 + cmpld cr6, rWORD5, rWORD6 bdz L(b3i) - lbzu rWORD3,1(rSTR1) - lbzu rWORD4,1(rSTR2) - bne cr1,L(bLcr1) + lbzu rWORD3, 1(rSTR1) + lbzu rWORD4, 1(rSTR2) + bne cr1, L(bLcr1) - cmpld cr0,rWORD1,rWORD2 + cmpld cr7, rWORD1, rWORD2 bdz L(b2i) - lbzu rWORD5,1(rSTR1) - lbzu rWORD6,1(rSTR2) - bne cr6,L(bLcr6) + lbzu rWORD5, 1(rSTR1) + lbzu rWORD6, 1(rSTR2) + bne cr6, L(bLcr6) - cmpld cr1,rWORD3,rWORD4 + cmpld cr1, rWORD3, rWORD4 bdnz L(bLoop) /* We speculatively loading bytes before we have tested the previous @@ -437,542 +639,727 @@ L(bLoop): tested. In this case we must complete the pending operations before returning. 
*/ L(b1i): - bne cr0,L(bLcr0) - bne cr1,L(bLcr1) + bne cr7, L(bLcr7) + bne cr1, L(bLcr1) b L(bx56) .align 4 L(b2i): - bne cr6,L(bLcr6) - bne cr0,L(bLcr0) + bne cr6, L(bLcr6) + bne cr7, L(bLcr7) b L(bx34) .align 4 L(b3i): - bne cr1,L(bLcr1) - bne cr6,L(bLcr6) + bne cr1, L(bLcr1) + bne cr6, L(bLcr6) b L(bx12) .align 4 -L(bLcr0): - li rRTN,1 - bgtlr cr0 - li rRTN,-1 +L(bLcr7): + li rRTN, 1 + bgtlr cr7 + li rRTN, -1 blr L(bLcr1): - li rRTN,1 + li rRTN, 1 bgtlr cr1 - li rRTN,-1 + li rRTN, -1 blr L(bLcr6): - li rRTN,1 + li rRTN, 1 bgtlr cr6 - li rRTN,-1 + li rRTN, -1 blr L(b13): - bne cr0,L(bx12) - bne cr1,L(bx34) + bne cr7, L(bx12) + bne cr1, L(bx34) L(bx56): - sub rRTN,rWORD5,rWORD6 + sub rRTN, rWORD5, rWORD6 blr nop L(b12): - bne cr0,L(bx12) + bne cr7, L(bx12) L(bx34): - sub rRTN,rWORD3,rWORD4 + sub rRTN, rWORD3, rWORD4 blr L(b11): L(bx12): - sub rRTN,rWORD1,rWORD2 + sub rRTN, rWORD1, rWORD2 blr .align 4 -L(zeroLengthReturn): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) L(zeroLength): - li rRTN,0 + li rRTN, 0 blr .align 4 /* At this point we know the strings have different alignment and the - compare length is at least 8 bytes. rBITDIF contains the low order + compare length is at least 8 bytes. r12 contains the low order 3 bits of rSTR1 and cr5 contains the result of the logical compare - of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word + of r12 to 0. If r12 == 0 then rStr1 is double word aligned and can perform the DWunaligned loop. Otherwise we know that rSTR1 is not already DW aligned yet. So we can force the string addresses to the next lower DW - boundary and special case this first DW word using shift left to + boundary and special case this first DW using shift left to eliminate bits preceding the first byte. Since we want to join the normal (DWaligned) compare loop, starting at the second double word, we need to adjust the length (rN) and special case the loop - versioning for the first DW. This insures that the loop count is + versioning for the first DW. This ensures that the loop count is correct and the first DW (shifted) is in the expected resister pair. */ -#define rSHL r29 /* Unaligned shift left count. */ -#define rSHR r28 /* Unaligned shift right count. */ -#define rB r27 /* Left rotation temp for rWORD2. */ -#define rD r26 /* Left rotation temp for rWORD4. */ -#define rF r25 /* Left rotation temp for rWORD6. */ -#define rH r24 /* Left rotation temp for rWORD8. */ -#define rA r0 /* Right rotation temp for rWORD2. */ -#define rC r12 /* Right rotation temp for rWORD4. */ -#define rE r0 /* Right rotation temp for rWORD6. */ -#define rG r12 /* Right rotation temp for rWORD8. */ +#define rSHL r29 /* Unaligned shift left count. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ +#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ +#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ +#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. 
*/ L(unaligned): - std r29,-24(r1) - cfi_offset(r29,-24) - clrldi rSHL,rSTR2,61 - beq cr6,L(duzeroLength) - std r28,-32(r1) - cfi_offset(r28,-32) - beq cr5,L(DWunaligned) - std r27,-40(r1) - cfi_offset(r27,-40) -/* Adjust the logical start of rSTR2 ro compensate for the extra bits + std rSHL, -24(r1) + cfi_offset(rSHL, -24) + clrldi rSHL, rSTR2, 61 + beq cr6, L(duzeroLength) + std rSHR, -32(r1) + cfi_offset(rSHR, -32) + beq cr5, L(DWunaligned) + std rWORD8_SHIFT, -40(r1) + cfi_offset(rWORD8_SHIFT, -40) +/* Adjust the logical start of rSTR2 to compensate for the extra bits in the 1st rSTR1 DW. */ - sub r27,rSTR2,rBITDIF + sub rWORD8_SHIFT, rSTR2, r12 /* But do not attempt to address the DW before that DW that contains the actual start of rSTR2. */ - clrrdi rSTR2,rSTR2,3 - std r26,-48(r1) - cfi_offset(r26,-48) + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, -48(r1) + cfi_offset(rWORD2_SHIFT, -48) /* Compute the left/right shift counts for the unaligned rSTR2, compensating for the logical (DW aligned) start of rSTR1. */ - clrldi rSHL,r27,61 - clrrdi rSTR1,rSTR1,3 - std r25,-56(r1) - cfi_offset(r25,-56) - sldi rSHL,rSHL,3 - cmpld cr5,r27,rSTR2 - add rN,rN,rBITDIF - sldi r11,rBITDIF,3 - std r24,-64(r1) - cfi_offset(r24,-64) - subfic rSHR,rSHL,64 - srdi rTMP,rN,5 /* Divide by 32 */ - andi. rBITDIF,rN,24 /* Get the DW remainder */ + clrldi rSHL, rWORD8_SHIFT, 61 + clrrdi rSTR1, rSTR1, 3 + std rWORD4_SHIFT, -56(r1) + cfi_offset(rWORD4_SHIFT, -56) + sldi rSHL, rSHL, 3 + cmpld cr5, rWORD8_SHIFT, rSTR2 + add rN, rN, r12 + sldi rWORD6, r12, 3 + std rWORD6_SHIFT, -64(r1) + cfi_offset(rWORD6_SHIFT, -64) + subfic rSHR, rSHL, 64 + srdi r0, rN, 5 /* Divide by 32 */ + andi. r12, rN, 24 /* Get the DW remainder */ /* We normally need to load 2 DWs to start the unaligned rSTR2, but in this special case those bits may be discarded anyway. Also we must avoid loading a DW where none of the bits are part of rSTR2 as this may cross a page boundary and cause a page fault. */ - li rWORD8,0 - blt cr5,L(dus0) - ld rWORD8,0(rSTR2) - la rSTR2,8(rSTR2) - sld rWORD8,rWORD8,rSHL + li rWORD8, 0 + blt cr5, L(dus0) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD8, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD8, 0(rSTR2) + addi rSTR2, rSTR2, 8 +#endif + sld rWORD8, rWORD8, rSHL L(dus0): - ld rWORD1,0(rSTR1) - ld rWORD2,0(rSTR2) - cmpldi cr1,rBITDIF,16 - cmpldi cr7,rN,32 - srd rG,rWORD2,rSHR - clrldi rN,rN,61 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 0(rSTR1) + ld rWORD2, 0(rSTR2) +#endif + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + srd r12, rWORD2, rSHR + clrldi rN, rN, 61 beq L(duPs4) - mtctr rTMP - or rWORD8,rG,rWORD8 - bgt cr1,L(duPs3) - beq cr1,L(duPs2) + mtctr r0 + or rWORD8, r12, rWORD8 + bgt cr1, L(duPs3) + beq cr1, L(duPs2) /* Remainder is 8 */ .align 4 L(dusP1): - sld rB,rWORD2,rSHL - sld rWORD7,rWORD1,r11 - sld rWORD8,rWORD8,r11 - bge cr7,L(duP1e) + sld rWORD8_SHIFT, rWORD2, rSHL + sld rWORD7, rWORD1, rWORD6 + sld rWORD8, rWORD8, rWORD6 + bge cr7, L(duP1e) /* At this point we exit early with the first double word compare complete and remainder of 0 to 7 bytes. See L(du14) for details on how we handle the remaining bytes. */ - cmpld cr5,rWORD7,rWORD8 - sldi. rN,rN,3 - bne cr5,L(duLcr5) - cmpld cr7,rN,rSHR + cmpld cr5, rWORD7, rWORD8 + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR beq L(duZeroReturn) - li rA,0 - ble cr7,L(dutrim) - ld rWORD2,8(rSTR2) - srd rA,rWORD2,rSHR + li r0, 0 + ble cr7, L(dutrim) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD2, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD2, 8(rSTR2) +#endif + srd r0, rWORD2, rSHR b L(dutrim) /* Remainder is 16 */ .align 4 L(duPs2): - sld rH,rWORD2,rSHL - sld rWORD5,rWORD1,r11 - sld rWORD6,rWORD8,r11 + sld rWORD6_SHIFT, rWORD2, rSHL + sld rWORD5, rWORD1, rWORD6 + sld rWORD6, rWORD8, rWORD6 b L(duP2e) /* Remainder is 24 */ .align 4 L(duPs3): - sld rF,rWORD2,rSHL - sld rWORD3,rWORD1,r11 - sld rWORD4,rWORD8,r11 + sld rWORD4_SHIFT, rWORD2, rSHL + sld rWORD3, rWORD1, rWORD6 + sld rWORD4, rWORD8, rWORD6 b L(duP3e) /* Count is a multiple of 32, remainder is 0 */ .align 4 L(duPs4): - mtctr rTMP - or rWORD8,rG,rWORD8 - sld rD,rWORD2,rSHL - sld rWORD1,rWORD1,r11 - sld rWORD2,rWORD8,r11 + mtctr r0 + or rWORD8, r12, rWORD8 + sld rWORD2_SHIFT, rWORD2, rSHL + sld rWORD1, rWORD1, rWORD6 + sld rWORD2, rWORD8, rWORD6 b L(duP4e) /* At this point we know rSTR1 is double word aligned and the compare length is at least 8 bytes. */ .align 4 L(DWunaligned): - std r27,-40(r1) - cfi_offset(r27,-40) - clrrdi rSTR2,rSTR2,3 - std r26,-48(r1) - cfi_offset(r26,-48) - srdi rTMP,rN,5 /* Divide by 32 */ - std r25,-56(r1) - cfi_offset(r25,-56) - andi. rBITDIF,rN,24 /* Get the DW remainder */ - std r24,-64(r1) - cfi_offset(r24,-64) - sldi rSHL,rSHL,3 - ld rWORD6,0(rSTR2) - ldu rWORD8,8(rSTR2) - cmpldi cr1,rBITDIF,16 - cmpldi cr7,rN,32 - clrldi rN,rN,61 - subfic rSHR,rSHL,64 - sld rH,rWORD6,rSHL + std rWORD8_SHIFT, -40(r1) + cfi_offset(rWORD8_SHIFT, -40) + clrrdi rSTR2, rSTR2, 3 + std rWORD2_SHIFT, -48(r1) + cfi_offset(rWORD2_SHIFT, -48) + srdi r0, rN, 5 /* Divide by 32 */ + std rWORD4_SHIFT, -56(r1) + cfi_offset(rWORD4_SHIFT, -56) + andi. 
r12, rN, 24 /* Get the DW remainder */ + std rWORD6_SHIFT, -64(r1) + cfi_offset(rWORD6_SHIFT, -64) + sldi rSHL, rSHL, 3 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD6, 0, rSTR2 + addi rSTR2, rSTR2, 8 + ldbrx rWORD8, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD6, 0(rSTR2) + ldu rWORD8, 8(rSTR2) +#endif + cmpldi cr1, r12, 16 + cmpldi cr7, rN, 32 + clrldi rN, rN, 61 + subfic rSHR, rSHL, 64 + sld rWORD6_SHIFT, rWORD6, rSHL beq L(duP4) - mtctr rTMP - bgt cr1,L(duP3) - beq cr1,L(duP2) + mtctr r0 + bgt cr1, L(duP3) + beq cr1, L(duP2) /* Remainder is 8 */ .align 4 L(duP1): - srd rG,rWORD8,rSHR - ld rWORD7,0(rSTR1) - sld rB,rWORD8,rSHL - or rWORD8,rG,rH - blt cr7,L(duP1x) + srd r12, rWORD8, rSHR +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + addi rSTR1, rSTR1, 8 +#else + ld rWORD7, 0(rSTR1) +#endif + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP1x) L(duP1e): - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - cmpld cr5,rWORD7,rWORD8 - srd rA,rWORD2,rSHR - sld rD,rWORD2,rSHL - or rWORD2,rA,rB - ld rWORD3,16(rSTR1) - ld rWORD4,16(rSTR2) - cmpld cr0,rWORD1,rWORD2 - srd rC,rWORD4,rSHR - sld rF,rWORD4,rSHL - bne cr5,L(duLcr5) - or rWORD4,rC,rD - ld rWORD5,24(rSTR1) - ld rWORD6,24(rSTR2) - cmpld cr1,rWORD3,rWORD4 - srd rE,rWORD6,rSHR - sld rH,rWORD6,rSHL - bne cr0,L(duLcr0) - or rWORD6,rE,rF - cmpld cr6,rWORD5,rWORD6 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 16(rSTR1) + ld rWORD4, 16(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + bne cr5, L(duLcr5) + or rWORD4, r12, rWORD2_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 24(rSTR1) + ld rWORD6, 24(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + bne cr7, L(duLcr7) + or rWORD6, r0, rWORD4_SHIFT + cmpld cr6, rWORD5, rWORD6 b L(duLoop3) .align 4 /* At this point we exit early with the first double word compare complete and remainder of 0 to 7 bytes. See L(du14) for details on how we handle the remaining bytes. */ L(duP1x): - cmpld cr5,rWORD7,rWORD8 - sldi. rN,rN,3 - bne cr5,L(duLcr5) - cmpld cr7,rN,rSHR + cmpld cr5, rWORD7, rWORD8 + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR beq L(duZeroReturn) - li rA,0 - ble cr7,L(dutrim) - ld rWORD2,8(rSTR2) - srd rA,rWORD2,rSHR + li r0, 0 + ble cr7, L(dutrim) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD2, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD2, 8(rSTR2) +#endif + srd r0, rWORD2, rSHR b L(dutrim) /* Remainder is 16 */ .align 4 L(duP2): - srd rE,rWORD8,rSHR - ld rWORD5,0(rSTR1) - or rWORD6,rE,rH - sld rH,rWORD8,rSHL + srd r0, rWORD8, rSHR +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + addi rSTR1, rSTR1, 8 +#else + ld rWORD5, 0(rSTR1) +#endif + or rWORD6, r0, rWORD6_SHIFT + sld rWORD6_SHIFT, rWORD8, rSHL L(duP2e): - ld rWORD7,8(rSTR1) - ld rWORD8,8(rSTR2) - cmpld cr6,rWORD5,rWORD6 - srd rG,rWORD8,rSHR - sld rB,rWORD8,rSHL - or rWORD8,rG,rH - blt cr7,L(duP2x) - ld rWORD1,16(rSTR1) - ld rWORD2,16(rSTR2) - cmpld cr5,rWORD7,rWORD8 - bne cr6,L(duLcr6) - srd rA,rWORD2,rSHR - sld rD,rWORD2,rSHL - or rWORD2,rA,rB - ld rWORD3,24(rSTR1) - ld rWORD4,24(rSTR2) - cmpld cr0,rWORD1,rWORD2 - bne cr5,L(duLcr5) - srd rC,rWORD4,rSHR - sld rF,rWORD4,rSHL - or rWORD4,rC,rD - addi rSTR1,rSTR1,8 - addi rSTR2,rSTR2,8 - cmpld cr1,rWORD3,rWORD4 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD7, 8(rSTR1) + ld rWORD8, 8(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP2x) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 16(rSTR1) + ld rWORD2, 16(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 24(rSTR1) + ld rWORD4, 24(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#endif + cmpld cr1, rWORD3, rWORD4 b L(duLoop2) .align 4 L(duP2x): - cmpld cr5,rWORD7,rWORD8 - addi rSTR1,rSTR1,8 - addi rSTR2,rSTR2,8 - bne cr6,L(duLcr6) - sldi. rN,rN,3 - bne cr5,L(duLcr5) - cmpld cr7,rN,rSHR + cmpld cr5, rWORD7, rWORD8 +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#endif + bne cr6, L(duLcr6) + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR beq L(duZeroReturn) - li rA,0 - ble cr7,L(dutrim) - ld rWORD2,8(rSTR2) - srd rA,rWORD2,rSHR + li r0, 0 + ble cr7, L(dutrim) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD2, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD2, 8(rSTR2) +#endif + srd r0, rWORD2, rSHR b L(dutrim) /* Remainder is 24 */ .align 4 L(duP3): - srd rC,rWORD8,rSHR - ld rWORD3,0(rSTR1) - sld rF,rWORD8,rSHL - or rWORD4,rC,rH + srd r12, rWORD8, rSHR +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + addi rSTR1, rSTR1, 8 +#else + ld rWORD3, 0(rSTR1) +#endif + sld rWORD4_SHIFT, rWORD8, rSHL + or rWORD4, r12, rWORD6_SHIFT L(duP3e): - ld rWORD5,8(rSTR1) - ld rWORD6,8(rSTR2) - cmpld cr1,rWORD3,rWORD4 - srd rE,rWORD6,rSHR - sld rH,rWORD6,rSHL - or rWORD6,rE,rF - ld rWORD7,16(rSTR1) - ld rWORD8,16(rSTR2) - cmpld cr6,rWORD5,rWORD6 - bne cr1,L(duLcr1) - srd rG,rWORD8,rSHR - sld rB,rWORD8,rSHL - or rWORD8,rG,rH - blt cr7,L(duP3x) - ld rWORD1,24(rSTR1) - ld rWORD2,24(rSTR2) - cmpld cr5,rWORD7,rWORD8 - bne cr6,L(duLcr6) - srd rA,rWORD2,rSHR - sld rD,rWORD2,rSHL - or rWORD2,rA,rB - addi rSTR1,rSTR1,16 - addi rSTR2,rSTR2,16 - cmpld cr0,rWORD1,rWORD2 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 8(rSTR1) + ld rWORD6, 8(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD7, 16(rSTR1) + ld rWORD8, 16(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + blt cr7, L(duP3x) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 24(rSTR1) + ld rWORD2, 24(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#endif + cmpld cr7, rWORD1, rWORD2 b L(duLoop1) .align 4 L(duP3x): - addi rSTR1,rSTR1,16 - addi rSTR2,rSTR2,16 - bne cr1,L(duLcr1) - cmpld cr5,rWORD7,rWORD8 - bne cr6,L(duLcr6) - sldi. rN,rN,3 - bne cr5,L(duLcr5) - cmpld cr7,rN,rSHR +#ifndef __LITTLE_ENDIAN__ + addi rSTR1, rSTR1, 16 + addi rSTR2, rSTR2, 16 +#endif +#if 0 +/* Huh? We've already branched on cr1! */ + bne cr1, L(duLcr1) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr6, L(duLcr6) + sldi. 
rN, rN, 3 + bne cr5, L(duLcr5) + cmpld cr7, rN, rSHR beq L(duZeroReturn) - li rA,0 - ble cr7,L(dutrim) - ld rWORD2,8(rSTR2) - srd rA,rWORD2,rSHR + li r0, 0 + ble cr7, L(dutrim) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD2, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD2, 8(rSTR2) +#endif + srd r0, rWORD2, rSHR b L(dutrim) /* Count is a multiple of 32, remainder is 0 */ .align 4 L(duP4): - mtctr rTMP - srd rA,rWORD8,rSHR - ld rWORD1,0(rSTR1) - sld rD,rWORD8,rSHL - or rWORD2,rA,rH + mtctr r0 + srd r0, rWORD8, rSHR +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + addi rSTR1, rSTR1, 8 +#else + ld rWORD1, 0(rSTR1) +#endif + sld rWORD2_SHIFT, rWORD8, rSHL + or rWORD2, r0, rWORD6_SHIFT L(duP4e): - ld rWORD3,8(rSTR1) - ld rWORD4,8(rSTR2) - cmpld cr0,rWORD1,rWORD2 - srd rC,rWORD4,rSHR - sld rF,rWORD4,rSHL - or rWORD4,rC,rD - ld rWORD5,16(rSTR1) - ld rWORD6,16(rSTR2) - cmpld cr1,rWORD3,rWORD4 - bne cr0,L(duLcr0) - srd rE,rWORD6,rSHR - sld rH,rWORD6,rSHL - or rWORD6,rE,rF - ldu rWORD7,24(rSTR1) - ldu rWORD8,24(rSTR2) - cmpld cr6,rWORD5,rWORD6 - bne cr1,L(duLcr1) - srd rG,rWORD8,rSHR - sld rB,rWORD8,rSHL - or rWORD8,rG,rH - cmpld cr5,rWORD7,rWORD8 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 8(rSTR1) + ld rWORD4, 8(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 16(rSTR1) + ld rWORD6, 16(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + bne cr7, L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ldu rWORD7, 24(rSTR1) + ldu rWORD8, 24(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT + cmpld cr5, rWORD7, rWORD8 bdz L(du24) /* Adjust CTR as we start with +4 */ /* This is the primary loop */ .align 4 L(duLoop): - ld rWORD1,8(rSTR1) - ld rWORD2,8(rSTR2) - cmpld cr1,rWORD3,rWORD4 - bne cr6,L(duLcr6) - srd rA,rWORD2,rSHR - sld rD,rWORD2,rSHL - or rWORD2,rA,rB +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 + ldbrx rWORD2, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD1, 8(rSTR1) + ld rWORD2, 8(rSTR2) +#endif + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + srd r0, rWORD2, rSHR + sld rWORD2_SHIFT, rWORD2, rSHL + or rWORD2, r0, rWORD8_SHIFT L(duLoop1): - ld rWORD3,16(rSTR1) - ld rWORD4,16(rSTR2) - cmpld cr6,rWORD5,rWORD6 - bne cr5,L(duLcr5) - srd rC,rWORD4,rSHR - sld rF,rWORD4,rSHL - or rWORD4,rC,rD +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD3, 0, rSTR1 + ldbrx rWORD4, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD3, 16(rSTR1) + ld rWORD4, 16(rSTR2) +#endif + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + srd r12, rWORD4, rSHR + sld rWORD4_SHIFT, rWORD4, rSHL + or rWORD4, r12, rWORD2_SHIFT L(duLoop2): - ld rWORD5,24(rSTR1) - ld rWORD6,24(rSTR2) - cmpld cr5,rWORD7,rWORD8 - bne cr0,L(duLcr0) - srd rE,rWORD6,rSHR - sld rH,rWORD6,rSHL - or rWORD6,rE,rF +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD5, 0, rSTR1 + ldbrx rWORD6, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ld rWORD5, 24(rSTR1) + ld rWORD6, 24(rSTR2) +#endif + cmpld cr5, rWORD7, rWORD8 + bne cr7, 
L(duLcr7) + srd r0, rWORD6, rSHR + sld rWORD6_SHIFT, rWORD6, rSHL + or rWORD6, r0, rWORD4_SHIFT L(duLoop3): - ldu rWORD7,32(rSTR1) - ldu rWORD8,32(rSTR2) - cmpld cr0,rWORD1,rWORD2 - bne- cr1,L(duLcr1) - srd rG,rWORD8,rSHR - sld rB,rWORD8,rSHL - or rWORD8,rG,rH +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD7, 0, rSTR1 + ldbrx rWORD8, 0, rSTR2 + addi rSTR1, rSTR1, 8 + addi rSTR2, rSTR2, 8 +#else + ldu rWORD7, 32(rSTR1) + ldu rWORD8, 32(rSTR2) +#endif + cmpld cr7, rWORD1, rWORD2 + bne cr1, L(duLcr1) + srd r12, rWORD8, rSHR + sld rWORD8_SHIFT, rWORD8, rSHL + or rWORD8, r12, rWORD6_SHIFT bdnz L(duLoop) L(duL4): - bne cr1,L(duLcr1) - cmpld cr1,rWORD3,rWORD4 - bne cr6,L(duLcr6) - cmpld cr6,rWORD5,rWORD6 - bne cr5,L(duLcr5) - cmpld cr5,rWORD7,rWORD8 +#if 0 +/* Huh? We've already branched on cr1! */ + bne cr1, L(duLcr1) +#endif + cmpld cr1, rWORD3, rWORD4 + bne cr6, L(duLcr6) + cmpld cr6, rWORD5, rWORD6 + bne cr5, L(duLcr5) + cmpld cr5, rWORD7, rWORD8 L(du44): - bne cr0,L(duLcr0) + bne cr7, L(duLcr7) L(du34): - bne cr1,L(duLcr1) + bne cr1, L(duLcr1) L(du24): - bne cr6,L(duLcr6) + bne cr6, L(duLcr6) L(du14): - sldi. rN,rN,3 - bne cr5,L(duLcr5) + sldi. rN, rN, 3 + bne cr5, L(duLcr5) /* At this point we have a remainder of 1 to 7 bytes to compare. We use shift right double to eliminate bits beyond the compare length. - This allows the use of double word subtract to compute the final - result. However it may not be safe to load rWORD2 which may be beyond the string length. So we compare the bit length of the remainder to the right shift count (rSHR). If the bit count is less than or equal we do not need to load rWORD2 (all significant bits are already in - rB). */ - cmpld cr7,rN,rSHR + rWORD8_SHIFT). */ + cmpld cr7, rN, rSHR beq L(duZeroReturn) - li rA,0 - ble cr7,L(dutrim) - ld rWORD2,8(rSTR2) - srd rA,rWORD2,rSHR + li r0, 0 + ble cr7, L(dutrim) +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD2, 0, rSTR2 + addi rSTR2, rSTR2, 8 +#else + ld rWORD2, 8(rSTR2) +#endif + srd r0, rWORD2, rSHR .align 4 L(dutrim): - ld rWORD1,8(rSTR1) - ld rWORD8,-8(r1) - subfic rN,rN,64 /* Shift count is 64 - (rN * 8). */ - or rWORD2,rA,rB - ld rWORD7,-16(r1) - ld r29,-24(r1) - srd rWORD1,rWORD1,rN - srd rWORD2,rWORD2,rN - ld r28,-32(r1) - ld r27,-40(r1) - li rRTN,0 - cmpld cr0,rWORD1,rWORD2 - ld r26,-48(r1) - ld r25,-56(r1) - beq cr0,L(dureturn24) - li rRTN,1 - ld r24,-64(r1) - bgtlr cr0 - li rRTN,-1 +#ifdef __LITTLE_ENDIAN__ + ldbrx rWORD1, 0, rSTR1 +#else + ld rWORD1, 8(rSTR1) +#endif + ld rWORD8, -8(r1) + subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). 
*/ + or rWORD2, r0, rWORD8_SHIFT + ld rWORD7, -16(r1) + ld rSHL, -24(r1) + srd rWORD1, rWORD1, rN + srd rWORD2, rWORD2, rN + ld rSHR, -32(r1) + ld rWORD8_SHIFT, -40(r1) + li rRTN, 0 + cmpld cr7, rWORD1, rWORD2 + ld rWORD2_SHIFT, -48(r1) + ld rWORD4_SHIFT, -56(r1) + beq cr7, L(dureturn24) + li rRTN, 1 + ld rWORD6_SHIFT, -64(r1) + bgtlr cr7 + li rRTN, -1 blr .align 4 -L(duLcr0): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 - bgt cr0,L(dureturn29) - ld r29,-24(r1) - ld r28,-32(r1) - li rRTN,-1 +L(duLcr7): + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) + li rRTN, 1 + bgt cr7, L(dureturn29) + ld rSHL, -24(r1) + ld rSHR, -32(r1) + li rRTN, -1 b L(dureturn27) .align 4 L(duLcr1): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 - bgt cr1,L(dureturn29) - ld r29,-24(r1) - ld r28,-32(r1) - li rRTN,-1 + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) + li rRTN, 1 + bgt cr1, L(dureturn29) + ld rSHL, -24(r1) + ld rSHR, -32(r1) + li rRTN, -1 b L(dureturn27) .align 4 L(duLcr6): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 - bgt cr6,L(dureturn29) - ld r29,-24(r1) - ld r28,-32(r1) - li rRTN,-1 + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) + li rRTN, 1 + bgt cr6, L(dureturn29) + ld rSHL, -24(r1) + ld rSHR, -32(r1) + li rRTN, -1 b L(dureturn27) .align 4 L(duLcr5): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) - li rRTN,1 - bgt cr5,L(dureturn29) - ld r29,-24(r1) - ld r28,-32(r1) - li rRTN,-1 + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) + li rRTN, 1 + bgt cr5, L(dureturn29) + ld rSHL, -24(r1) + ld rSHR, -32(r1) + li rRTN, -1 b L(dureturn27) .align 3 L(duZeroReturn): - li rRTN,0 + li rRTN, 0 .align 4 L(dureturn): - ld rWORD8,-8(r1) - ld rWORD7,-16(r1) + ld rWORD8, -8(r1) + ld rWORD7, -16(r1) L(dureturn29): - ld r29,-24(r1) - ld r28,-32(r1) + ld rSHL, -24(r1) + ld rSHR, -32(r1) L(dureturn27): - ld r27,-40(r1) + ld rWORD8_SHIFT, -40(r1) L(dureturn26): - ld r26,-48(r1) + ld rWORD2_SHIFT, -48(r1) L(dureturn25): - ld r25,-56(r1) + ld rWORD4_SHIFT, -56(r1) L(dureturn24): - ld r24,-64(r1) + ld rWORD6_SHIFT, -64(r1) blr L(duzeroLength): - li rRTN,0 + li rRTN, 0 blr END (memcmp) libc_hidden_builtin_def (memcmp) -weak_alias (memcmp,bcmp) +weak_alias (memcmp, bcmp) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/memcpy.S b/libc/sysdeps/powerpc/powerpc64/power7/memcpy.S index 800a9f1bb..e8df75f59 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/memcpy.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/memcpy.S @@ -23,418 +23,361 @@ /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); Returns 'dst'. */ +#define dst 11 /* Use r11 so r3 kept unchanged. */ +#define src 4 +#define cnt 5 + .machine power7 EALIGN (memcpy, 5, 0) CALL_MCOUNT 3 - cmpldi cr1,5,31 + cmpldi cr1,cnt,31 neg 0,3 - std 3,-16(1) - std 31,-8(1) - cfi_offset(31,-8) ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move code. */ - andi. 11,3,7 /* Check alignment of DST. */ - - - clrldi 10,4,61 /* Check alignment of SRC. */ - cmpld cr6,10,11 /* SRC and DST alignments match? */ - mr 12,4 - mr 31,5 +#ifdef __LITTLE_ENDIAN__ +/* In little-endian mode, power7 takes an alignment trap on any lxvd2x + or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy + loop is only used for quadword aligned copies. */ + andi. 10,3,15 + clrldi 11,4,60 +#else + andi. 10,3,7 /* Check alignment of DST. */ + clrldi 11,4,61 /* Check alignment of SRC. */ +#endif + cmpld cr6,10,11 /* SRC and DST alignments match? */ + + mr dst,3 bne cr6,L(copy_GE_32_unaligned) + beq L(aligned_copy) - srdi 9,5,3 /* Number of full quadwords remaining. 
*/ - - beq L(copy_GE_32_aligned_cont) - - clrldi 0,0,61 - mtcrf 0x01,0 - subf 31,0,5 - - /* Get the SRC aligned to 8 bytes. */ - -1: bf 31,2f - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: bf 30,4f - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: bf 29,0f - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -0: - clrldi 10,12,61 /* Check alignment of SRC again. */ - srdi 9,31,3 /* Number of full doublewords remaining. */ - -L(copy_GE_32_aligned_cont): - - clrldi 11,31,61 - mtcrf 0x01,9 - - srdi 8,31,5 - cmpldi cr1,9,4 - cmpldi cr6,11,0 - mr 11,12 - - /* Copy 1~3 doublewords so the main loop starts - at a multiple of 32 bytes. */ + mtocrf 0x01,0 +#ifdef __LITTLE_ENDIAN__ + clrldi 0,0,60 +#else + clrldi 0,0,61 +#endif - bf 30,1f - ld 6,0(12) - ld 7,8(12) - addi 11,12,16 - mtctr 8 - std 6,0(3) - std 7,8(3) - addi 10,3,16 - bf 31,4f - ld 0,16(12) - std 0,16(3) - blt cr1,3f - addi 11,12,24 - addi 10,3,24 - b 4f - - .align 4 -1: /* Copy 1 doubleword and set the counter. */ - mr 10,3 - mtctr 8 - bf 31,4f - ld 6,0(12) - addi 11,12,8 - std 6,0(3) - addi 10,3,8 - -L(aligned_copy): - /* Main aligned copy loop. Copies up to 128-bytes at a time. */ - .align 4 +/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */ +1: + bf 31,2f + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: + bf 30,4f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 4: - /* check for any 32-byte or 64-byte lumps that are outside of a - nice 128-byte range. R8 contains the number of 32-byte - lumps, so drop this into the CR, and use the SO/EQ bits to help - handle the 32- or 64- byte lumps. Then handle the rest with an - unrolled 128-bytes-at-a-time copy loop. */ - mtocrf 1,8 - li 6,16 # 16() index - li 7,32 # 32() index - li 8,48 # 48() index - -L(aligned_32byte): - /* if the SO bit (indicating a 32-byte lump) is not set, move along. */ - bns cr7,L(aligned_64byte) - lxvd2x 6,0,11 - lxvd2x 7,11,6 - addi 11,11,32 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - addi 10,10,32 - -L(aligned_64byte): - /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */ - bne cr7,L(aligned_128setup) - lxvd2x 6,0,11 - lxvd2x 7,11,6 - lxvd2x 8,11,7 - lxvd2x 9,11,8 - addi 11,11,64 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - addi 10,10,64 - -L(aligned_128setup): - /* Set up for the 128-byte at a time copy loop. */ - srdi 8,31,7 - cmpdi 8,0 # Any 4x lumps left? - beq 3f # if not, move along. - lxvd2x 6,0,11 - lxvd2x 7,11,6 - mtctr 8 # otherwise, load the ctr and begin. - li 8,48 # 48() index + bf 29,8f + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: +#ifdef __LITTLE_ENDIAN__ + bf 28,16f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +16: +#endif + subf cnt,0,cnt + +/* Main aligned copy loop. Copies 128 bytes at a time. */ +L(aligned_copy): + li 6,16 + li 7,32 + li 8,48 + mtocrf 0x02,cnt + srdi 12,cnt,7 + cmpdi 12,0 + beq L(aligned_tail) + lxvd2x 6,0,src + lxvd2x 7,src,6 + mtctr 12 b L(aligned_128loop) + .align 4 L(aligned_128head): /* for the 2nd + iteration of this loop. 
*/ - lxvd2x 6,0,11 - lxvd2x 7,11,6 + lxvd2x 6,0,src + lxvd2x 7,src,6 L(aligned_128loop): - lxvd2x 8,11,7 - lxvd2x 9,11,8 - stxvd2x 6,0,10 - addi 11,11,64 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - lxvd2x 6,0,11 - lxvd2x 7,11,6 - addi 10,10,64 - lxvd2x 8,11,7 - lxvd2x 9,11,8 - addi 11,11,64 - stxvd2x 6,0,10 - stxvd2x 7,10,6 - stxvd2x 8,10,7 - stxvd2x 9,10,8 - addi 10,10,64 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + stxvd2x 6,0,dst + addi src,src,64 + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi dst,dst,64 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 bdnz L(aligned_128head) -3: - /* Check for tail bytes. */ - rldicr 0,31,0,60 - mtcrf 0x01,31 - beq cr6,0f - -.L9: - add 3,3,0 - add 12,12,0 - - /* At this point we have a tail of 0-7 bytes and we know that the - destination is doubleword-aligned. */ -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2 bytes. */ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. */ - ld 31,-8(1) - ld 3,-16(1) +L(aligned_tail): + mtocrf 0x01,cnt + bf 25,32f + lxvd2x 6,0,src + lxvd2x 7,src,6 + lxvd2x 8,src,7 + lxvd2x 9,src,8 + addi src,src,64 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + stxvd2x 8,dst,7 + stxvd2x 9,dst,8 + addi dst,dst,64 +32: + bf 26,16f + lxvd2x 6,0,src + lxvd2x 7,src,6 + addi src,src,32 + stxvd2x 6,0,dst + stxvd2x 7,dst,6 + addi dst,dst,32 +16: + bf 27,8f + lxvd2x 6,0,src + addi src,src,16 + stxvd2x 6,0,dst + addi dst,dst,16 +8: + bf 28,4f + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 +4: /* Copies 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ blr - /* Handle copies of 0~31 bytes. */ - .align 4 + +/* Handle copies of 0~31 bytes. */ + .align 4 L(copy_LT_32): - cmpldi cr6,5,8 - mr 12,4 - mtcrf 0x01,5 + mr dst,3 + cmpldi cr6,cnt,8 + mtocrf 0x01,cnt ble cr6,L(copy_LE_8) /* At least 9 bytes to go. */ neg 8,4 - clrrdi 11,4,2 - andi. 0,8,3 - cmpldi cr1,5,16 - mr 10,5 + andi. 0,8,3 + cmpldi cr1,cnt,16 beq L(copy_LT_32_aligned) - /* Force 4-bytes alignment for SRC. */ - mtocrf 0x01,0 - subf 10,0,5 -2: bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: bf 31,L(end_4bytes_alignment) - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 - - .align 4 + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,0 + subf cnt,0,cnt +2: + bf 30,1f + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +1: + bf 31,L(end_4bytes_alignment) + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 + + .align 4 L(end_4bytes_alignment): - cmpldi cr1,10,16 - mtcrf 0x01,10 + cmpldi cr1,cnt,16 + mtocrf 0x01,cnt L(copy_LT_32_aligned): /* At least 6 bytes to go, and SRC is word-aligned. */ blt cr1,8f /* Copy 16 bytes. */ - lwz 6,0(12) - lwz 7,4(12) - stw 6,0(3) - lwz 8,8(12) - stw 7,4(3) - lwz 6,12(12) - addi 12,12,16 - stw 8,8(3) - stw 6,12(3) - addi 3,3,16 + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + lwz 8,8(src) + stw 7,4(dst) + lwz 6,12(src) + addi src,src,16 + stw 8,8(dst) + stw 6,12(dst) + addi dst,dst,16 8: /* Copy 8 bytes. 
*/ - bf 28,4f + bf 28,L(tail4) + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. */ + blr - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2-3 bytes. */ + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): bf 30,1f - - lhz 6,0(12) - sth 6,0(3) - bf 31,0f - lbz 7,2(12) - stb 7,2(3) - ld 3,-16(1) + lhz 6,0(src) + sth 6,0(dst) + bflr 31 + lbz 7,2(src) + stb 7,2(dst) blr - .align 4 -1: /* Copy 1 byte. */ - bf 31,0f + .align 4 +L(tail5): + bflr 31 + lbz 6,4(src) + stb 6,4(dst) + blr - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. */ - ld 3,-16(1) + .align 4 +1: + bflr 31 + lbz 6,0(src) + stb 6,0(dst) + /* Return original DST pointer. */ blr - /* Handles copies of 0~8 bytes. */ - .align 4 + +/* Handles copies of 0~8 bytes. */ + .align 4 L(copy_LE_8): - bne cr6,4f + bne cr6,L(tail4) /* Though we could've used ld/std here, they are still slow for unaligned cases. */ - lwz 6,0(4) - lwz 7,4(4) - stw 6,0(3) - stw 7,4(3) - ld 3,-16(1) /* Return original DST pointers. */ + lwz 6,0(src) + lwz 7,4(src) + stw 6,0(dst) + stw 7,4(dst) blr - .align 4 -4: /* Copies 4~7 bytes. */ - bf 29,2b - - lwz 6,0(4) - stw 6,0(3) - bf 30,5f - lhz 7,4(4) - sth 7,4(3) - bf 31,0f - lbz 8,6(4) - stb 8,6(3) - ld 3,-16(1) - blr - - .align 4 -5: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,4(4) - stb 6,4(3) - -0: /* Return original DST pointer. */ - ld 3,-16(1) - blr - /* Handle copies of 32+ bytes where DST is aligned (to quadword) but - SRC is not. Use aligned quadword loads from SRC, shifted to realign - the data, allowing for aligned DST stores. */ - .align 4 +/* Handle copies of 32+ bytes where DST is aligned (to quadword) but + SRC is not. Use aligned quadword loads from SRC, shifted to realign + the data, allowing for aligned DST stores. */ + .align 4 L(copy_GE_32_unaligned): - clrldi 0,0,60 /* Number of bytes until the 1st - quadword. */ - andi. 11,3,15 /* Check alignment of DST (against - quadwords). */ - srdi 9,5,4 /* Number of full quadwords remaining. */ + clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */ +#ifndef __LITTLE_ENDIAN__ + andi. 10,3,15 /* Check alignment of DST (against quadwords). */ +#endif + srdi 9,cnt,4 /* Number of full quadwords remaining. */ beq L(copy_GE_32_unaligned_cont) - /* SRC is not quadword aligned, get it aligned. */ + /* DST is not quadword aligned, get it aligned. */ - mtcrf 0x01,0 - subf 31,0,5 + mtocrf 0x01,0 + subf cnt,0,cnt /* Vector instructions work best when proper alignment (16-bytes) is present. Move 0~15 bytes as needed to get DST quadword-aligned. */ -1: /* Copy 1 byte. */ +1: bf 31,2f - - lbz 6,0(12) - addi 12,12,1 - stb 6,0(3) - addi 3,3,1 -2: /* Copy 2 bytes. */ + lbz 6,0(src) + addi src,src,1 + stb 6,0(dst) + addi dst,dst,1 +2: bf 30,4f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -4: /* Copy 4 bytes. */ + lhz 6,0(src) + addi src,src,2 + sth 6,0(dst) + addi dst,dst,2 +4: bf 29,8f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -8: /* Copy 8 bytes. 
*/ + lwz 6,0(src) + addi src,src,4 + stw 6,0(dst) + addi dst,dst,4 +8: bf 28,0f - - ld 6,0(12) - addi 12,12,8 - std 6,0(3) - addi 3,3,8 + ld 6,0(src) + addi src,src,8 + std 6,0(dst) + addi dst,dst,8 0: - clrldi 10,12,60 /* Check alignment of SRC. */ - srdi 9,31,4 /* Number of full quadwords remaining. */ + srdi 9,cnt,4 /* Number of full quadwords remaining. */ /* The proper alignment is present, it is OK to copy the bytes now. */ L(copy_GE_32_unaligned_cont): /* Setup two indexes to speed up the indexed vector operations. */ - clrldi 11,31,60 - li 6,16 /* Index for 16-bytes offsets. */ + clrldi 10,cnt,60 + li 6,16 /* Index for 16-bytes offsets. */ li 7,32 /* Index for 32-bytes offsets. */ - cmpldi cr1,11,0 - srdi 8,31,5 /* Setup the loop counter. */ - mr 10,3 - mr 11,12 - mtcrf 0x01,9 - cmpldi cr6,9,1 - lvsl 5,0,12 - lvx 3,0,12 - bf 31,L(setup_unaligned_loop) - - /* Copy another 16 bytes to align to 32-bytes due to the loop . */ - lvx 4,12,6 - vperm 6,3,4,5 - addi 11,12,16 - addi 10,3,16 - stvx 6,0,3 + cmpldi cr1,10,0 + srdi 8,cnt,5 /* Setup the loop counter. */ + mtocrf 0x01,9 + cmpldi cr6,9,1 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,src +#else + lvsl 5,0,src +#endif + lvx 3,0,src + li 0,0 + bf 31,L(setup_unaligned_loop) + + /* Copy another 16 bytes to align to 32-bytes due to the loop. */ + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + addi src,src,16 + stvx 6,0,dst + addi dst,dst,16 vor 3,4,4 + clrrdi 0,src,60 L(setup_unaligned_loop): - mtctr 8 - ble cr6,L(end_unaligned_loop) + mtctr 8 + ble cr6,L(end_unaligned_loop) /* Copy 32 bytes at a time using vector instructions. */ - .align 4 + .align 4 L(unaligned_loop): /* Note: vr6/vr10 may contain data that was already copied, @@ -442,62 +385,55 @@ L(unaligned_loop): some portions again. This is faster than having unaligned vector instructions though. */ - lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ - lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ - addi 11,11,32 - stvx 6,0,10 - stvx 10,10,6 - addi 10,10,32 - + lvx 4,src,6 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif + lvx 3,src,7 +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif + addi src,src,32 + stvx 6,0,dst + stvx 10,dst,6 + addi dst,dst,32 bdnz L(unaligned_loop) - .align 4 + clrrdi 0,src,60 + + .align 4 L(end_unaligned_loop): /* Check for tail bytes. */ - rldicr 0,31,0,59 - mtcrf 0x01,31 - beq cr1,0f + mtocrf 0x01,cnt + beqlr cr1 - add 3,3,0 - add 12,12,0 + add src,src,0 /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */ -8: /* Copy 8 bytes. */ + /* Copy 8 bytes. */ bf 28,4f - - lwz 6,0(12) - lwz 7,4(12) - addi 12,12,8 - stw 6,0(3) - stw 7,4(3) - addi 3,3,8 -4: /* Copy 4 bytes. */ - bf 29,2f - - lwz 6,0(12) - addi 12,12,4 - stw 6,0(3) - addi 3,3,4 -2: /* Copy 2~3 bytes. */ - bf 30,1f - - lhz 6,0(12) - addi 12,12,2 - sth 6,0(3) - addi 3,3,2 -1: /* Copy 1 byte. */ - bf 31,0f - - lbz 6,0(12) - stb 6,0(3) -0: /* Return original DST pointer. */ - ld 31,-8(1) - ld 3,-16(1) + lwz 6,0(src) + lwz 7,4(src) + addi src,src,8 + stw 6,0(dst) + stw 7,4(dst) + addi dst,dst,8 +4: /* Copy 4~7 bytes. */ + bf 29,L(tail2) + lwz 6,0(src) + stw 6,0(dst) + bf 30,L(tail5) + lhz 7,4(src) + sth 7,4(dst) + bflr 31 + lbz 8,6(src) + stb 8,6(dst) + /* Return original DST pointer. 
*/ blr END_GEN_TB (memcpy,TB_TOCLESS) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/libc/sysdeps/powerpc/powerpc64/power7/mempcpy.S index f20be938d..b93ab7da5 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/mempcpy.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/mempcpy.S @@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont): mr 11,12 mtcrf 0x01,9 cmpldi cr6,9,1 - lvsl 5,0,12 +#ifdef __LITTLE_ENDIAN__ + lvsr 5,0,12 +#else + lvsl 5,0,12 +#endif lvx 3,0,12 bf 31,L(setup_unaligned_loop) /* Copy another 16 bytes to align to 32-bytes due to the loop . */ lvx 4,12,6 - vperm 6,3,4,5 +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif addi 11,12,16 addi 10,3,16 stvx 6,0,3 @@ -391,11 +399,17 @@ L(unaligned_loop): vector instructions though. */ lvx 4,11,6 /* vr4 = r11+16. */ - vperm 6,3,4,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr6. */ +#ifdef __LITTLE_ENDIAN__ + vperm 6,4,3,5 +#else + vperm 6,3,4,5 +#endif lvx 3,11,7 /* vr3 = r11+32. */ - vperm 10,4,3,5 /* Merge the correctly-aligned portions - of vr3/vr4 into vr10. */ +#ifdef __LITTLE_ENDIAN__ + vperm 10,3,4,5 +#else + vperm 10,4,3,5 +#endif addi 11,11,32 stvx 6,0,10 stvx 10,10,6 diff --git a/libc/sysdeps/powerpc/powerpc64/power7/memrchr.S b/libc/sysdeps/powerpc/powerpc64/power7/memrchr.S index c49995210..a9e86cb19 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/memrchr.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/memrchr.S @@ -23,118 +23,132 @@ .machine power7 ENTRY (__memrchr) CALL_MCOUNT - dcbt 0,r3 - mr r7,r3 - add r3,r7,r5 /* Calculate the last acceptable address. */ - cmpld cr7,r3,r7 /* Is the address equal or less than r3? */ + add r7,r3,r5 /* Calculate the last acceptable address. */ + neg r0,r7 + addi r7,r7,-1 + mr r10,r3 + clrrdi r6,r7,7 + li r9,3<<5 + dcbt r9,r6,16 /* Stream hint, decreasing addresses. */ /* Replicate BYTE to doubleword. */ - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 insrdi r4,r4,32,0 - bge cr7,L(proceed) - - li r3,-1 /* Make r11 the biggest if r4 <= 0. */ -L(proceed): li r6,-8 - addi r9,r3,-1 - clrrdi r8,r9,3 - addi r8,r8,8 - neg r0,r3 + li r9,-1 rlwinm r0,r0,3,26,28 /* Calculate padding. */ - + clrrdi r8,r7,3 + srd r9,r9,r0 cmpldi r5,32 + clrrdi r0,r10,3 ble L(small_range) - ldbrx r12,r8,r6 /* Load reversed doubleword from memory. */ - cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */ - sld r10,r10,r0 - srd r10,r10,r0 - cmpldi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */ +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ +#endif + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ + and r3,r3,r9 + cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */ bne cr7,L(done) - /* Are we done already? */ - addi r9,r8,-8 - cmpld cr6,r9,r7 - ble cr6,L(null) - mtcrf 0x01,r8 - /* Are we now aligned to a doubleword boundary? If so, skip to + /* Are we now aligned to a quadword boundary? If so, skip to the main loop. Otherwise, go through the alignment code. */ - mr r8,r9 - bt 28,L(loop_setup) + bf 28,L(loop_setup) /* Handle DWORD2 of pair. */ +#ifdef __LITTLE_ENDIAN__ + ldx r12,r8,r6 +#else ldbrx r12,r8,r6 - cmpb r10,r12,r4 - cmpldi cr7,r10,0 - bne cr7,L(done) - - /* Are we done already. */ +#endif addi r8,r8,-8 - cmpld cr6,r8,r7 - ble cr6,L(null) + cmpb r3,r12,r4 + cmpldi cr7,r3,0 + bne cr7,L(done) L(loop_setup): - li r0,-16 - sub r5,r8,r7 - srdi r9,r5,4 /* Number of loop iterations. 
*/ + /* The last dword we want to read in the loop below is the one + containing the first byte of the string, ie. the dword at + s & ~7, or r0. The first dword read is at r8 - 8, we + read 2 * cnt dwords, so the last dword read will be at + r8 - 8 - 16 * cnt + 8. Solving for cnt gives + cnt = (r8 - r0) / 16 */ + sub r5,r8,r0 + addi r8,r8,-8 + srdi r9,r5,4 /* Number of loop iterations. */ mtctr r9 /* Setup the counter. */ - b L(loop) - /* Main loop to look for BYTE backwards in the string. Since it's a - small loop (< 8 instructions), align it to 32-bytes. */ - .p2align 5 + + /* Main loop to look for BYTE backwards in the string. + FIXME: Investigate whether 32 byte align helps with this + 9 instruction loop. */ + .align 5 L(loop): /* Load two doublewords, compare and merge in a single register for speed. This is an attempt to speed up the byte-checking process for bigger strings. */ - ldbrx r12,r8,r6 - ldbrx r11,r8,r0 - addi r8,r8,-8 - cmpb r10,r12,r4 +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 + ldx r11,r8,r6 +#else + ldbrx r12,0,r8 + ldbrx r11,r8,r6 +#endif + cmpb r3,r12,r4 cmpb r9,r11,r4 - or r5,r9,r10 /* Merge everything in one doubleword. */ + or r5,r9,r3 /* Merge everything in one doubleword. */ cmpldi cr7,r5,0 bne cr7,L(found) - addi r8,r8,-8 + addi r8,r8,-16 bdnz L(loop) - /* We're here because the counter reached 0, and that means we - didn't have any matches for BYTE in the whole range. Just return - the original range. */ - addi r8,r8,8 - cmpld cr6,r8,r7 - bgt cr6,L(loop_small) - b L(null) - - /* OK, one (or both) of the words contains BYTE. Check - the first word and decrement the address in case the first - word really contains BYTE. */ + + /* We may have one more word to read. */ + cmpld r8,r0 + bnelr + +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 +#endif + cmpb r3,r12,r4 + cmpldi cr7,r3,0 + bne cr7,L(done) + blr + .align 4 L(found): - cmpldi cr6,r10,0 - addi r8,r8,8 + /* OK, one (or both) of the dwords contains BYTE. Check + the first dword. */ + cmpldi cr6,r3,0 bne cr6,L(done) /* BYTE must be in the second word. Adjust the address - again and move the result of cmpb to r10 so we can calculate the + again and move the result of cmpb to r3 so we can calculate the pointer. */ - mr r10,r9 + mr r3,r9 addi r8,r8,-8 - /* r10 has the output of the cmpb instruction, that is, it contains - 0xff in the same position as the BYTE in the original + /* r3 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as BYTE in the original word from the string. Use that to calculate the pointer. We need to make sure BYTE is *before* the end of the range. */ L(done): - cntlzd r0,r10 /* Count leading zeroes before the match. */ - srdi r6,r0,3 /* Convert leading zeroes to bytes. */ - addi r0,r6,1 + cntlzd r9,r3 /* Count leading zeros before the match. */ + cmpld r8,r0 /* Are we on the last word? */ + srdi r6,r9,3 /* Convert leading zeros to bytes. */ + addi r0,r6,-7 sub r3,r8,r0 - cmpld r3,r7 - blt L(null) + cmpld cr7,r3,r10 + bnelr + bgelr cr7 + li r3,0 blr .align 4 @@ -148,29 +162,35 @@ L(small_range): cmpldi r5,0 beq L(null) - ldbrx r12,r8,r6 /* Load reversed doubleword from memory. */ - cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */ - sld r10,r10,r0 - srd r10,r10,r0 - cmpldi cr7,r10,0 +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 /* Load reversed doubleword from memory. */ +#endif + cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */ + and r3,r3,r9 + cmpldi cr7,r3,0 bne cr7,L(done) /* Are we done already? 
*/ + cmpld r8,r0 addi r8,r8,-8 - cmpld r8,r7 - ble L(null) - b L(loop_small) + beqlr - .p2align 5 + .align 5 L(loop_small): - ldbrx r12,r8,r6 - cmpb r10,r12,r4 - cmpldi cr6,r10,0 - bne cr6,L(done) +#ifdef __LITTLE_ENDIAN__ + ldx r12,0,r8 +#else + ldbrx r12,0,r8 +#endif + cmpb r3,r12,r4 + cmpld r8,r0 + cmpldi cr7,r3,0 + bne cr7,L(done) addi r8,r8,-8 - cmpld r8,r7 - ble L(null) - b L(loop_small) + bne L(loop_small) + blr END (__memrchr) weak_alias (__memrchr, memrchr) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/memset.S b/libc/sysdeps/powerpc/powerpc64/power7/memset.S index b24cfa163..8b081e87c 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/memset.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/memset.S @@ -32,8 +32,8 @@ L(_memset): mr 10,3 /* Replicate byte to word. */ - rlwimi 4,4,8,16,23 - rlwimi 4,4,16,0,15 + insrdi 4,4,8,48 + insrdi 4,4,16,32 ble cr6,L(small) /* If length <= 8, use short copy code. */ neg 0,3 @@ -321,7 +321,7 @@ L(medium): clrldi 0,0,62 beq L(medium_aligned) - /* Force 4-bytes alignment for SRC. */ + /* Force 4-bytes alignment for DST. */ mtocrf 0x01,0 subf 5,0,5 1: /* Copy 1 byte. */ diff --git a/libc/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/libc/sysdeps/powerpc/powerpc64/power7/rawmemchr.S index 50a33d8fa..547aed771 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/rawmemchr.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/rawmemchr.S @@ -27,8 +27,8 @@ ENTRY (__rawmemchr) clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ /* Replicate byte to doubleword. */ - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 insrdi r4,r4,32,0 /* Now r4 has a doubleword of c bytes. */ @@ -36,8 +36,13 @@ ENTRY (__rawmemchr) rlwinm r6,r3,3,26,28 /* Calculate padding. */ ld r12,0(r8) /* Load doubleword from memory. */ cmpb r5,r12,r4 /* Compare each byte against c byte. */ +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else sld r5,r5,r6 /* Move left to discard ignored bits. */ srd r5,r5,r6 /* Bring the bits back as zeros. */ +#endif cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */ bne cr7,L(done) @@ -91,8 +96,14 @@ L(loop): doubleword from the string. Use that fact to find out what is the position of the byte inside the string. */ L(done): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 /* Count trailing zeros. */ +#else cntlzd r0,r5 /* Count leading zeros before the match. */ - srdi r0,r0,3 /* Convert leading zeroes to bytes. */ +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ add r3,r8,r0 /* Return address of the matching char. */ blr END (__rawmemchr) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/strchr.S b/libc/sysdeps/powerpc/powerpc64/power7/strchr.S index 3ffe7a188..4679a158f 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/strchr.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/strchr.S @@ -35,8 +35,8 @@ ENTRY (strchr) beq cr7,L(null_match) /* Replicate byte to doubleword. */ - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 insrdi r4,r4,32,0 /* Now r4 has a doubleword of c bytes and r0 has @@ -47,11 +47,17 @@ ENTRY (strchr) /* Move the doublewords left and right to discard the bits that are not part of the string and bring them back as zeros. */ - +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r11,r11,r6 + sld r10,r10,r6 + sld r11,r11,r6 +#else sld r10,r10,r6 sld r11,r11,r6 srd r10,r10,r6 srd r11,r11,r6 +#endif or r5,r10,r11 /* OR the results to speed things up. 
*/ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes have been found. */ @@ -108,15 +114,24 @@ L(loop): mr r11,r7 addi r8,r8,8 - /* r5 has the output of the cmpb instruction, that is, it contains + /* r10/r11 have the output of the cmpb instructions, that is, 0xff in the same position as the c/null byte in the original doubleword from the string. Use that to calculate the pointer. */ L(done): - cntlzd r4,r10 /* Count leading zeroes before c matches. */ - cntlzd r0,r11 /* Count leading zeroes before null matches. */ - cmpld cr7,r4,r0 +#ifdef __LITTLE_ENDIAN__ + addi r3,r10,-1 + andc r3,r3,r10 + popcntd r0,r3 + addi r4,r11,-1 + andc r4,r4,r11 + cmpld cr7,r3,r4 bgt cr7,L(no_match) - srdi r0,r4,3 /* Convert leading zeroes to bytes. */ +#else + cntlzd r0,r10 /* Count leading zeros before c matches. */ + cmpld cr7,r11,r10 + bgt cr7,L(no_match) +#endif + srdi r0,r0,3 /* Convert leading zeros to bytes. */ add r3,r8,r0 /* Return address of the matching c byte or null in case c was not found. */ blr @@ -135,9 +150,13 @@ L(null_match): /* Move the doublewords left and right to discard the bits that are not part of the string and bring them back as zeros. */ - +#ifdef __LITTLE_ENDIAN__ + srd r5,r5,r6 + sld r5,r5,r6 +#else sld r5,r5,r6 srd r5,r5,r6 +#endif cmpdi cr7,r5,0 /* If r10 == 0, no c or null bytes have been found. */ bne cr7,L(done_null) @@ -192,7 +211,13 @@ L(loop_null): 0xff in the same position as the null byte in the original doubleword from the string. Use that to calculate the pointer. */ L(done_null): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif srdi r0,r0,3 /* Convert leading zeros to bytes. */ add r3,r8,r0 /* Return address of the matching null byte. */ blr diff --git a/libc/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/libc/sysdeps/powerpc/powerpc64/power7/strchrnul.S index 9dbc51b0d..df457525e 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/strchrnul.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/strchrnul.S @@ -27,8 +27,8 @@ ENTRY (__strchrnul) clrrdi r8,r3,3 /* Align the address to doubleword boundary. */ /* Replicate byte to doubleword. */ - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 insrdi r4,r4,32,0 rlwinm r6,r3,3,26,28 /* Calculate padding. */ @@ -44,10 +44,17 @@ ENTRY (__strchrnul) /* Move the doublewords left and right to discard the bits that are not part of the string and to bring them back as zeros. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + srd r9,r9,r6 + sld r10,r10,r6 + sld r9,r9,r6 +#else sld r10,r10,r6 sld r9,r9,r6 srd r10,r10,r6 srd r9,r9,r6 +#endif or r5,r9,r10 /* OR the results to speed things up. */ cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes have been found. */ @@ -97,7 +104,7 @@ L(loop): bne cr6,L(done) /* The c/null byte must be in the second doubleword. Adjust the - address again and move the result of cmpb to r10 so we can calculate + address again and move the result of cmpb to r5 so we can calculate the pointer. */ mr r5,r10 addi r8,r8,8 @@ -106,7 +113,13 @@ L(loop): 0xff in the same position as the c/null byte in the original doubleword from the string. Use that to calculate the pointer. */ L(done): +#ifdef __LITTLE_ENDIAN__ + addi r0,r5,-1 + andc r0,r0,r5 + popcntd r0,r0 +#else cntlzd r0,r5 /* Count leading zeros before the match. */ +#endif srdi r0,r0,3 /* Convert leading zeros to bytes. */ add r3,r8,r0 /* Return address of matching c/null byte. 
*/ blr diff --git a/libc/sysdeps/powerpc/powerpc64/power7/strlen.S b/libc/sysdeps/powerpc/powerpc64/power7/strlen.S index 343216952..807ef1082 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/strlen.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/strlen.S @@ -30,7 +30,11 @@ ENTRY (strlen) with cmpb. */ li r5,-1 /* MASK = 0xffffffffffffffff. */ ld r12,0(r4) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r5,r5,r6 +#else srd r5,r5,r6 /* MASK = MASK >> padding. */ +#endif orc r9,r12,r5 /* Mask bits that are not part of the string. */ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ @@ -48,9 +52,6 @@ ENTRY (strlen) cmpb r10,r12,r0 cmpdi cr7,r10,0 bne cr7,L(done) - b L(loop) /* We branch here (rather than falling through) - to skip the nops due to heavy alignment - of the loop below. */ /* Main loop to look for the end of the string. Since it's a small loop (< 8 instructions), align it to 32-bytes. */ @@ -87,9 +88,15 @@ L(loop): 0xff in the same position as the null byte in the original doubleword from the string. Use that to calculate the length. */ L(done): - cntlzd r0,r10 /* Count leading zeroes before the match. */ +#ifdef __LITTLE_ENDIAN__ + addi r9, r10, -1 /* Form a mask from trailing zeros. */ + andc r9, r9, r10 + popcntd r0, r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif subf r5,r3,r4 - srdi r0,r0,3 /* Convert leading zeroes to bytes. */ + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ add r3,r5,r0 /* Compute final length. */ blr END (strlen) diff --git a/libc/sysdeps/powerpc/powerpc64/power7/strncmp.S b/libc/sysdeps/powerpc/powerpc64/power7/strncmp.S index 77ecad5ab..e618b010b 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/strncmp.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/strncmp.S @@ -27,7 +27,7 @@ EALIGN (strncmp,5,0) CALL_MCOUNT 3 -#define rTMP r0 +#define rTMP2 r0 #define rRTN r3 #define rSTR1 r3 /* first string arg */ #define rSTR2 r4 /* second string arg */ @@ -40,6 +40,7 @@ EALIGN (strncmp,5,0) #define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */ #define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */ #define rBITDIF r11 /* bits that differ in s1 & s2 words */ +#define rTMP r12 dcbt 0,rSTR1 nop @@ -83,12 +84,57 @@ L(g1): add rTMP,rFEFE,rWORD1 we don't compare two strings as different because of gunk beyond the end of the strings... */ +#ifdef __LITTLE_ENDIAN__ +L(endstring): + addi rTMP2, rTMP, -1 + beq cr1, L(equal) + andc rTMP2, rTMP2, rTMP + rldimi rTMP2, rTMP2, 1, 0 + and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */ + and rWORD1, rWORD1, rTMP2 + cmpd cr1, rWORD1, rWORD2 + beq cr1, L(equal) + cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */ + addi rNEG, rBITDIF, 1 + orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ + sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ + andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ + andc rWORD2, rWORD2, rNEG + xor. rBITDIF, rWORD1, rWORD2 + sub rRTN, rWORD1, rWORD2 + blt L(highbit) + sradi rRTN, rRTN, 63 /* must return an int. */ + ori rRTN, rRTN, 1 + blr +L(equal): + li rRTN, 0 + blr + +L(different): + ld rWORD1, -8(rSTR1) + cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */ + addi rNEG, rBITDIF, 1 + orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */ + sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */ + andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */ + andc rWORD2, rWORD2, rNEG + xor. 
rBITDIF, rWORD1, rWORD2 + sub rRTN, rWORD1, rWORD2 + blt L(highbit) + sradi rRTN, rRTN, 63 + ori rRTN, rRTN, 1 + blr +L(highbit): + sradi rRTN, rWORD2, 63 + ori rRTN, rRTN, 1 + blr + +#else L(endstring): and rTMP,r7F7F,rWORD1 beq cr1,L(equal) add rTMP,rTMP,r7F7F xor. rBITDIF,rWORD1,rWORD2 - andc rNEG,rNEG,rTMP blt L(highbit) cntlzd rBITDIF,rBITDIF @@ -97,7 +143,7 @@ L(endstring): cmpd cr1,rNEG,rBITDIF sub rRTN,rWORD1,rWORD2 blt cr1,L(equal) - sradi rRTN,rRTN,63 + sradi rRTN,rRTN,63 /* must return an int. */ ori rRTN,rRTN,1 blr L(equal): @@ -105,7 +151,7 @@ L(equal): blr L(different): - ldu rWORD1,-8(rSTR1) + ld rWORD1,-8(rSTR1) xor. rBITDIF,rWORD1,rWORD2 sub rRTN,rWORD1,rWORD2 blt L(highbit) @@ -113,11 +159,10 @@ L(different): ori rRTN,rRTN,1 blr L(highbit): - srdi rWORD2,rWORD2,56 - srdi rWORD1,rWORD1,56 - sub rRTN,rWORD1,rWORD2 + sradi rRTN,rWORD2,63 + ori rRTN,rRTN,1 blr - +#endif /* Oh well. In this case, we just do a byte-by-byte comparison. */ .align 4 diff --git a/libc/sysdeps/powerpc/powerpc64/power7/strnlen.S b/libc/sysdeps/powerpc/powerpc64/power7/strnlen.S index 37c7dbfe8..51591069d 100644 --- a/libc/sysdeps/powerpc/powerpc64/power7/strnlen.S +++ b/libc/sysdeps/powerpc/powerpc64/power7/strnlen.S @@ -24,33 +24,29 @@ ENTRY (__strnlen) CALL_MCOUNT 2 dcbt 0,r3 - clrrdi r8,r3,3 + clrrdi r8,r3,3 add r7,r3,r4 /* Calculate the last acceptable address. */ cmpldi r4,32 li r0,0 /* Doubleword with null chars. */ + addi r7,r7,-1 + /* If we have less than 33 bytes to search, skip to a faster code. */ ble L(small_range) - cmpld cr7,r3,r7 /* Is the address equal or less than r3? If - it's equal or less, it means size is either 0 - or a negative number. */ - ble cr7,L(proceed) - - li r7,-1 /* Make r11 the biggest if r4 <= 0. */ -L(proceed): rlwinm r6,r3,3,26,28 /* Calculate padding. */ ld r12,0(r8) /* Load doubleword from memory. */ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + sld r10,r10,r6 +#else sld r10,r10,r6 srd r10,r10,r6 +#endif cmpldi cr7,r10,0 /* If r10 == 0, no null's have been found. */ bne cr7,L(done) - /* Are we done already? */ - addi r9,r8,8 - cmpld cr6,r9,r7 - bge cr6,L(end_max) - + clrrdi r7,r7,3 /* Address of last doubleword. */ mtcrf 0x01,r8 /* Are we now aligned to a quadword boundary? If so, skip to the main loop. Otherwise, go through the alignment code. */ @@ -63,17 +59,18 @@ L(proceed): cmpldi cr7,r10,0 bne cr7,L(done) - /* Are we done already? */ - addi r9,r8,8 - cmpld cr6,r9,r7 - bge cr6,L(end_max) - L(loop_setup): - sub r5,r7,r9 + /* The last dword we want to read in the loop below is the one + containing the last byte of the string, ie. the dword at + (s + size - 1) & ~7, or r7. The first dword read is at + r8 + 8, we read 2 * cnt dwords, so the last dword read will + be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives + cnt = (r7 - r8) / 16 */ + sub r5,r7,r8 srdi r6,r5,4 /* Number of loop iterations. */ mtctr r6 /* Setup the counter. */ - b L(loop) - /* Main loop to look for the null byte backwards in the string. Since + + /* Main loop to look for the null byte in the string. Since it's a small loop (< 8 instructions), align it to 32-bytes. */ .p2align 5 L(loop): @@ -89,15 +86,18 @@ L(loop): cmpldi cr7,r5,0 bne cr7,L(found) bdnz L(loop) - /* We're here because the counter reached 0, and that means we - didn't have any matches for null in the whole range. Just return - the original size. */ - addi r9,r8,8 - cmpld cr6,r9,r7 - blt cr6,L(loop_small) + + /* We may have one more dword to read. 
*/ + cmpld cr6,r8,r7 + beq cr6,L(end_max) + + ldu r12,8(r8) + cmpb r10,r12,r0 + cmpldi cr6,r10,0 + bne cr6,L(done) L(end_max): - sub r3,r7,r3 + mr r3,r4 blr /* OK, one (or both) of the doublewords contains a null byte. Check @@ -119,52 +119,59 @@ L(found): /* r10 has the output of the cmpb instruction, that is, it contains 0xff in the same position as the null byte in the original doubleword from the string. Use that to calculate the length. - We need to make sure the null char is *before* the start of the - range (since we're going backwards). */ + We need to make sure the null char is *before* the end of the + range. */ L(done): - cntlzd r0,r10 /* Count leading zeroes before the match. */ - srdi r0,r0,3 /* Convert leading zeroes to bytes. */ - add r9,r8,r0 - sub r6,r9,r3 /* Length until the match. */ - cmpld r9,r7 - bgt L(end_max) - mr r3,r6 - blr - - .align 4 -L(zero): - li r3,0 +#ifdef __LITTLE_ENDIAN__ + addi r0,r10,-1 + andc r0,r0,r10 + popcntd r0,r0 +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + sub r3,r8,r3 + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r3,r0 /* Length until the match. */ + cmpld r3,r4 + blelr + mr r3,r4 blr /* Deals with size <= 32. */ .align 4 L(small_range): cmpldi r4,0 - beq L(zero) + beq L(end_max) + + clrrdi r7,r7,3 /* Address of last doubleword. */ rlwinm r6,r3,3,26,28 /* Calculate padding. */ - ld r12,0(r8) /* Load word from memory. */ + ld r12,0(r8) /* Load doubleword from memory. */ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ +#ifdef __LITTLE_ENDIAN__ + srd r10,r10,r6 + sld r10,r10,r6 +#else sld r10,r10,r6 srd r10,r10,r6 +#endif cmpldi cr7,r10,0 bne cr7,L(done) - addi r9,r8,8 - cmpld r9,r7 - bge L(end_max) - b L(loop_small) + cmpld r8,r7 + beq L(end_max) .p2align 5 L(loop_small): ldu r12,8(r8) cmpb r10,r12,r0 - addi r9,r8,8 cmpldi cr6,r10,0 bne cr6,L(done) - cmpld r9,r7 - bge L(end_max) - b L(loop_small) + cmpld r8,r7 + bne L(loop_small) + mr r3,r4 + blr + END (__strnlen) weak_alias (__strnlen, strnlen) libc_hidden_builtin_def (strnlen) |
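
Throughout this patch the search byte (or the zero byte) is first replicated into every lane of a doubleword with a three-instruction insrdi sequence, and cmpb then flags each lane of the loaded doubleword that matches it. The following is a minimal C model of both steps, assuming 64-bit doublewords; splat8 and cmpb_bytes are illustrative names, not glibc functions.

  #include <stdint.h>
  #include <stdio.h>

  /* Same effect as the insrdi r4,r4,8,48 / insrdi r4,r4,16,32 /
     insrdi r4,r4,32,0 sequence: replicate one byte into all eight
     byte lanes of a doubleword.  */
  static uint64_t splat8 (unsigned char c)
  {
    uint64_t v = c;
    v |= v << 8;                  /* 2 copies */
    v |= v << 16;                 /* 4 copies */
    v |= v << 32;                 /* 8 copies */
    return v;
  }

  /* Software model of the cmpb instruction: every byte lane that is
     equal in a and b becomes 0xff in the result, every other lane
     becomes 0x00.  */
  static uint64_t cmpb_bytes (uint64_t a, uint64_t b)
  {
    uint64_t r = 0;
    for (int i = 0; i < 8; i++)
      if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
        r |= (uint64_t) 0xff << (8 * i);
    return r;
  }

  int main (void)
  {
    uint64_t word = 0x1122334455667788ULL;   /* one doubleword of input  */
    uint64_t pat  = splat8 (0x55);           /* search byte in all lanes */
    /* Exactly one lane matches, so the result has a single 0xff byte.  */
    printf ("%016llx\n", (unsigned long long) cmpb_bytes (word, pat));
    return 0;
  }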
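
Once cmpb has produced such a mask, the byte offset of the first match in memory order is a leading-zero count on big-endian but a trailing-zero count on little-endian, which is why the little-endian paths added above replace cntlzd with an addi/andc/popcntd sequence. Here is a sketch of both calculations under the assumption of a nonzero mask; the GCC builtins stand in for cntlzd and popcntd, and first_match_offset is an illustrative name.

  #include <assert.h>
  #include <stdint.h>

  /* Byte offset (0..7), in memory order, of the first 0xff lane in the
     result of a cmpb-style compare.  mask must be nonzero.  */
  static unsigned first_match_offset (uint64_t mask, int little_endian)
  {
    unsigned bit;
    if (little_endian)
      /* The patch's addi r,mask,-1 ; andc r,r,mask ; popcntd r,r:
         popcount of the bits below the lowest set bit, i.e. the number
         of trailing zeros.  */
      bit = (unsigned) __builtin_popcountll ((mask - 1) & ~mask);
    else
      /* The big-endian path keeps using cntlzd: leading zeros.  */
      bit = (unsigned) __builtin_clzll (mask);
    return bit >> 3;              /* srdi r0,r0,3: bits -> bytes */
  }

  int main (void)
  {
    /* 0xff in bit positions 24..31, i.e. register byte lane 3.  */
    uint64_t mask = 0x00000000ff000000ULL;
    assert (first_match_offset (mask, 1) == 3);  /* LE: lane 3 is memory byte 3 */
    assert (first_match_offset (mask, 0) == 4);  /* BE: lane 3 is memory byte 4 */
    return 0;
  }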
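
The new loop_setup comments in memrchr and strnlen derive the iteration count from the fact that each pass reads two doublewords, the first of them 8 bytes past (or before) the doubleword already handled. Below is a short check of that arithmetic, assuming 8-byte aligned addresses and at least one iteration; the truncating division is what can leave the single extra doubleword picked up by the "We may have one more dword to read" code after each loop.

  #include <assert.h>
  #include <stdint.h>

  /* Model of the strnlen loop-count derivation (memrchr mirrors it in
     the decreasing direction): r8 is the aligned address of the
     doubleword handled before the loop, r7 the aligned address of the
     doubleword holding the last byte of the range.  */
  static uint64_t loop_count (uint64_t r8, uint64_t r7)
  {
    return (r7 - r8) / 16;        /* sub r5,r7,r8 ; srdi r6,r5,4 */
  }

  int main (void)
  {
    for (uint64_t dwords = 2; dwords < 32; dwords++)   /* >= 1 iteration */
      {
        uint64_t r8 = 0x1000;
        uint64_t r7 = r8 + 8 * dwords;
        uint64_t cnt = loop_count (r8, r7);
        /* Last doubleword read inside the loop: first read at r8 + 8,
           2 * cnt reads of 8 bytes each.  */
        uint64_t last = r8 + 8 + 16 * cnt - 8;
        /* Either the loop already covered r7, or exactly one more
           doubleword is left for the code after the loop.  */
        assert (last == r7 || last + 8 == r7);
      }
    return 0;
  }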
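
In the little-endian strncmp paths, ordering is decided at the least-significant differing byte: cmpb marks equal bytes with 0xff, addi/orc builds a value with 0's below that byte, and sldi/andc then clears every byte above it, so comparing what remains gives the memcmp-style answer. The sketch below is a C model of that masking for the case where the two doublewords simply differ; the end-of-string path additionally masks at the null terminator first (the rldimi step), which the sketch does not cover. compare_le_words and cmpb_bytes are illustrative names, and an unsigned compare replaces the sub/sradi sign handling used in the assembly.

  #include <assert.h>
  #include <stdint.h>

  /* 0xff in every byte lane where a and b are equal (software cmpb).  */
  static uint64_t cmpb_bytes (uint64_t a, uint64_t b)
  {
    uint64_t r = 0;
    for (int i = 0; i < 8; i++)
      if (((a >> (8 * i)) & 0xff) == ((b >> (8 * i)) & 0xff))
        r |= (uint64_t) 0xff << (8 * i);
    return r;
  }

  /* memcmp-style ordering of two doublewords loaded little-endian:
     memory byte 0 is the least significant byte of the register.  */
  static int compare_le_words (uint64_t w1, uint64_t w2)
  {
    if (w1 == w2)
      return 0;
    uint64_t eq = cmpb_bytes (w1, w2);      /* 0xff on equal bytes          */
    uint64_t hi = ((eq + 1) | ~eq) << 8;    /* addi/orc/sldi: 1's strictly
                                               above the LS differing byte  */
    w1 &= ~hi;                              /* andc: keep that byte and the
                                               (equal) bytes below it       */
    w2 &= ~hi;
    return w1 < w2 ? -1 : 1;
  }

  int main (void)
  {
    /* Bytes 0 and 1 equal, byte 2 differs: in little-endian memory
       order the word with the smaller byte 2 compares lower.  */
    uint64_t w1 = 0x00000000005a4241ULL;
    uint64_t w2 = 0x0000000000614241ULL;
    assert (compare_le_words (w1, w2) < 0);
    assert (compare_le_words (w2, w1) > 0);
    assert (compare_le_words (w1, w1) == 0);
    return 0;
  }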