diff options
author | Roland McGrath <roland@gnu.org> | 2003-04-04 22:03:25 +0000 |
---|---|---|
committer | Roland McGrath <roland@gnu.org> | 2003-04-04 22:03:25 +0000 |
commit | beb03cee27a133e3fd34795e32d6d51c7b7b4d4d (patch) | |
tree | 920a38ae4bd7d92bbb77e229af7b92e14c4c66ab | |
parent | 91613ed9d8e3cdef7a4257b1bec241828fa222c1 (diff) | |
download | glibc-beb03cee27a133e3fd34795e32d6d51c7b7b4d4d.tar.gz |
* sysdeps/powerpc/powerpc64/strchr.S: 64-bit optimizations.
* sysdeps/powerpc/powerpc64/strlen.S: 64-bit optimizations.
* sysdeps/powerpc/fpu/bits/mathdef.h (FLT_EVAL_METHOD): Undef before
defining.
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/strchr.S | 42 | ||||
-rw-r--r-- | sysdeps/powerpc/powerpc64/strlen.S | 70 |
3 files changed, 74 insertions, 46 deletions
@@ -1,3 +1,11 @@ +2003-04-04 Steven Munroe <sjmunroe@us.ibm.com> + + * sysdeps/powerpc/powerpc64/strchr.S: 64-bit optimizations. + * sysdeps/powerpc/powerpc64/strlen.S: 64-bit optimizations. + + * sysdeps/powerpc/fpu/bits/mathdef.h (FLT_EVAL_METHOD): Undef before + defining. + 2003-04-04 Alexandre Oliva <aoliva@redhat.com> * sysdeps/unix/sysv/linux/mips/bits/fcntl.h (struct flock): Adjust diff --git a/sysdeps/powerpc/powerpc64/strchr.S b/sysdeps/powerpc/powerpc64/strchr.S index f6d418bcae..e581f8e77a 100644 --- a/sysdeps/powerpc/powerpc64/strchr.S +++ b/sysdeps/powerpc/powerpc64/strchr.S @@ -1,5 +1,5 @@ /* Optimized strchr implementation for PowerPC64. - Copyright (C) 1997, 1999, 2000, 2002 Free Software Foundation, Inc. + Copyright (C) 1997, 1999, 2000, 2002, 2003 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -29,6 +29,11 @@ ENTRY (BP_SYM (strchr)) #define rTMP1 r0 #define rRTN r3 /* outgoing result */ +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and that support was never completed. + Currently PPC gcc does not support -fbounds-check or -fbounded-pointers. + These artifacts are left in the code as a reminder in case we need + bounded pointer support in the future. 
*/ #if __BOUNDED_POINTERS__ # define rSTR r4 # define rCHR r5 /* byte we're looking for, spread over the whole word */ @@ -39,8 +44,8 @@ ENTRY (BP_SYM (strchr)) # define rWORD r5 /* the current word */ #endif #define rCLZB rCHR /* leading zero byte count */ -#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */ -#define r7F7F r7 /* constant 0x7f7f7f7f */ +#define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ +#define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */ #define rTMP2 r9 #define rIGN r10 /* number of bits we should ignore in the first word */ #define rMASK r11 /* mask with the bits to ignore set to 0 */ @@ -49,18 +54,23 @@ ENTRY (BP_SYM (strchr)) CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2) STORE_RETURN_BOUNDS (rTMP1, rTMP2) + dcbt 0,rRTN rlwimi rCHR, rCHR, 8, 16, 23 li rMASK, -1 rlwimi rCHR, rCHR, 16, 0, 15 - rlwinm rIGN, rRTN, 3, 27, 28 + rlwinm rIGN, rRTN, 3, 26, 28 + insrdi rCHR, rCHR, 32, 0 lis rFEFE, -0x101 lis r7F7F, 0x7f7f - clrrdi rSTR, rRTN, 2 + clrrdi rSTR, rRTN, 3 addi rFEFE, rFEFE, -0x101 addi r7F7F, r7F7F, 0x7f7f + sldi rTMP1, rFEFE, 32 + insrdi r7F7F, r7F7F, 32, 0 + add rFEFE, rFEFE, rTMP1 /* Test the first (partial?) word. */ - lwz rWORD, 0(rSTR) - srw rMASK, rMASK, rIGN + ld rWORD, 0(rSTR) + srd rMASK, rMASK, rIGN orc rWORD, rWORD, rMASK add rTMP1, rFEFE, rWORD nor rTMP2, r7F7F, rWORD @@ -71,7 +81,7 @@ ENTRY (BP_SYM (strchr)) /* The loop. */ -L(loop):lwzu rWORD, 4(rSTR) +L(loop):ldu rWORD, 8(rSTR) and. rTMP1, rTMP1, rTMP2 /* Test for 0. 
*/ add rTMP1, rFEFE, rWORD @@ -104,12 +114,12 @@ L(missed): add rTMP1, rTMP1, r7F7F nor rWORD, rMASK, rFEFE nor rTMP2, rIGN, rTMP1 - cmplw rWORD, rTMP2 + cmpld rWORD, rTMP2 bgtlr - cntlzw rCLZB, rTMP2 - srwi rCLZB, rCLZB, 3 + cntlzd rCLZB, rTMP2 + srdi rCLZB, rCLZB, 3 add rRTN, rSTR, rCLZB - CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge) + CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge) STORE_RETURN_VALUE (rSTR) blr @@ -118,11 +128,11 @@ L(foundit): or rIGN, r7F7F, rTMP3 add rTMP1, rTMP1, r7F7F nor rTMP2, rIGN, rTMP1 - cntlzw rCLZB, rTMP2 - subi rSTR, rSTR, 4 - srwi rCLZB, rCLZB, 3 + cntlzd rCLZB, rTMP2 + subi rSTR, rSTR, 8 + srdi rCLZB, rCLZB, 3 add rRTN, rSTR, rCLZB - CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge) + CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge) STORE_RETURN_VALUE (rSTR) blr END (BP_SYM (strchr)) diff --git a/sysdeps/powerpc/powerpc64/strlen.S b/sysdeps/powerpc/powerpc64/strlen.S index 7907382002..22a835b109 100644 --- a/sysdeps/powerpc/powerpc64/strlen.S +++ b/sysdeps/powerpc/powerpc64/strlen.S @@ -1,5 +1,5 @@ /* Optimized strlen implementation for PowerPC64. - Copyright (C) 1997, 1999, 2000, 2002 Free Software Foundation, Inc. + Copyright (C) 1997, 1999, 2000, 2002, 2003 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -60,7 +60,12 @@ 2) How popular are bytes with the high bit set? If they are very rare, on some processors it might be useful to use the simpler expression ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one - ALU), but this fails when any character has its high bit set. */ + ALU), but this fails when any character has its high bit set. + + Answer: + 1) Added a Data Cache Block Touch early to prefetch the first 128 + byte cache line. 
Adding dcbt instructions to the loop would not be + effective since most strings will be shorter than the cache line. */ /* Some notes on register usage: Under the SVR4 ABI, we can use registers 0 and 3 through 12 (so long as we don't call any procedures) without @@ -80,63 +85,68 @@ ENTRY (BP_SYM (strlen)) #define rSTR r4 /* current string position */ #define rPADN r5 /* number of padding bits we prepend to the string to make it start at a word boundary */ -#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */ -#define r7F7F r7 /* constant 0x7f7f7f7f */ -#define rWORD1 r8 /* current string word */ -#define rWORD2 r9 /* next string word */ -#define rMASK r9 /* mask for first string word */ +#define rFEFE r6 /* constant 0xfefefefefefefeff (-0x0101010101010101) */ +#define r7F7F r7 /* constant 0x7f7f7f7f7f7f7f7f */ +#define rWORD1 r8 /* current string doubleword */ +#define rWORD2 r9 /* next string doubleword */ +#define rMASK r9 /* mask for first string doubleword */ #define rTMP2 r10 #define rTMP3 r11 #define rTMP4 r12 +/* Note: The Bounded pointer support in this code is broken. This code + was inherited from PPC32 and that support was never completed. + Current PPC gcc does not support -fbounds-check or -fbounded-pointers. + These artifacts are left in the code as a reminder in case we need + bounded pointer support in the future. */ CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2) - clrrdi rSTR, rRTN, 2 + dcbt 0,rRTN + clrrdi rSTR, rRTN, 3 lis r7F7F, 0x7f7f - rlwinm rPADN, rRTN, 3, 27, 28 - lwz rWORD1, 0(rSTR) - li rMASK, -1 + rlwinm rPADN, rRTN, 3, 26, 28 + ld rWORD1, 0(rSTR) addi r7F7F, r7F7F, 0x7f7f -/* That's the setup done, now do the first pair of words. - We make an exception and use method (2) on the first two words, to reduce - overhead. */ - srw rMASK, rMASK, rPADN + li rMASK, -1 + insrdi r7F7F, r7F7F, 32, 0 +/* That's the setup done, now do the first pair of doublewords. 
+ We make an exception and use method (2) on the first two doublewords, + to reduce overhead. */ + srd rMASK, rMASK, rPADN and rTMP1, r7F7F, rWORD1 or rTMP2, r7F7F, rWORD1 + lis rFEFE, -0x101 add rTMP1, rTMP1, r7F7F + addi rFEFE, rFEFE, -0x101 nor rTMP1, rTMP2, rTMP1 and. rWORD1, rTMP1, rMASK mtcrf 0x01, rRTN bne L(done0) - lis rFEFE, -0x101 - addi rFEFE, rFEFE, -0x101 - clrldi rFEFE,rFEFE,32 /* clear upper 32 */ + sldi rTMP1, rFEFE, 32 + add rFEFE, rFEFE, rTMP1 /* Are we now aligned to a doubleword boundary? */ - bt 29, L(loop) + bt 28, L(loop) -/* Handle second word of pair. */ - lwzu rWORD1, 4(rSTR) +/* Handle second doubleword of pair. */ + ldu rWORD1, 8(rSTR) and rTMP1, r7F7F, rWORD1 or rTMP2, r7F7F, rWORD1 add rTMP1, rTMP1, r7F7F nor. rWORD1, rTMP2, rTMP1 - clrldi. rWORD1,rWORD1,32 /* clear upper 32 */ bne L(done0) /* The loop. */ L(loop): - lwz rWORD1, 4(rSTR) - lwzu rWORD2, 8(rSTR) + ld rWORD1, 8(rSTR) + ldu rWORD2, 16(rSTR) add rTMP1, rFEFE, rWORD1 nor rTMP2, r7F7F, rWORD1 and. rTMP1, rTMP1, rTMP2 - clrldi. rTMP1,rTMP1,32 /* clear upper 32 */ add rTMP3, rFEFE, rWORD2 nor rTMP4, r7F7F, rWORD2 bne L(done1) and. rTMP1, rTMP3, rTMP4 - clrldi. rTMP1,rTMP1,32 /* clear upper 32 */ beq L(loop) and rTMP1, r7F7F, rWORD2 @@ -146,17 +156,17 @@ L(loop): L(done1): and rTMP1, r7F7F, rWORD1 - subi rSTR, rSTR, 4 + subi rSTR, rSTR, 8 add rTMP1, rTMP1, r7F7F andc rWORD1, rTMP2, rTMP1 -/* When we get to here, rSTR points to the first word in the string that +/* When we get to here, rSTR points to the first doubleword in the string that contains a zero byte, and the most significant set bit in rWORD1 is in that byte. */ L(done0): - cntlzw rTMP3, rWORD1 + cntlzd rTMP3, rWORD1 subf rTMP1, rRTN, rSTR - srwi rTMP3, rTMP3, 3 + srdi rTMP3, rTMP3, 3 add rRTN, rTMP1, rTMP3 /* GKM FIXME: check high bound. */ blr |