Diffstat (limited to 'sysdeps/alpha/alphaev6')
-rw-r--r-- | sysdeps/alpha/alphaev6/Implies    |   1
-rw-r--r-- | sysdeps/alpha/alphaev6/addmul_1.s | 479
-rw-r--r-- | sysdeps/alpha/alphaev6/memchr.S   | 192
-rw-r--r-- | sysdeps/alpha/alphaev6/memcpy.S   | 254
-rw-r--r-- | sysdeps/alpha/alphaev6/memset.S   | 224
-rw-r--r-- | sysdeps/alpha/alphaev6/stxcpy.S   | 329
-rw-r--r-- | sysdeps/alpha/alphaev6/stxncpy.S  | 405
7 files changed, 1884 insertions(+), 0 deletions(-)
diff --git a/sysdeps/alpha/alphaev6/Implies b/sysdeps/alpha/alphaev6/Implies
new file mode 100644
index 0000000000..0e7fc170ba
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/Implies
@@ -0,0 +1 @@
+alpha/alphaev5
diff --git a/sysdeps/alpha/alphaev6/addmul_1.s b/sysdeps/alpha/alphaev6/addmul_1.s
new file mode 100644
index 0000000000..a061fb9edb
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/addmul_1.s
@@ -0,0 +1,479 @@
+ # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+ # INPUT PARAMETERS
+ # res_ptr	$16
+ # s1_ptr	$17
+ # size		$18
+ # s2_limb	$19
+ #
+ # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+ # exactly 3.625 cycles/limb on EV6...
+ #
+ # This code was written in close cooperation with ev6 pipeline expert
+ # Steve Root (root@toober.hlo.dec.com).  Any errors are tege's fault, though.
+ #
+ # Register usage for the unrolled loop:
+ #	0-3	mul's
+ #	4-7	acc's
+ #	8-15	mul results
+ #	20,21	carries
+ #	22,23	save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late, so we have paired no-ops to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carries has to happen on one side {0} of the machine.  Note the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache.  The lockup on U0 means that any stall can't be recovered
+ # from.  Consider a ldq in L1: say that load gets stalled because it
+ # collides with a fill from the b_cache.  On the next cycle, this load
+ # gets priority.  It first looks at L0, and goes there.  The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it.  It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code!  And, of course, put in prefetches.  For the
+ # accumulator, lds, intent to modify.  For the multiplier, you might
+ # want ldq, evict next, if you're not wanting to use it again soon.  Use
+ # 256 ahead of the present pointer value.  At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
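For readers following the pipeline notes above, the operation being scheduled is small when written in C. A rough reference version of mpn_addmul_1 (an illustration only, not part of this commit; it assumes 64-bit limbs and a compiler that provides unsigned __int128):

#include <stdint.h>

/* Reference semantics of mpn_addmul_1: multiply the SIZE-limb vector at
   S1_PTR by S2_LIMB, add the product into the vector at RES_PTR, and
   return the final carry limb.  */
uint64_t
mpn_addmul_1_ref (uint64_t *res_ptr, const uint64_t *s1_ptr,
                  long size, uint64_t s2_limb)
{
  uint64_t cy = 0;
  for (long i = 0; i < size; i++)
    {
      unsigned __int128 prod = (unsigned __int128) s1_ptr[i] * s2_limb;
      uint64_t lo = (uint64_t) prod + cy;       /* prod_low + carry in */
      uint64_t hi = (uint64_t) (prod >> 64);    /* prod_high */
      hi += lo < (uint64_t) prod;               /* carry out of that add */
      uint64_t sum = res_ptr[i] + lo;           /* add into *res_ptr */
      hi += sum < lo;                           /* carry out of that add */
      res_ptr[i] = sum;
      cy = hi;
    }
  return cy;
}

Each iteration produces two separate carries (one from folding cy into the low product, one from adding into the result limb); that is the entangled carry chain the scheduling comments wrestle with.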
+ # + # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd + # like not to have a ldq or stq to preceded a conditional branch in a + # quadpack. The conditional branch moves the retire pointer one cycle + # later. + # + # Optimization notes: + # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? + # Reserved regs: $29 $30 $31 + # Free caller-saves regs in unrolled code: $24 $25 $28 + # We should swap some of the callee-saves regs for some of the free + # caller-saves regs, saving some overhead cycles. + # Most importantly, we should write fast code for the 0-7 case. + # The code we use there are for the 21164, and runs at 7 cycles/limb + # on the 21264. Should not be hard, if we write specialized code for + # 1-7 limbs (the one for 0 limbs should be straightforward). We then just + # need a jump table indexed by the low 3 bits of the count argument. + + .set noreorder + .set noat + .text + + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .frame $30,0,$26,0 + .prologue 0 + + cmpult $18, 8, $1 + beq $1, $Large + + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # s1_ptr++ + subq $18, 1, $18 # size-- + mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $18, $Lend0b # jump if size was == 1 + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # s1_ptr++ + subq $18, 1, $18 # size-- + addq $5, $3, $3 + cmpult $3, $5, $4 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + beq $18, $Lend0a # jump if size was == 2 + + .align 3 +$Loop0: mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + addq $4, $0, $0 # cy_limb = cy_limb + 'cy' + subq $18, 1, $18 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # s1_ptr++ + addq $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + addq $5, $0, $0 # combine carries + bne $18, $Loop0 +$Lend0a: + mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + addq $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addq $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $5, $0, $0 # combine carries + addq $4, $0, $0 # cy_limb = prod_high + cy + ret $31, ($26), 1 +$Lend0b: + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $0, $5, $0 + ret $31, ($26), 1 + +$Large: + lda $30, -240($30) + stq $9, 8($30) + stq $10, 16($30) + stq $11, 24($30) + stq $12, 32($30) + stq $13, 40($30) + stq $14, 48($30) + stq $15, 56($30) + + and $18, 7, $20 # count for the first loop, 0-7 + srl $18, 3, $18 # count for unrolled loop + bis $31, $31, $0 + beq $20, $Lunroll + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # s1_ptr++ + subq $20, 1, $20 # size-- + mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + umulh $2, $19, $0 # $0 = prod_high + beq $20, $Lend1b # jump if size was == 1 + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # s1_ptr++ + subq $20, 1, $20 # size-- + addq $5, $3, $3 + cmpult $3, $5, $4 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + beq $20, $Lend1a # jump if size was == 2 + + .align 3 +$Loop1: mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + addq $4, $0, $0 # cy_limb = cy_limb + 'cy' + subq $20, 1, $20 # size-- + umulh $2, $19, $4 # $4 = cy_limb + ldq $2, 0($17) # $2 = s1_limb + addq $17, 8, $17 # 
s1_ptr++ + addq $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + addq $5, $0, $0 # combine carries + bne $20, $Loop1 + +$Lend1a: + mulq $2, $19, $3 # $3 = prod_low + ldq $5, 0($16) # $5 = *res_ptr + addq $4, $0, $0 # cy_limb = cy_limb + 'cy' + umulh $2, $19, $4 # $4 = cy_limb + addq $3, $0, $3 # $3 = cy_limb + prod_low + cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + addq $5, $0, $0 # combine carries + addq $4, $0, $0 # cy_limb = prod_high + cy + br $31, $Lunroll +$Lend1b: + addq $5, $3, $3 + cmpult $3, $5, $5 + stq $3, 0($16) + addq $16, 8, $16 # res_ptr++ + addq $0, $5, $0 + +$Lunroll: + lda $17, -16($17) # L1 bookkeeping + lda $16, -16($16) # L1 bookkeeping + bis $0, $31, $12 + + # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldq $2, 16($17) # L1 + ldq $3, 24($17) # L1 + lda $18, -1($18) # L1 bookkeeping + ldq $6, 16($16) # L1 + ldq $7, 24($16) # L1 + ldq $0, 32($17) # L1 + mulq $19, $2, $13 # U1 + ldq $1, 40($17) # L1 + umulh $19, $2, $14 # U1 + mulq $19, $3, $15 # U1 + lda $17, 64($17) # L1 bookkeeping + ldq $4, 32($16) # L1 + ldq $5, 40($16) # L1 + umulh $19, $3, $8 # U1 + ldq $2, -16($17) # L1 + mulq $19, $0, $9 # U1 + ldq $3, -8($17) # L1 + umulh $19, $0, $10 # U1 + addq $6, $13, $6 # L0 lo + acc + mulq $19, $1, $11 # U1 + cmpult $6, $13, $20 # L0 lo add => carry + lda $16, 64($16) # L1 bookkeeping + addq $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addq $14, $20, $14 # U0 hi mul + carry + ldq $6, -16($16) # L1 + addq $7, $15, $23 # L0 lo + acc + addq $14, $21, $14 # U0 hi mul + carry + ldq $7, -8($16) # L1 + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addq $23, $14, $23 # U0 hi add => answer + ldq $0, 0($17) # L1 + mulq $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addq $8, $20, $8 # U0 hi mul + carry + ldq $1, 8($17) # L1 + umulh $19, $2, $14 # U1 + addq $4, $9, $4 # L0 lo + acc + stq $22, -48($16) # L0 + stq $23, -40($16) # L1 + mulq $19, $3, $15 # U1 + addq $8, $21, $8 # U0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addq $4, $8, $22 # U0 hi add => answer + ble $18, $Lend # U1 bookkeeping + + # ____ MAIN UNROLLED LOOP ____ + .align 4 +$Loop: + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addq $10, $20, $10 # U0 hi mul + carry + ldq $4, 0($16) # L1 + + bis $31, $31, $31 # U1 mt + addq $5, $11, $23 # L0 lo + acc + addq $10, $21, $10 # L0 hi mul + carry + ldq $5, 8($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addq $23, $10, $23 # U0 hi add => answer + ldq $2, 16($17) # L1 + + mulq $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addq $12, $20, $12 # U0 hi mul + carry + ldq $3, 24($17) # L1 + + umulh $19, $0, $10 # U1 + addq $6, $13, $6 # L0 lo + acc + stq $22, -32($16) # L0 + stq $23, -24($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mulq $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addq $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + lda $18, -1($18) # L1 bookkeeping + addq $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addq $14, $20, $14 # U0 hi mul + carry + ldq $6, 16($16) # L1 + + bis $31, $31, $31 # U1 mt + addq $7, $15, $23 # L0 lo + 
acc + addq $14, $21, $14 # U0 hi mul + carry + ldq $7, 24($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addq $23, $14, $23 # U0 hi add => answer + ldq $0, 32($17) # L1 + + mulq $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addq $8, $20, $8 # U0 hi mul + carry + ldq $1, 40($17) # L1 + + umulh $19, $2, $14 # U1 + addq $4, $9, $4 # U0 lo + acc + stq $22, -16($16) # L0 + stq $23, -8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mulq $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addq $8, $21, $8 # L0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + lda $17, 64($17) # L1 bookkeeping + addq $4, $8, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $8, $21 # L0 hi add => carry + addq $10, $20, $10 # U0 hi mul + carry + ldq $4, 32($16) # L1 + + bis $31, $31, $31 # U1 mt + addq $5, $11, $23 # L0 lo + acc + addq $10, $21, $10 # L0 hi mul + carry + ldq $5, 40($16) # L1 + + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addq $23, $10, $23 # U0 hi add => answer + ldq $2, -16($17) # L1 + + mulq $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addq $12, $20, $12 # U0 hi mul + carry + ldq $3, -8($17) # L1 + + umulh $19, $0, $10 # U1 + addq $6, $13, $6 # L0 lo + acc + stq $22, 0($16) # L0 + stq $23, 8($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mulq $19, $1, $11 # U1 + bis $31, $31, $31 # L1 st slosh + addq $12, $21, $12 # U0 hi mul + carry + + cmpult $6, $13, $20 # L0 lo add => carry + bis $31, $31, $31 # U1 mt + lda $16, 64($16) # L1 bookkeeping + addq $6, $12, $22 # U0 hi add => answer + + bis $31, $31, $31 # U1 mt + cmpult $22, $12, $21 # L0 hi add => carry + addq $14, $20, $14 # U0 hi mul + carry + ldq $6, -16($16) # L1 + + bis $31, $31, $31 # U1 mt + addq $7, $15, $23 # L0 lo + acc + addq $14, $21, $14 # U0 hi mul + carry + ldq $7, -8($16) # L1 + + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo add => carry + addq $23, $14, $23 # U0 hi add => answer + ldq $0, 0($17) # L1 + + mulq $19, $2, $13 # U1 + cmpult $23, $14, $21 # L0 hi add => carry + addq $8, $20, $8 # U0 hi mul + carry + ldq $1, 8($17) # L1 + + umulh $19, $2, $14 # U1 + addq $4, $9, $4 # L0 lo + acc + stq $22, -48($16) # L0 + stq $23, -40($16) # L1 + + bis $31, $31, $31 # L0 st slosh + mulq $19, $3, $15 # U1 + bis $31, $31, $31 # L1 st slosh + addq $8, $21, $8 # U0 hi mul + carry + + cmpult $4, $9, $20 # L0 lo add => carry + addq $4, $8, $22 # U0 hi add => answer + bis $31, $31, $31 # L1 mt + bgt $18, $Loop # U1 bookkeeping + +# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult $22, $8, $21 # L0 hi add => carry + addq $10, $20, $10 # U0 hi mul + carry + ldq $4, 0($16) # L1 + addq $5, $11, $23 # L0 lo + acc + addq $10, $21, $10 # L0 hi mul + carry + ldq $5, 8($16) # L1 + umulh $19, $3, $8 # U1 + cmpult $23, $11, $20 # L0 lo add => carry + addq $23, $10, $23 # U0 hi add => answer + mulq $19, $0, $9 # U1 + cmpult $23, $10, $21 # L0 hi add => carry + addq $12, $20, $12 # U0 hi mul + carry + umulh $19, $0, $10 # U1 + addq $6, $13, $6 # L0 lo + acc + stq $22, -32($16) # L0 + stq $23, -24($16) # L1 + mulq $19, $1, $11 # U1 + addq $12, $21, $12 # U0 hi mul + carry + cmpult $6, $13, $20 # L0 lo add => carry + addq $6, $12, $22 # U0 hi add => answer + cmpult $22, $12, $21 # L0 hi add => carry + addq $14, $20, $14 # U0 hi mul + carry + addq $7, $15, $23 # L0 lo + acc + addq $14, $21, $14 # U0 hi mul + carry + umulh $19, $1, $12 # U1 + cmpult $23, $15, $20 # L0 lo 
add => carry + addq $23, $14, $23 # U0 hi add => answer + cmpult $23, $14, $21 # L0 hi add => carry + addq $8, $20, $8 # U0 hi mul + carry + addq $4, $9, $4 # U0 lo + acc + stq $22, -16($16) # L0 + stq $23, -8($16) # L1 + bis $31, $31, $31 # L0 st slosh + addq $8, $21, $8 # L0 hi mul + carry + cmpult $4, $9, $20 # L0 lo add => carry + addq $4, $8, $22 # U0 hi add => answer + cmpult $22, $8, $21 # L0 hi add => carry + addq $10, $20, $10 # U0 hi mul + carry + addq $5, $11, $23 # L0 lo + acc + addq $10, $21, $10 # L0 hi mul + carry + cmpult $23, $11, $20 # L0 lo add => carry + addq $23, $10, $23 # U0 hi add => answer + cmpult $23, $10, $21 # L0 hi add => carry + addq $12, $20, $12 # U0 hi mul + carry + stq $22, 0($16) # L0 + stq $23, 8($16) # L1 + addq $12, $21, $0 # U0 hi mul + carry + + ldq $9, 8($30) + ldq $10, 16($30) + ldq $11, 24($30) + ldq $12, 32($30) + ldq $13, 40($30) + ldq $14, 48($30) + ldq $15, 56($30) + lda $30, 240($30) + ret $31, ($26), 1 + + .end __mpn_addmul_1 diff --git a/sysdeps/alpha/alphaev6/memchr.S b/sysdeps/alpha/alphaev6/memchr.S new file mode 100644 index 0000000000..0dfcbea76a --- /dev/null +++ b/sysdeps/alpha/alphaev6/memchr.S @@ -0,0 +1,192 @@ +/* Copyright (C) 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by David Mosberger (davidm@cs.arizona.edu). + EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include <sysdep.h> + + .arch ev6 + .set noreorder + .set noat + +ENTRY(__memchr) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + # Hack -- if someone passes in (size_t)-1, hoping to just + # search til the end of the address space, we will overflow + # below when we find the address of the last byte. Given + # that we will never have a 56-bit address space, cropping + # the length is the easiest way to avoid trouble. + zap $18, 0x80, $5 # U : Bound length + beq $18, $not_found # U : + ldq_u $1, 0($16) # L : load first quadword Latency=3 + and $17, 0xff, $17 # E : L L U U : 00000000000000ch + + insbl $17, 1, $2 # U : 000000000000ch00 + cmpult $18, 9, $4 # E : small (< 1 quad) string? 
+ or $2, $17, $17 # E : 000000000000chch + lda $3, -1($31) # E : U L L U + + sll $17, 16, $2 # U : 00000000chch0000 + addq $16, $5, $5 # E : Max search address + or $2, $17, $17 # E : 00000000chchchch + sll $17, 32, $2 # U : U L L U : chchchch00000000 + + or $2, $17, $17 # E : chchchchchchchch + extql $1, $16, $7 # U : $7 is upper bits + beq $4, $first_quad # U : + ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3 + + extqh $6, $16, $6 # U : 2 cycle stall for $6 + mov $16, $0 # E : + nop # E : + or $7, $6, $1 # E : L U L U $1 = quadword starting at $16 + + # Deal with the case where at most 8 bytes remain to be searched + # in $1. E.g.: + # $18 = 6 + # $1 = ????c6c5c4c3c2c1 +$last_quad: + negq $18, $6 # E : + xor $17, $1, $1 # E : + srl $3, $6, $6 # U : $6 = mask of $18 bits set + cmpbge $31, $1, $2 # E : L U L U + + nop + nop + and $2, $6, $2 # E : + beq $2, $not_found # U : U L U L + +$found_it: +#if defined(__alpha_fix__) && defined(__alpha_cix__) + /* + * Since we are guaranteed to have set one of the bits, we don't + * have to worry about coming back with a 0x40 out of cttz... + */ + cttz $2, $3 # U0 : + addq $0, $3, $0 # E : All done + nop # E : + ret # L0 : L U L U +#else + /* + * Slow and clunky. It can probably be improved. + * An exercise left for others. + */ + negq $2, $3 # E : + and $2, $3, $2 # E : + and $2, 0x0f, $1 # E : + addq $0, 4, $3 # E : + + cmoveq $1, $3, $0 # E : Latency 2, extra map cycle + nop # E : keep with cmov + and $2, 0x33, $1 # E : + addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0 + + cmoveq $1, $3, $0 # E : Latency 2, extra map cycle + nop # E : keep with cmov + and $2, 0x55, $1 # E : + addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0 + + cmoveq $1, $3, $0 # E : Latency 2, extra map cycle + nop + nop + ret # L0 : L U L U +#endif + + # Deal with the case where $18 > 8 bytes remain to be + # searched. $16 may not be aligned. + .align 4 +$first_quad: + andnot $16, 0x7, $0 # E : + insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff) + xor $1, $17, $1 # E : + or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff + + cmpbge $31, $1, $2 # E : + bne $2, $found_it # U : + # At least one byte left to process. + ldq $1, 8($0) # L : + subq $5, 1, $18 # E : U L U L + + addq $0, 8, $0 # E : + # Make $18 point to last quad to be accessed (the + # last quad may or may not be partial). + andnot $18, 0x7, $18 # E : + cmpult $0, $18, $2 # E : + beq $2, $final # U : U L U L + + # At least two quads remain to be accessed. + + subq $18, $0, $4 # E : $4 <- nr quads to be processed + and $4, 8, $4 # E : odd number of quads? 
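As a reading aid for the quadword scan above: the insbl/sll/or sequence replicates the search byte across all eight lanes, and cmpbge zero, x asks whether any byte of x is zero. A portable C sketch of the same idea (illustrative only, not part of the commit; the classic bit trick stands in for cmpbge):

#include <stddef.h>
#include <stdint.h>

/* Quadword memchr sketch: replicate the byte, then test eight bytes at
   a time.  The inner test is nonzero exactly when some byte of X is
   zero, i.e. when some byte of the quad matched C.  */
static const void *
memchr_sketch (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  uint64_t rep = (uint64_t) (unsigned char) c * 0x0101010101010101ULL;

  for (; n > 0 && ((uintptr_t) p & 7) != 0; p++, n--)  /* align first */
    if (*p == (unsigned char) c)
      return p;

  for (; n >= 8; p += 8, n -= 8)
    {
      uint64_t x = *(const uint64_t *) p ^ rep;        /* matches become 0 */
      if ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL)
        break;                                         /* a byte matched */
    }

  for (; n > 0; p++, n--)                              /* tail / hit quad */
    if (*p == (unsigned char) c)
      return p;
  return NULL;
}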
+ bne $4, $odd_quad_count # U : + # At least three quads remain to be accessed + mov $1, $4 # E : L U L U : move prefetched value to correct reg + + .align 4 +$unrolled_loop: + ldq $1, 8($0) # L : prefetch $1 + xor $17, $4, $2 # E : + cmpbge $31, $2, $2 # E : + bne $2, $found_it # U : U L U L + + addq $0, 8, $0 # E : + nop # E : + nop # E : + nop # E : + +$odd_quad_count: + xor $17, $1, $2 # E : + ldq $4, 8($0) # L : prefetch $4 + cmpbge $31, $2, $2 # E : + addq $0, 8, $6 # E : + + bne $2, $found_it # U : + cmpult $6, $18, $6 # E : + addq $0, 8, $0 # E : + nop # E : + + bne $6, $unrolled_loop # U : + mov $4, $1 # E : move prefetched value into $1 + nop # E : + nop # E : + +$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do + nop # E : + nop # E : + bne $18, $last_quad # U : + +$not_found: + mov $31, $0 # E : + nop # E : + nop # E : + ret # L0 : + + END(__memchr) + +weak_alias (__memchr, memchr) +#if !__BOUNDED_POINTERS__ +weak_alias (__memchr, __ubp_memchr) +#endif diff --git a/sysdeps/alpha/alphaev6/memcpy.S b/sysdeps/alpha/alphaev6/memcpy.S new file mode 100644 index 0000000000..35f17e7f9e --- /dev/null +++ b/sysdeps/alpha/alphaev6/memcpy.S @@ -0,0 +1,254 @@ +/* Copyright (C) 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* + * Much of the information about 21264 scheduling/coding comes from: + * Compiler Writer's Guide for the Alpha 21264 + * abbreviated as 'CWG' in other comments here + * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html + * Scheduling notation: + * E - either cluster + * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 + * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 + * + * Temp usage notes: + * $0 - destination address + * $1,$2, - scratch + */ + +#include <sysdep.h> + + .arch ev6 + .set noreorder + .set noat + +ENTRY(memcpy) + + mov $16, $0 # E : copy dest to return + ble $18, $nomoredata # U : done with the copy? + xor $16, $17, $1 # E : are source and dest alignments the same? + and $1, 7, $1 # E : are they the same mod 8? + + bne $1, $misaligned # U : Nope - gotta do this the slow way + /* source and dest are same mod 8 address */ + and $16, 7, $1 # E : Are both 0mod8? + beq $1, $both_0mod8 # U : Yes + nop # E : + + /* + * source and dest are same misalignment. move a byte at a time + * until a 0mod8 alignment for both is reached. + * At least one byte more to move + */ + +$head_align: + ldbu $1, 0($17) # L : grab a byte + subq $18, 1, $18 # E : count-- + addq $17, 1, $17 # E : src++ + stb $1, 0($16) # L : + addq $16, 1, $16 # E : dest++ + and $16, 7, $1 # E : Are we at 0mod8 yet? + ble $18, $nomoredata # U : done with the copy? 
+ bne $1, $head_align # U : + +$both_0mod8: + cmple $18, 127, $1 # E : Can we unroll the loop? + bne $1, $no_unroll # U : + and $16, 63, $1 # E : get mod64 alignment + beq $1, $do_unroll # U : no single quads to fiddle + +$single_head_quad: + ldq $1, 0($17) # L : get 8 bytes + subq $18, 8, $18 # E : count -= 8 + addq $17, 8, $17 # E : src += 8 + nop # E : + + stq $1, 0($16) # L : store + addq $16, 8, $16 # E : dest += 8 + and $16, 63, $1 # E : get mod64 alignment + bne $1, $single_head_quad # U : still not fully aligned + +$do_unroll: + addq $16, 64, $7 # E : Initial (+1 trip) wh64 address + cmple $18, 63, $1 # E : Can we go through the unrolled loop? + bne $1, $tail_quads # U : Nope + nop # E : + +$unroll_body: + wh64 ($7) # L1 : memory subsystem hint: 64 bytes at + # ($7) are about to be over-written + ldq $6, 0($17) # L0 : bytes 0..7 + nop # E : + nop # E : + + ldq $4, 8($17) # L : bytes 8..15 + ldq $5, 16($17) # L : bytes 16..23 + addq $7, 64, $7 # E : Update next wh64 address + nop # E : + + ldq $3, 24($17) # L : bytes 24..31 + addq $16, 64, $1 # E : fallback value for wh64 + nop # E : + nop # E : + + addq $17, 32, $17 # E : src += 32 bytes + stq $6, 0($16) # L : bytes 0..7 + nop # E : + nop # E : + + stq $4, 8($16) # L : bytes 8..15 + stq $5, 16($16) # L : bytes 16..23 + subq $18, 192, $2 # E : At least two more trips to go? + nop # E : + + stq $3, 24($16) # L : bytes 24..31 + addq $16, 32, $16 # E : dest += 32 bytes + nop # E : + nop # E : + + ldq $6, 0($17) # L : bytes 0..7 + ldq $4, 8($17) # L : bytes 8..15 + cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use + # fallback wh64 address if < 2 more trips + nop # E : + + ldq $5, 16($17) # L : bytes 16..23 + ldq $3, 24($17) # L : bytes 24..31 + addq $16, 32, $16 # E : dest += 32 + subq $18, 64, $18 # E : count -= 64 + + addq $17, 32, $17 # E : src += 32 + stq $6, -32($16) # L : bytes 0..7 + stq $4, -24($16) # L : bytes 8..15 + cmple $18, 63, $1 # E : At least one more trip? + + stq $5, -16($16) # L : bytes 16..23 + stq $3, -8($16) # L : bytes 24..31 + nop # E : + beq $1, $unroll_body + +$tail_quads: +$no_unroll: + .align 4 + subq $18, 8, $18 # E : At least a quad left? + blt $18, $less_than_8 # U : Nope + nop # E : + nop # E : + +$move_a_quad: + ldq $1, 0($17) # L : fetch 8 + subq $18, 8, $18 # E : count -= 8 + addq $17, 8, $17 # E : src += 8 + nop # E : + + stq $1, 0($16) # L : store 8 + addq $16, 8, $16 # E : dest += 8 + bge $18, $move_a_quad # U : + nop # E : + +$less_than_8: + .align 4 + addq $18, 8, $18 # E : add back for trailing bytes + ble $18, $nomoredata # U : All-done + nop # E : + nop # E : + + /* Trailing bytes */ +$tail_bytes: + subq $18, 1, $18 # E : count-- + ldbu $1, 0($17) # L : fetch a byte + addq $17, 1, $17 # E : src++ + nop # E : + + stb $1, 0($16) # L : store a byte + addq $16, 1, $16 # E : dest++ + bgt $18, $tail_bytes # U : more to be done? + nop # E : + + /* branching to exit takes 3 extra cycles, so replicate exit here */ + ret $31, ($26), 1 # L0 : + nop # E : + nop # E : + nop # E : + +$misaligned: + mov $0, $4 # E : dest temp + and $0, 7, $1 # E : dest alignment mod8 + beq $1, $dest_0mod8 # U : life doesnt totally suck + nop + +$aligndest: + ble $18, $nomoredata # U : + ldbu $1, 0($17) # L : fetch a byte + subq $18, 1, $18 # E : count-- + addq $17, 1, $17 # E : src++ + + stb $1, 0($4) # L : store it + addq $4, 1, $4 # E : dest++ + and $4, 7, $1 # E : dest 0mod8 yet? + bne $1, $aligndest # U : go until we are aligned. 
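The overall shape of the co-aligned path just traced ($head_align, quadword body, $tail_bytes) is easier to see in C. A structural sketch (illustration only, not part of the commit; the real code adds the 64-byte unrolled loop with wh64 hints and the misaligned path that follows):

#include <stddef.h>
#include <stdint.h>

/* Co-aligned path in C terms: byte copies up to an 8-byte boundary,
   quadword copies for the bulk, byte copies for the tail.  Once D is
   0mod8, S is too, because this path is only taken when src and dst
   agree mod 8.  */
static void *
memcpy_coaligned_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  while (n > 0 && ((uintptr_t) d & 7) != 0)   /* cf. $head_align */
    { *d++ = *s++; n--; }
  for (; n >= 8; d += 8, s += 8, n -= 8)      /* cf. $move_a_quad */
    *(uint64_t *) d = *(const uint64_t *) s;
  while (n-- > 0)                             /* cf. $tail_bytes */
    *d++ = *s++;
  return dst;
}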
+ + /* Source has unknown alignment, but dest is known to be 0mod8 */ +$dest_0mod8: + subq $18, 8, $18 # E : At least a quad left? + blt $18, $misalign_tail # U : Nope + ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes + nop # E : + +$mis_quad: + ldq_u $16, 8($17) # L : Fetch next 8 + extql $3, $17, $3 # U : masking + extqh $16, $17, $1 # U : masking + bis $3, $1, $1 # E : merged bytes to store + + subq $18, 8, $18 # E : count -= 8 + addq $17, 8, $17 # E : src += 8 + stq $1, 0($4) # L : store 8 (aligned) + mov $16, $3 # E : "rotate" source data + + addq $4, 8, $4 # E : dest += 8 + bge $18, $mis_quad # U : More quads to move + nop + nop + +$misalign_tail: + addq $18, 8, $18 # E : account for tail stuff + ble $18, $nomoredata # U : + nop + nop + +$misalign_byte: + ldbu $1, 0($17) # L : fetch 1 + subq $18, 1, $18 # E : count-- + addq $17, 1, $17 # E : src++ + nop # E : + + stb $1, 0($4) # L : store + addq $4, 1, $4 # E : dest++ + bgt $18, $misalign_byte # U : more to go? + nop + + +$nomoredata: + ret $31, ($26), 1 # L0 : + nop # E : + nop # E : + nop # E : + +END(memcpy) diff --git a/sysdeps/alpha/alphaev6/memset.S b/sysdeps/alpha/alphaev6/memset.S new file mode 100644 index 0000000000..363b3a588b --- /dev/null +++ b/sysdeps/alpha/alphaev6/memset.S @@ -0,0 +1,224 @@ +/* Copyright (C) 2000 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include <sysdep.h> + + .arch ev6 + .set noat + .set noreorder + +ENTRY(memset) +#ifdef PROF + ldgp gp, 0(pv) + lda AT, _mcount + jsr AT, (AT), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + /* + * Serious stalling happens. The only way to mitigate this is to + * undertake a major re-write to interleave the constant materialization + * with other parts of the fall-through code. This is important, even + * though it makes maintenance tougher. + * Do this later. + */ + and $17, 255, $1 # E : 00000000000000ch + insbl $17, 1, $2 # U : 000000000000ch00 + mov $16, $0 # E : return value + ble $18, $end # U : zero length requested? + + addq $18, $16, $6 # E : max address to write to + or $1, $2, $17 # E : 000000000000chch + insbl $1, 2, $3 # U : 0000000000ch0000 + insbl $1, 3, $4 # U : 00000000ch000000 + + or $3, $4, $3 # E : 00000000chch0000 + inswl $17, 4, $5 # U : 0000chch00000000 + xor $16, $6, $1 # E : will complete write be within one quadword? + inswl $17, 6, $2 # U : chch000000000000 + + or $17, $3, $17 # E : 00000000chchchch + or $2, $5, $2 # E : chchchch00000000 + bic $1, 7, $1 # E : fit within a single quadword? 
+ and $16, 7, $3 # E : Target addr misalignment + + or $17, $2, $17 # E : chchchchchchchch + beq $1, $within_quad # U : + nop # E : + beq $3, $aligned # U : target is 0mod8 + + /* + * Target address is misaligned, and won't fit within a quadword. + */ + ldq_u $4, 0($16) # L : Fetch first partial + mov $16, $5 # E : Save the address + insql $17, $16, $2 # U : Insert new bytes + subq $3, 8, $3 # E : Invert (for addressing uses) + + addq $18, $3, $18 # E : $18 is new count ($3 is negative) + mskql $4, $16, $4 # U : clear relevant parts of the quad + subq $16, $3, $16 # E : $16 is new aligned destination + or $2, $4, $1 # E : Final bytes + + nop + stq_u $1,0($5) # L : Store result + nop + nop + + .align 4 +$aligned: + /* + * We are now guaranteed to be quad aligned, with at least + * one partial quad to write. + */ + + sra $18, 3, $3 # U : Number of remaining quads to write + and $18, 7, $18 # E : Number of trailing bytes to write + mov $16, $5 # E : Save dest address + beq $3, $no_quad # U : tail stuff only + + /* + * It's worth the effort to unroll this and use wh64 if possible. + * At this point, entry values are: + * $16 Current destination address + * $5 A copy of $16 + * $6 The max quadword address to write to + * $18 Number trailer bytes + * $3 Number quads to write + */ + + and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) + subq $3, 16, $4 # E : Only try to unroll if > 128 bytes + subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) + blt $4, $loop # U : + + /* + * We know we've got at least 16 quads, minimum of one trip + * through unrolled loop. Do a quad at a time to get us 0mod64 + * aligned. + */ + + nop # E : + nop # E : + nop # E : + beq $1, $bigalign # U : + +$alignmod64: + stq $17, 0($5) # L : + subq $3, 1, $3 # E : For consistency later + addq $1, 8, $1 # E : Increment towards zero for alignment + addq $5, 8, $4 # E : Initial wh64 address (filler instruction) + + nop + nop + addq $5, 8, $5 # E : Inc address + blt $1, $alignmod64 # U : + +$bigalign: + /* + * $3 - number quads left to go + * $5 - target address (aligned 0mod64) + * $17 - mask of stuff to store + * Scratch registers available: $7, $2, $4, $1 + * We know that we'll be taking a minimum of one trip through. + * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle + * Assumes the wh64 needs to be for 2 trips through the loop in the future. + * The wh64 is issued on for the starting destination address for trip +2 + * through the loop, and if there are less than two trips left, the target + * address will be for the current trip. + */ + +$do_wh64: + wh64 ($4) # L1 : memory subsystem write hint + subq $3, 24, $2 # E : For determining future wh64 addresses + stq $17, 0($5) # L : + nop # E : + + addq $5, 128, $4 # E : speculative target of next wh64 + stq $17, 8($5) # L : + stq $17, 16($5) # L : + addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) + + stq $17, 24($5) # L : + stq $17, 32($5) # L : + cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle + nop + + stq $17, 40($5) # L : + stq $17, 48($5) # L : + subq $3, 16, $2 # E : Repeat the loop at least once more? 
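For the block fill in progress here: the insbl/inswl/or chain earlier builds the same eight-lane pattern that a multiply by 0x0101010101010101 produces, and the loops stream it out a quadword (or, unrolled, 64 bytes per wh64 hint) at a time. A minimal C sketch of that structure (illustrative only, not part of the commit):

#include <stddef.h>
#include <stdint.h>

/* memset in the same shape: build the replicated pattern, fill bytes to
   an 8-byte boundary, stream quadwords, then finish the 0..7 trailing
   bytes.  The real code handles the partial head/tail quads with
   ldq_u/insql/mskql instead of byte stores.  */
static void *
memset_sketch (void *dst, int c, size_t n)
{
  unsigned char *d = dst;
  uint64_t pat = (uint64_t) (unsigned char) c * 0x0101010101010101ULL;

  while (n > 0 && ((uintptr_t) d & 7) != 0)   /* head bytes */
    { *d++ = (unsigned char) c; n--; }
  for (; n >= 8; d += 8, n -= 8)              /* cf. $loop / $do_wh64 */
    *(uint64_t *) d = pat;
  while (n-- > 0)                             /* trailing bytes */
    *d++ = (unsigned char) c;
  return dst;
}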
+ nop + + stq $17, 56($5) # L : + addq $5, 64, $5 # E : + subq $3, 8, $3 # E : + bge $2, $do_wh64 # U : + + nop + nop + nop + beq $3, $no_quad # U : Might have finished already + + .align 4 + /* + * Simple loop for trailing quadwords, or for small amounts + * of data (where we can't use an unrolled loop and wh64) + */ +$loop: + stq $17, 0($5) # L : + subq $3, 1, $3 # E : Decrement number quads left + addq $5, 8, $5 # E : Inc address + bne $3, $loop # U : more? + +$no_quad: + /* + * Write 0..7 trailing bytes. + */ + nop # E : + beq $18, $end # U : All done? + ldq $7, 0($5) # L : + mskqh $7, $6, $2 # U : Mask final quad + + insqh $17, $6, $4 # U : New bits + or $2, $4, $1 # E : Put it all together + stq $1, 0($5) # L : And back to memory + ret $31,($26),1 # L0 : + +$within_quad: + ldq_u $1, 0($16) # L : + insql $17, $16, $2 # U : New bits + mskql $1, $16, $4 # U : Clear old + or $2, $4, $2 # E : New result + + mskql $2, $6, $4 # U : + mskqh $1, $6, $2 # U : + or $2, $4, $1 # E : + stq_u $1, 0($16) # L : + +$end: + nop + nop + nop + ret $31,($26),1 # L0 : + + END(memset) diff --git a/sysdeps/alpha/alphaev6/stxcpy.S b/sysdeps/alpha/alphaev6/stxcpy.S new file mode 100644 index 0000000000..0df20438fc --- /dev/null +++ b/sysdeps/alpha/alphaev6/stxcpy.S @@ -0,0 +1,329 @@ +/* Copyright (C) 2000 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* Copy a null-terminated string from SRC to DST. + + This is an internal routine used by strcpy, stpcpy, and strcat. + As such, it uses special linkage conventions to make implementation + of these public functions more efficient. + + On input: + t9 = return address + a0 = DST + a1 = SRC + + On output: + t8 = bitmask (with one bit set) indicating the last byte written + a0 = unaligned address of the last *word* written + + Furthermore, v0, a3-a5, t11, and t12 are untouched. +*/ + + +#include <sysdep.h> + + .arch ev6 + .set noat + .set noreorder + .text + +/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that + doesn't like putting the entry point for a procedure somewhere in the + middle of the procedure descriptor. Work around this by putting the + aligned copy in its own procedure descriptor */ + + + .ent stxcpy_aligned + .align 4 +stxcpy_aligned: + .frame sp, 0, t9 + .prologue 0 + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == the first source word. */ + + /* Create the 1st output word and detect 0's in the 1st input word. 
*/ + lda t2, -1 # E : build a mask against false zero + mskqh t2, a1, t2 # U : detection in the src word (stall) + mskqh t1, a1, t3 # U : + ornot t1, t2, t2 # E : (stall) + + mskql t0, a1, t0 # U : assemble the first output word + cmpbge zero, t2, t8 # E : bits set iff null found + or t0, t3, t1 # E : (stall) + bne t8, $a_eos # U : (stall) + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == a source word not containing a null. */ + /* Nops here to separate store quads from load quads */ + +$a_loop: + stq_u t1, 0(a0) # L : + addq a0, 8, a0 # E : + nop + nop + + ldq_u t1, 0(a1) # L : Latency=3 + addq a1, 8, a1 # E : + cmpbge zero, t1, t8 # E : (3 cycle stall) + beq t8, $a_loop # U : (stall for t8) + + /* Take care of the final (partial) word store. + On entry to this basic block we have: + t1 == the source word containing the null + t8 == the cmpbge mask that found it. */ +$a_eos: + negq t8, t6 # E : find low bit set + and t8, t6, t10 # E : (stall) + /* For the sake of the cache, don't read a destination word + if we're not going to need it. */ + and t10, 0x80, t6 # E : (stall) + bne t6, 1f # U : (stall) + + /* We're doing a partial word store and so need to combine + our source and original destination words. */ + ldq_u t0, 0(a0) # L : Latency=3 + subq t10, 1, t6 # E : + zapnot t1, t6, t1 # U : clear src bytes >= null (stall) + or t10, t6, t8 # E : (stall) + + zap t0, t8, t0 # E : clear dst bytes <= null + or t0, t1, t1 # E : (stall) + nop + nop + +1: stq_u t1, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + .end stxcpy_aligned + + .align 4 + .ent __stxcpy + .globl __stxcpy +__stxcpy: + .frame sp, 0, t9 + .prologue 0 + + /* Are source and destination co-aligned? */ + xor a0, a1, t0 # E : + unop # E : + and t0, 7, t0 # E : (stall) + bne t0, $unaligned # U : (stall) + + /* We are co-aligned; take care of a partial first word. */ + ldq_u t1, 0(a1) # L : load first src word + and a0, 7, t0 # E : take care not to load a word ... + addq a1, 8, a1 # E : + beq t0, stxcpy_aligned # U : ... if we wont need it (stall) + + ldq_u t0, 0(a0) # L : + br stxcpy_aligned # L0 : Latency=3 + nop + nop + + +/* The source and destination are not co-aligned. Align the destination + and cope. We have to be very careful about not reading too much and + causing a SEGV. */ + + .align 4 +$u_head: + /* We know just enough now to be able to assemble the first + full source word. We can still find a zero at the end of it + that prevents us from outputting the whole thing. + + On entry to this basic block: + t0 == the first dest word, for masking back in, if needed else 0 + t1 == the low bits of the first source word + t6 == bytemask that is -1 in dest word bytes */ + + ldq_u t2, 8(a1) # L : + addq a1, 8, a1 # E : + extql t1, a1, t1 # U : (stall on a1) + extqh t2, a1, t4 # U : (stall on a1) + + mskql t0, a0, t0 # U : + or t1, t4, t1 # E : + mskqh t1, a0, t1 # U : (stall on t1) + or t0, t1, t1 # E : (stall on t1) + + or t1, t6, t6 # E : + cmpbge zero, t6, t8 # E : (stall) + lda t6, -1 # E : for masking just below + bne t8, $u_final # U : (stall) + + mskql t6, a1, t6 # U : mask out the bits we have + or t6, t2, t2 # E : already extracted before (stall) + cmpbge zero, t2, t8 # E : testing eos (stall) + bne t8, $u_late_head_exit # U : (stall) + + /* Finally, we've got all the stupid leading edge cases taken care + of and we can set up to enter the main loop. 
*/ + + stq_u t1, 0(a0) # L : store first output word + addq a0, 8, a0 # E : + extql t2, a1, t0 # U : position ho-bits of lo word + ldq_u t2, 8(a1) # U : read next high-order source word + + addq a1, 8, a1 # E : + cmpbge zero, t2, t8 # E : (stall for t2) + nop # E : + bne t8, $u_eos # U : (stall) + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned source words. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t0 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word + + We further know that t2 does not contain a null terminator. */ + + .align 3 +$u_loop: + extqh t2, a1, t1 # U : extract high bits for current word + addq a1, 8, a1 # E : (stall) + extql t2, a1, t3 # U : extract low bits for next time (stall) + addq a0, 8, a0 # E : + + or t0, t1, t1 # E : current dst word now complete + ldq_u t2, 0(a1) # L : Latency=3 load high word for next time + stq_u t1, -8(a0) # L : save the current word (stall) + mov t3, t0 # E : + + cmpbge zero, t2, t8 # E : test new word for eos + beq t8, $u_loop # U : (stall) + nop + nop + + /* We've found a zero somewhere in the source word we just read. + If it resides in the lower half, we have one (probably partial) + word to write out, and if it resides in the upper half, we + have one full and one partial word left to write out. + + On entry to this basic block: + t0 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word. */ +$u_eos: + extqh t2, a1, t1 # U : + or t0, t1, t1 # E : first (partial) source word complete (stall) + cmpbge zero, t1, t8 # E : is the null in this first bit? (stall) + bne t8, $u_final # U : (stall) + +$u_late_head_exit: + stq_u t1, 0(a0) # L : the null was in the high-order bits + addq a0, 8, a0 # E : + extql t2, a1, t1 # U : + cmpbge zero, t1, t8 # E : (stall) + + /* Take care of a final (probably partial) result word. + On entry to this basic block: + t1 == assembled source word + t8 == cmpbge mask that found the null. */ +$u_final: + negq t8, t6 # E : isolate low bit set + and t6, t8, t10 # E : (stall) + and t10, 0x80, t6 # E : avoid dest word load if we can (stall) + bne t6, 1f # U : (stall) + + ldq_u t0, 0(a0) # E : + subq t10, 1, t6 # E : + or t6, t10, t8 # E : (stall) + zapnot t1, t6, t1 # U : kill source bytes >= null (stall) + + zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall) + or t0, t1, t1 # E : (stall) + nop + nop + +1: stq_u t1, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + /* Unaligned copy entry point. */ + .align 4 +$unaligned: + + ldq_u t1, 0(a1) # L : load first source word + and a0, 7, t4 # E : find dest misalignment + and a1, 7, t5 # E : find src misalignment + /* Conditionally load the first destination word and a bytemask + with 0xff indicating that the destination byte is sacrosanct. */ + mov zero, t0 # E : + + mov zero, t6 # E : + beq t4, 1f # U : + ldq_u t0, 0(a0) # L : + lda t6, -1 # E : + + mskql t6, a0, t6 # U : + nop + nop + nop +1: + subq a1, t4, a1 # E : sub dest misalignment from src addr + /* If source misalignment is larger than dest misalignment, we need + extra startup checks to avoid SEGV. 
*/ + cmplt t4, t5, t10 # E : + beq t10, $u_head # U : + lda t2, -1 # E : mask out leading garbage in source + + mskqh t2, t5, t2 # U : + ornot t1, t2, t3 # E : (stall) + cmpbge zero, t3, t8 # E : is there a zero? (stall) + beq t8, $u_head # U : (stall) + + /* At this point we've found a zero in the first partial word of + the source. We need to isolate the valid source data and mask + it into the original destination data. (Incidentally, we know + that we'll need at least one byte of that original dest word.) */ + + ldq_u t0, 0(a0) # L : + negq t8, t6 # E : build bitmask of bytes <= zero + and t6, t8, t10 # E : (stall) + and a1, 7, t5 # E : + + subq t10, 1, t6 # E : + or t6, t10, t8 # E : (stall) + srl t10, t5, t10 # U : adjust final null return value + zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall) + + and t1, t2, t1 # E : to source validity mask + extql t2, a1, t2 # U : + extql t1, a1, t1 # U : (stall) + andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall) + + or t0, t1, t1 # e1 : and put it there + stq_u t1, 0(a0) # .. e0 : (stall) + ret (t9) # e1 : + nop + + .end __stxcpy + diff --git a/sysdeps/alpha/alphaev6/stxncpy.S b/sysdeps/alpha/alphaev6/stxncpy.S new file mode 100644 index 0000000000..140279106a --- /dev/null +++ b/sysdeps/alpha/alphaev6/stxncpy.S @@ -0,0 +1,405 @@ +/* Copyright (C) 2000 Free Software Foundation, Inc. + Contributed by Richard Henderson (rth@tamu.edu) + EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* Copy no more than COUNT bytes of the null-terminated string from + SRC to DST. + + This is an internal routine used by strncpy, stpncpy, and strncat. + As such, it uses special linkage conventions to make implementation + of these public functions more efficient. + + On input: + t9 = return address + a0 = DST + a1 = SRC + a2 = COUNT + + Furthermore, COUNT may not be zero. + + On output: + t0 = last word written + t8 = bitmask (with one bit set) indicating the last byte written + t10 = bitmask (with one bit set) indicating the byte position of + the end of the range specified by COUNT + a0 = unaligned address of the last *word* written + a2 = the number of full words left in COUNT + + Furthermore, v0, a3-a5, t11, and t12 are untouched. +*/ + +#include <sysdep.h> + + .arch ev6 + .set noat + .set noreorder + +/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that + doesn't like putting the entry point for a procedure somewhere in the + middle of the procedure descriptor. 
Work around this by putting the + aligned copy in its own procedure descriptor */ + + + .ent stxncpy_aligned + .align 4 +stxncpy_aligned: + .frame sp, 0, t9, 0 + .prologue 0 + + /* On entry to this basic block: + t0 == the first destination word for masking back in + t1 == the first source word. */ + + /* Create the 1st output word and detect 0's in the 1st input word. */ + lda t2, -1 # E : build a mask against false zero + mskqh t2, a1, t2 # U : detection in the src word (stall) + mskqh t1, a1, t3 # U : + ornot t1, t2, t2 # E : (stall) + + mskql t0, a1, t0 # U : assemble the first output word + cmpbge zero, t2, t7 # E : bits set iff null found + or t0, t3, t0 # E : (stall) + beq a2, $a_eoc # U : + + bne t7, $a_eos # U : + nop + nop + nop + + /* On entry to this basic block: + t0 == a source word not containing a null. */ + + /* + * nops here to: + * separate store quads from load quads + * limit of 1 bcond/quad to permit training + */ +$a_loop: + stq_u t0, 0(a0) # L : + addq a0, 8, a0 # E : + subq a2, 1, a2 # E : + nop + + ldq_u t0, 0(a1) # L : + addq a1, 8, a1 # E : + cmpbge zero, t0, t7 # E : + beq a2, $a_eoc # U : + + beq t7, $a_loop # U : + nop + nop + nop + + /* Take care of the final (partial) word store. At this point + the end-of-count bit is set in t7 iff it applies. + + On entry to this basic block we have: + t0 == the source word containing the null + t7 == the cmpbge mask that found it. */ + +$a_eos: + negq t7, t8 # E : find low bit set + and t7, t8, t8 # E : (stall) + /* For the sake of the cache, don't read a destination word + if we're not going to need it. */ + and t8, 0x80, t6 # E : (stall) + bne t6, 1f # U : (stall) + + /* We're doing a partial word store and so need to combine + our source and original destination words. */ + ldq_u t1, 0(a0) # L : + subq t8, 1, t6 # E : + or t8, t6, t7 # E : (stall) + zapnot t0, t7, t0 # U : clear src bytes > null (stall) + + zap t1, t7, t1 # .. e1 : clear dst bytes <= null + or t0, t1, t0 # e1 : (stall) + nop + nop + +1: stq_u t0, 0(a0) # L : + ret (t9) # L0 : Latency=3 + nop + nop + + /* Add the end-of-count bit to the eos detection bitmask. */ +$a_eoc: + or t10, t7, t7 # E : + br $a_eos # L0 : Latency=3 + nop + nop + + .end stxncpy_aligned + + .align 4 + .ent __stxncpy + .globl __stxncpy +__stxncpy: + .frame sp, 0, t9, 0 + .prologue 0 + + /* Are source and destination co-aligned? */ + xor a0, a1, t1 # E : + and a0, 7, t0 # E : find dest misalignment + and t1, 7, t1 # E : (stall) + addq a2, t0, a2 # E : bias count by dest misalignment (stall) + + subq a2, 1, a2 # E : + and a2, 7, t2 # E : (stall) + srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 (stall) + addq zero, 1, t10 # E : + + sll t10, t2, t10 # U : t10 = bitmask of last count byte + bne t1, $unaligned # U : + /* We are co-aligned; take care of a partial first word. */ + ldq_u t1, 0(a1) # L : load first src word + addq a1, 8, a1 # E : + + beq t0, stxncpy_aligned # U : avoid loading dest word if not needed + ldq_u t0, 0(a0) # L : + nop + nop + + br stxncpy_aligned # .. e1 : + nop + nop + nop + + + +/* The source and destination are not co-aligned. Align the destination + and cope. We have to be very careful about not reading too much and + causing a SEGV. */ + + .align 4 +$u_head: + /* We know just enough now to be able to assemble the first + full source word. We can still find a zero at the end of it + that prevents us from outputting the whole thing. 
+ + On entry to this basic block: + t0 == the first dest word, unmasked + t1 == the shifted low bits of the first source word + t6 == bytemask that is -1 in dest word bytes */ + + ldq_u t2, 8(a1) # L : Latency=3 load second src word + addq a1, 8, a1 # E : + mskql t0, a0, t0 # U : mask trailing garbage in dst + extqh t2, a1, t4 # U : (3 cycle stall on t2) + + or t1, t4, t1 # E : first aligned src word complete (stall) + mskqh t1, a0, t1 # U : mask leading garbage in src (stall) + or t0, t1, t0 # E : first output word complete (stall) + or t0, t6, t6 # E : mask original data for zero test (stall) + + cmpbge zero, t6, t7 # E : + beq a2, $u_eocfin # U : + nop + nop + + bne t7, $u_final # U : + lda t6, -1 # E : mask out the bits we have + mskql t6, a1, t6 # U : already seen (stall) + stq_u t0, 0(a0) # L : store first output word + + or t6, t2, t2 # E : + cmpbge zero, t2, t7 # E : find nulls in second partial (stall) + addq a0, 8, a0 # E : + subq a2, 1, a2 # E : + + bne t7, $u_late_head_exit # U : + /* Finally, we've got all the stupid leading edge cases taken care + of and we can set up to enter the main loop. */ + extql t2, a1, t1 # U : position hi-bits of lo word + ldq_u t2, 8(a1) # L : read next high-order source word + addq a1, 8, a1 # E : + + cmpbge zero, t2, t7 # E : (stall) + beq a2, $u_eoc # U : + nop + nop + + bne t7, $u_eos # e1 : + nop + nop + nop + + /* Unaligned copy main loop. In order to avoid reading too much, + the loop is structured to detect zeros in aligned source words. + This has, unfortunately, effectively pulled half of a loop + iteration out into the head and half into the tail, but it does + prevent nastiness from accumulating in the very thing we want + to run as fast as possible. + + On entry to this basic block: + t1 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word + + We further know that t2 does not contain a null terminator. */ + + .align 4 +$u_loop: + extqh t2, a1, t0 # U : extract high bits for current word + addq a1, 8, a1 # E : + extql t2, a1, t3 # U : extract low bits for next time + addq a0, 8, a0 # E : + + or t0, t1, t0 # E : current dst word now complete + ldq_u t2, 0(a1) # U : Latency=3 load high word for next time + stq_u t0, -8(a0) # U : save the current word (stall) + mov t3, t1 # E : + + subq a2, 1, a2 # E : + cmpbge zero, t2, t7 # E : test new word for eos (2 cycle stall for data) + beq a2, $u_eoc # U : (stall) + nop + + beq t7, $u_loop # U : + nop + nop + nop + + /* We've found a zero somewhere in the source word we just read. + If it resides in the lower half, we have one (probably partial) + word to write out, and if it resides in the upper half, we + have one full and one partial word left to write out. + + On entry to this basic block: + t1 == the shifted high-order bits from the previous source word + t2 == the unshifted current source word. */ +$u_eos: + extqh t2, a1, t0 # U : + or t0, t1, t0 # E : first (partial) source word complete (stall) + cmpbge zero, t0, t7 # E : is the null in this first bit? (stall) + bne t7, $u_final # U : (stall) + + stq_u t0, 0(a0) # L : the null was in the high-order bits + addq a0, 8, a0 # E : + subq a2, 1, a2 # E : + nop + +$u_late_head_exit: + extql t2, a1, t0 # U : + cmpbge zero, t0, t7 # E : + or t7, t10, t6 # E : (stall) + cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall) + + /* Take care of a final (probably partial) result word. + On entry to this basic block: + t0 == assembled source word + t7 == cmpbge mask that found the null. 
*/ +$u_final: + negq t7, t6 # E : isolate low bit set + and t6, t7, t8 # E : (stall) + and t8, 0x80, t6 # E : avoid dest word load if we can (stall) + bne t6, 1f # U : (stall) + + ldq_u t1, 0(a0) # L : + subq t8, 1, t6 # E : + or t6, t8, t7 # E : (stall) + zapnot t0, t7, t0 # U : kill source bytes > null + + zap t1, t7, t1 # U : kill dest bytes <= null + or t0, t1, t0 # E : (stall) + nop + nop + +1: stq_u t0, 0(a0) # L : + ret (t9) # L0 : Latency=3 + +$u_eoc: # end-of-count + extqh t2, a1, t0 # U : + or t0, t1, t0 # E : (stall) + cmpbge zero, t0, t7 # E : (stall) + nop + +$u_eocfin: # end-of-count, final word + or t10, t7, t7 # E : + br $u_final # L0 : Latency=3 + nop + nop + + /* Unaligned copy entry point. */ + .align 4 +$unaligned: + + ldq_u t1, 0(a1) # L : load first source word + and a0, 7, t4 # E : find dest misalignment + and a1, 7, t5 # E : find src misalignment + /* Conditionally load the first destination word and a bytemask + with 0xff indicating that the destination byte is sacrosanct. */ + mov zero, t0 # E : + + mov zero, t6 # E : + beq t4, 1f # U : + ldq_u t0, 0(a0) # L : + lda t6, -1 # E : + + mskql t6, a0, t6 # U : + nop + nop + nop +1: + subq a1, t4, a1 # E : sub dest misalignment from src addr + + /* If source misalignment is larger than dest misalignment, we need + extra startup checks to avoid SEGV. */ + + cmplt t4, t5, t8 # E : + extql t1, a1, t1 # U : shift src into place + lda t2, -1 # E : for creating masks later + beq t8, $u_head # U : (stall) + + mskqh t2, t5, t2 # U : begin src byte validity mask + cmpbge zero, t1, t7 # E : is there a zero? + extql t2, a1, t2 # U : + or t7, t10, t5 # E : test for end-of-count too + + cmpbge zero, t2, t3 # E : + cmoveq a2, t5, t7 # E : Latency=2, extra map slot + nop # E : keep with cmoveq + andnot t7, t3, t7 # E : (stall) + + beq t7, $u_head # U : + /* At this point we've found a zero in the first partial word of + the source. We need to isolate the valid source data and mask + it into the original destination data. (Incidentally, we know + that we'll need at least one byte of that original dest word.) */ + ldq_u t0, 0(a0) # L : + negq t7, t6 # E : build bitmask of bytes <= zero + mskqh t1, t4, t1 # U : + + and t6, t7, t8 # E : + subq t8, 1, t6 # E : (stall) + or t6, t8, t7 # E : (stall) + zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall) + + zapnot t1, t7, t1 # U : to source validity mask + andnot t0, t2, t0 # E : zero place for source to reside + or t0, t1, t0 # E : and put it there (stall both t0, t1) + stq_u t0, 0(a0) # L : (stall) + + ret (t9) # L0 : Latency=3 + nop + nop + nop + + .end __stxncpy + |
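Both string primitives above reduce to the same core idea: copy whole aligned words until cmpbge zero, x reports a zero byte, then mask the final partial word. A C sketch of that core (illustration only, not part of the commit; it assumes, unlike the real routines, that both pointers are already 8-byte aligned, and it relies on Alpha's little-endian byte order):

#include <stdint.h>

/* cmpbge zero, x sets bit i exactly when byte i of x is zero.  */
static unsigned
cmpbge_zero (uint64_t x)
{
  unsigned mask = 0;
  for (int i = 0; i < 8; i++)
    if (((x >> (i * 8)) & 0xff) == 0)
      mask |= 1u << i;
  return mask;
}

/* Copy whole words until one contains a null, then store only the
   bytes up to and including the terminator.  The assembly does the
   same thing, using mskql/zapnot in place of the byte loop.  */
static void
stxcpy_core_sketch (uint64_t *dst, const uint64_t *src)
{
  uint64_t w;
  unsigned zmask;

  for (;;)
    {
      w = *src++;
      zmask = cmpbge_zero (w);
      if (zmask != 0)
        break;                      /* this word holds the null */
      *dst++ = w;
    }

  unsigned char *d = (unsigned char *) dst;
  int i;
  for (i = 0; (zmask & (1u << i)) == 0; i++)
    d[i] = (unsigned char) (w >> (i * 8));
  d[i] = '\0';                      /* the terminator itself */
}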