diff options
author | wilson <wilson@138bc75d-0d04-0410-961f-82ee72b054a4> | 2000-03-09 00:26:04 +0000 |
---|---|---|
committer | wilson <wilson@138bc75d-0d04-0410-961f-82ee72b054a4> | 2000-03-09 00:26:04 +0000 |
commit | ac445222719fb55c55fa5838c01217869e92b024 (patch) | |
tree | 4497d3e80c79da1bcf1a62991162605a8eb6a303 /gcc/config/ia64/lib1funcs.asm | |
parent | 07c967f908bcefff491dd9200630d4428446c332 (diff) | |
download | gcc-ac445222719fb55c55fa5838c01217869e92b024.tar.gz |
Add ia64 port.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@32438 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/ia64/lib1funcs.asm')
-rw-r--r-- | gcc/config/ia64/lib1funcs.asm | 635 |
1 files changed, 635 insertions, 0 deletions
// gcc/config/ia64/lib1funcs.asm
//
// Reconstructed from the flattened patch text
//   diff --git a/gcc/config/ia64/lib1funcs.asm b/gcc/config/ia64/lib1funcs.asm
//   new file mode 100644, index 00000000000..d8af8dbd83c
// with the leading '+' diff markers removed and one statement per line.
//
// Each routine is wrapped in its own #ifdef L<name> guard, so the file must
// be run through the C preprocessor before it is assembled.

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// The quotient is returned in fret0.
//
// All refinement steps are predicated on p6, which frcpa sets.  When p6
// is clear they are skipped and f10 is returned exactly as frcpa left it.

        .text
        .align 16
        .global __divdf3
        .proc __divdf3
__divdf3:
        // f10 = initial reciprocal approximation.
        frcpa f10, p6 = farg0, farg1
        ;;
        // f11 = dividend * f10 (quotient estimate).
        // f12 = 1 - divisor * f10 (error term).
(p6)    fma.s1 f11 = farg0, f10, f0
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
        // Refine the quotient (f11) and the reciprocal (f10) with the error
        // term, squaring the error term each round (f12 -> f13 -> f12).
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f13 = f12, f12, f0
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fma.s1 f12 = f13, f13, f0
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // f8 = dividend - divisor * f11 (remainder of the estimate).
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
        // Final correction: f10 = f11 + f8 * f10.
(p6)    fma.d f10 = f8, f10, f11
        ;;
        mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// The quotient is returned in fret0.
//
// All refinement steps are predicated on p6, which frcpa sets.  When p6
// is clear they are skipped and f10 is returned exactly as frcpa left it.

        .text
        .align 16
        .global __divsf3
        .proc __divsf3
__divsf3:
        // f10 = initial reciprocal approximation.
        frcpa f10, p6 = farg0, farg1
        ;;
        // f8 = dividend * f10 (quotient estimate).
        // f9 = 1 - divisor * f10 (error term).
(p6)    fma.s1 f8 = farg0, f10, f0
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
        // Refine the quotient, squaring the error term each round.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f0
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f0
        ;;
(p6)    fma.d.s1 f8 = f9, f8, f8
        ;;
        // f10 = f8 * 1.0 + 0.0, computed with the .s completer.
(p6)    fma.s f10 = f8, f1, f0
        ;;
        mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The quotient is returned in ret0.

        .text
        .align 16
        .global __divdi3
        .proc __divdi3
__divdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.  (farg0/farg1 are f8/f9.)
(p6)    fma.s1 f11 = farg0, f10, f0
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f13 = f12, f12, f0
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fma.s1 f12 = f13, f13, f0
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // f8 = dividend - divisor * f11, then f10 = f11 + f8 * f10.
(p6)    fnma.s1 f8 = f9, f11, f8
        ;;
(p6)    fma f10 = f8, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc f8 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The remainder is returned in ret0.

        .text
        .align 16
        .global __moddi3
        .proc __moddi3
__moddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.  (farg0/farg1 are f8/f9.)
(p6)    fma.s1 f11 = farg0, f10, f0
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f13 = f12, f12, f0
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fma.s1 f12 = f13, f13, f0
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // The correction term goes to f12 here, not f8, because the
        // dividend in f8 is still needed for the remainder below.
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc f10 = f10
        ;;
        // Renormalize.
        fcvt.xf f10 = f10
        ;;
        // Compute remainder.
        fnma f8 = f10, f9, f8
        ;;
        // Round remainder to an integer.
        fcvt.fx.trunc f8 = f8
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The quotient is returned in ret0.

        .text
        .align 16
        .global __udivdi3
        .proc __udivdi3
__udivdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, to avoid FP software-assist faults.
        fcvt.xuf f8 = f8
        fcvt.xuf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.  (farg0/farg1 are f8/f9.)
(p6)    fma.s1 f11 = farg0, f10, f0
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f13 = f12, f12, f0
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fma.s1 f12 = f13, f13, f0
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // f8 = dividend - divisor * f11, then f10 = f11 + f8 * f10.
(p6)    fnma.s1 f8 = f9, f11, f8
        ;;
(p6)    fma f10 = f8, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc f8 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
// to get more than the 64 bits of precision that we need for DImode.
//
// Must use max precision for the reciprocal computations to get 64 bits of
// precision.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the reciprocal approximation.  f11 holds the quotient
// approximation.  f12 and f13 are temporaries.
// The remainder is returned in ret0.

        .text
        .align 16
        .global __umoddi3
        .proc __umoddi3
__umoddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        // Convert the inputs to FP, to avoid FP software assist faults.
        fcvt.xuf f8 = f8
        fcvt.xuf f9 = f9
        ;;
        // Compute the reciprocal approximation.
        frcpa f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.  (farg0/farg1 are f8/f9.)
(p6)    fma.s1 f11 = farg0, f10, f0
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f13 = f12, f12, f0
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fma.s1 f12 = f13, f13, f0
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // The correction term goes to f12 here, not f8, because the
        // dividend in f8 is still needed for the remainder below.
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc f10 = f10
        ;;
        // Renormalize.
        fcvt.xuf f10 = f10
        ;;
        // Compute remainder.
        fnma f8 = f10, f9, f8
        ;;
        // Round remainder to an integer.
        fcvt.fxu.trunc f8 = f8
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check whether the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
// f12 is a temporary.
// The quotient is returned in ret0.
//
// Unlike the DImode routines, nothing here is predicated on the p6 that
// frcpa sets.

        .text
        .align 16
        .global __divsi3
        .proc __divsi3
__divsi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers and convert them as signed values.
        setf.sig f8 = in0
        setf.sig f9 = in1
        ;;
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // f11 = reciprocal approximation, f10 = 1.0 + 1.0.
        frcpa f11, p6 = f8, f9
        fadd f10 = f1, f1
        ;;
        // Two rounds of f11 = f11 * (2.0 - divisor * f11).
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        // Quotient = dividend * reciprocal, truncated to an integer.
        fmpy f8 = f8, f11
        ;;
        fcvt.fx.trunc f8 = f8
        ;;
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check whether the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0 until it is reused for the quotient.
// f11 holds the reciprocal approximation.  f12 is a temporary.
// The remainder is returned in ret0.
//
// Unlike the DImode routines, nothing here is predicated on the p6 that
// frcpa sets.

        .text
        .align 16
        .global __modsi3
        .proc __modsi3
__modsi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers and convert them as signed values.
        setf.sig f8 = r32
        setf.sig f9 = r33
        ;;
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // f11 = reciprocal approximation, f10 = 1.0 + 1.0.
        frcpa f11, p6 = f8, f9
        fadd f10 = f1, f1
        ;;
        // Two rounds of f11 = f11 * (2.0 - divisor * f11).
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        // f10 = truncated quotient, converted back to FP.
        fmpy f10 = f8, f11
        ;;
        fcvt.fx.trunc f10 = f10
        ;;
        fcvt.xf f10 = f10
        ;;
        // Remainder = dividend - quotient * divisor.
        fnma f8 = f10, f9, f8
        ;;
        fcvt.fx f8 = f8
        ;;
        // Return the remainder in ret0, as every other integer routine in
        // this file does.  This used to write r32, an input register, and
        // so never set the return register.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check whether the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
// f12 is a temporary.
// The quotient is returned in ret0.
//
// This is the same as divsi3, except that we don't need fcvt instructions
// before the frcpa.
// NOTE(review): the unsigned DImode routines above do convert with fcvt.xuf
// "to avoid FP software-assist faults"; confirm whether that also applies
// here before this routine is put to use.

        .text
        .align 16
        .global __udivsi3
        .proc __udivsi3
__udivsi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = r32
        setf.sig f9 = r33
        ;;
        // f11 = reciprocal approximation, f10 = 1.0 + 1.0.
        frcpa f11, p6 = f8, f9
        fadd f10 = f1, f1
        ;;
        // Two rounds of f11 = f11 * (2.0 - divisor * f11).
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        // Quotient = dividend * reciprocal, truncated to an unsigned integer.
        fmpy f8 = f8, f11
        ;;
        fcvt.fxu.trunc f8 = f8
        ;;
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// Use reciprocal approximation and Newton-Raphson iteration to compute the
// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
// to get more than the 32 bits of precision that we need for SImode.
//
// ??? This is currently not used.  It needs to be fixed to be more like the
// above DImode routines.
//
// ??? Check whether the error is less than .5ulp.  We may need
// some adjustment code to get precise enough results.
//
// ??? Should probably use max precision for the reciprocal computations.
//
// r32/f8 holds the dividend.  r33/f9 holds the divisor.
// f10 holds the value 2.0 until it is reused for the quotient.
// f11 holds the reciprocal approximation.  f12 is a temporary.
// The remainder is returned in ret0.
//
// This is the same as modsi3, except that we don't need fcvt instructions
// before the frcpa.
// NOTE(review): the unsigned DImode routines above do convert with fcvt.xuf
// "to avoid FP software-assist faults"; confirm whether that also applies
// here before this routine is put to use.

        .text
        .align 16
        .global __umodsi3
        .proc __umodsi3
__umodsi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = r32
        setf.sig f9 = r33
        ;;
        // f11 = reciprocal approximation, f10 = 1.0 + 1.0.
        frcpa f11, p6 = f8, f9
        fadd f10 = f1, f1
        ;;
        // Two rounds of f11 = f11 * (2.0 - divisor * f11).
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        fnma f12 = f9, f11, f10
        ;;
        fmpy f11 = f11, f12
        ;;
        // f10 = truncated quotient, converted back to FP.
        fmpy f10 = f8, f11
        ;;
        fcvt.fxu.trunc f10 = f10
        ;;
        fcvt.xuf f10 = f10
        ;;
        // Remainder = dividend - quotient * divisor.
        fnma f8 = f10, f9, f8
        ;;
        fcvt.fxu f8 = f8
        ;;
        // Return the remainder in ret0, as every other integer routine in
        // this file does.  This used to write r32, an input register, and
        // so never set the return register.
        getf.sig ret0 = f8
        br.ret.sptk rp
        ;;
        .endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Layout written to save_area, as four 8-byte slots:
//   +0   stack_pointer (in1)
//   +8   ar.bsp
//   +16  ar.rnat
//   +24  ar.pfs as returned by the alloc at entry

        .text
        .align 16
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        alloc r18=ar.pfs,2,0,0,0
        // Slot +0; in0 is post-incremented to save_area+8.
        st8 [in0]=in1,8
        mov r19=ar.rsc
        ;;
        flushrs
        // Keep only bits 2..4 of ar.rsc, i.e. clear its low two bits.
        and r19=0x1c,r19
        mov ar.pfs=r18
        ;;
        mov ar.rsc=r19
        mov r16=ar.bsp
        // in0 is save_area+8 here, so r2 = save_area+24.
        adds r2=16,in0
        ;;
        mov r17=ar.rnat
        // Slot +8; in0 is post-incremented to save_area+16.
        st8 [in0]=r16,8
        // Set the low two bits of ar.rsc again.
        or r19=0x3,r19
        ;;
        // Slot +16.
        st8 [in0]=r17
        mov ar.rsc=r19
        // Slot +24.
        st8 [r2]=r18
        mov ar.pfs=r18
        br.ret.sptk.few rp
        ;;
        .endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
//                           void *static_chain);
//
// Reloads r12, ar.bspstore, ar.rnat and ar.pfs from save_area slots
// +0, +8, +16 and +24, which is the layout that
// __ia64_save_stack_nonlocal writes.  It then returns through rp, which
// has been set to target_label, with fp copied to r7 and static_chain
// copied to r15.

        .text
        .align 16
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        alloc r20=ar.pfs,4,0,0,0
        mov r19=ar.rsc
        // r2 = save_area+8.
        adds r2=8,in2
        // r12 = slot +0; in2 is post-incremented to save_area+16.
        ld8 r12=[in2],16
        // rp = target_label (r33 is in1).
        mov.ret.sptk.few.dc.dc rp = r33, .L0
// ??? flushrs must be first instruction of a group.  Gas is unfortunately
// putting the stop bit before the padding nop instead of after it, making
// flushrs the first instruction of its bundle, but the second instruction
// of its group.  We explicitly add the nop to avoid this problem.
        nop.i 0
        ;;
        flushrs
        // r16 = slot +8; r2 is post-incremented to save_area+24.
        ld8 r16=[r2],16
        // Keep only bits 2..4 of ar.rsc, i.e. clear its low two bits.
        and r19=0x1c,r19
        // r17 = slot +16.
        ld8 r17=[in2]
        ;;
        // r18 = slot +24.
        ld8 r18=[r2]
        mov ar.rsc=r19
        ;;
        mov ar.bspstore=r16
        ;;
        mov ar.rnat=r17
        mov ar.pfs=r18
        // Set the low two bits of ar.rsc again; written back at .L0.
        or r19=0x3,r19
        ;;
        loadrs
        invala
        // r7 = fp (r32 is in0).
        mov r7=r32
.L0:    {
        mov ar.rsc=r19
        // r15 = static_chain (r35 is in3).
        mov r15=r35
        br.ret.sptk.few rp
        }
        ;;
        .endp __ia64_nonlocal_goto
#endif