summaryrefslogtreecommitdiff
path: root/gcc/config/ia64/lib1funcs.asm
diff options
context:
space:
mode:
authorwilson <wilson@138bc75d-0d04-0410-961f-82ee72b054a4>2000-03-09 00:26:04 +0000
committerwilson <wilson@138bc75d-0d04-0410-961f-82ee72b054a4>2000-03-09 00:26:04 +0000
commitac445222719fb55c55fa5838c01217869e92b024 (patch)
tree4497d3e80c79da1bcf1a62991162605a8eb6a303 /gcc/config/ia64/lib1funcs.asm
parent07c967f908bcefff491dd9200630d4428446c332 (diff)
downloadgcc-ac445222719fb55c55fa5838c01217869e92b024.tar.gz
Add ia64 port.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@32438 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/ia64/lib1funcs.asm')
-rw-r--r--gcc/config/ia64/lib1funcs.asm635
1 files changed, 635 insertions, 0 deletions
diff --git a/gcc/config/ia64/lib1funcs.asm b/gcc/config/ia64/lib1funcs.asm
new file mode 100644
index 00000000000..d8af8dbd83c
--- /dev/null
+++ b/gcc/config/ia64/lib1funcs.asm
@@ -0,0 +1,635 @@
+#ifdef L__divdf3
+// Compute a 64-bit IEEE double quotient and return it in fret0.
+// farg0 holds the dividend (a). farg1 holds the divisor (b).
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative. f10 is the reciprocal estimate (y), f11 the quotient estimate
+// (q), f12/f13 are error terms (e); every refinement step is guarded by p6.
+
+ .text
+ .align 16
+ .global __divdf3
+ .proc __divdf3
+__divdf3:
+ frcpa f10, p6 = farg0, farg1 // y0 = reciprocal approximation; p6 guards the refinement
+ ;;
+(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
+(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
+(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
+ ;;
+(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
+(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
+(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
+ ;;
+(p6) fma.d.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2, in double precision
+(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
+ ;;
+(p6) fnma.d.s1 f8 = farg1, f11, farg0 // r = a - b * q3 (f8 is farg0; a is dead after this)
+ ;;
+(p6) fma.d f10 = f8, f10, f11 // final quotient = q3 + r * y3
+ ;;
+ mov fret0 = f10 // with p6 clear this is the unrefined frcpa result
+ br.ret.sptk rp
+ ;;
+ .endp __divdf3
+#endif
+
+#ifdef L__divsf3
+// Compute a 32-bit IEEE float quotient and return it in fret0.
+// farg0 (f8) holds the dividend (a). farg1 (f9) holds the divisor (b).
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative. f10 is the reciprocal estimate; f8 is reused for the quotient
+// estimate (q) and f9 for the error term (e), under predicate p6.
+
+ .text
+ .align 16
+ .global __divsf3
+ .proc __divsf3
+__divsf3:
+ frcpa f10, p6 = farg0, farg1 // y0 = reciprocal approximation; p6 guards the refinement
+ ;;
+(p6) fma.s1 f8 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f9 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f8 = f9, f8, f8 // q1 = q0 + e0 * q0
+(p6) fma.s1 f9 = f9, f9, f0 // e1 = e0 * e0
+ ;;
+(p6) fma.s1 f8 = f9, f8, f8 // q2 = q1 + e1 * q1
+(p6) fma.s1 f9 = f9, f9, f0 // e2 = e1 * e1
+ ;;
+(p6) fma.d.s1 f8 = f9, f8, f8 // q3 = q2 + e2 * q2, in double precision
+ ;;
+(p6) fma.s f10 = f8, f1, f0 // f10 = q3 * 1.0 + 0.0, rounded to single precision
+ ;;
+ mov fret0 = f10 // with p6 clear this is the unrefined frcpa result
+ br.ret.sptk rp
+ ;;
+ .endp __divsf3
+#endif
+
+#ifdef L__divdi3
+// Compute a 64-bit integer quotient.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+// to get more than the 64 bits of precision that we need for DImode.
+//
+// Must use max precision for the reciprocal computations to get 64 bits of
+// precision.
+//
+// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
+// f10 holds the reciprocal approximation (y). f11 holds the quotient
+// approximation (q). f12 and f13 are temporaries holding error terms (e).
+
+ .text
+ .align 16
+ .global __divdi3
+ .proc __divdi3
+__divdi3:
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, so that they won't be treated as unsigned.
+ fcvt.xf f8 = f8
+ fcvt.xf f9 = f9
+ ;;
+ // Compute the reciprocal approximation.
+ frcpa f10, p6 = f8, f9
+ ;;
+ // 3 Newton-Raphson iterations. farg0/farg1 are the same registers as f8/f9.
+(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
+(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
+(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
+ ;;
+(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
+(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
+(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
+(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
+ ;;
+(p6) fnma.s1 f8 = f9, f11, f8 // r = a - b * q3 (overwrites the dividend)
+ ;;
+(p6) fma f10 = f8, f10, f11 // final quotient = q3 + r * y3
+ ;;
+ // Truncate quotient to an integer.
+ fcvt.fx.trunc f8 = f10
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __divdi3
+#endif
+
+#ifdef L__moddi3
+// Compute a 64-bit integer modulus.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+// to get more than the 64 bits of precision that we need for DImode.
+//
+// Must use max precision for the reciprocal computations to get 64 bits of
+// precision.
+//
+// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
+// f10 holds the reciprocal approximation (y), then the quotient. f11 holds
+// the quotient approximation (q). f12 and f13 are temporaries.
+
+ .text
+ .align 16
+ .global __moddi3
+ .proc __moddi3
+__moddi3:
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, so that they won't be treated as unsigned.
+ fcvt.xf f8 = f8
+ fcvt.xf f9 = f9
+ ;;
+ // Compute the reciprocal approximation.
+ frcpa f10, p6 = f8, f9
+ ;;
+ // 3 Newton-Raphson iterations. farg0/farg1 are the same registers as f8/f9.
+(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
+(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
+(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
+ ;;
+(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
+(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
+(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
+(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
+ ;;
+(p6) fnma.s1 f12 = f9, f11, f8 // r = a - b * q3 (kept in f12; f8 is still needed)
+ ;;
+(p6) fma f10 = f12, f10, f11 // final quotient = q3 + r * y3
+ ;;
+ // Truncate quotient to an integer.
+ fcvt.fx.trunc f10 = f10
+ ;;
+ // Renormalize: convert the integer quotient back to FP.
+ fcvt.xf f10 = f10
+ ;;
+ // Compute remainder: a - quotient * b.
+ fnma f8 = f10, f9, f8
+ ;;
+ // Convert remainder to an integer.
+ fcvt.fx.trunc f8 = f8
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __moddi3
+#endif
+
+#ifdef L__udivdi3
+// Compute a 64-bit unsigned integer quotient.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+// to get more than the 64 bits of precision that we need for DImode.
+//
+// Must use max precision for the reciprocal computations to get 64 bits of
+// precision.
+//
+// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
+// f10 holds the reciprocal approximation (y). f11 holds the quotient
+// approximation (q). f12 and f13 are temporaries holding error terms (e).
+
+ .text
+ .align 16
+ .global __udivdi3
+ .proc __udivdi3
+__udivdi3:
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ fcvt.xuf f8 = f8
+ fcvt.xuf f9 = f9
+ ;;
+ // Compute the reciprocal approximation.
+ frcpa f10, p6 = f8, f9
+ ;;
+ // 3 Newton-Raphson iterations. farg0/farg1 are the same registers as f8/f9.
+(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
+(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
+(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
+ ;;
+(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
+(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
+(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
+(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
+ ;;
+(p6) fnma.s1 f8 = f9, f11, f8 // r = a - b * q3 (overwrites the dividend)
+ ;;
+(p6) fma f10 = f8, f10, f11 // final quotient = q3 + r * y3
+ ;;
+ // Truncate quotient to an unsigned integer.
+ fcvt.fxu.trunc f8 = f10
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __udivdi3
+#endif
+
+#ifdef L__umoddi3
+// Compute a 64-bit unsigned integer modulus.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
+// to get more than the 64 bits of precision that we need for DImode.
+//
+// Must use max precision for the reciprocal computations to get 64 bits of
+// precision.
+//
+// r32/f8 holds the dividend (a). r33/f9 holds the divisor (b).
+// f10 holds the reciprocal approximation (y), then the quotient. f11 holds
+// the quotient approximation (q). f12 and f13 are temporaries.
+
+ .text
+ .align 16
+ .global __umoddi3
+ .proc __umoddi3
+__umoddi3:
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, to avoid FP software assist faults.
+ fcvt.xuf f8 = f8
+ fcvt.xuf f9 = f9
+ ;;
+ // Compute the reciprocal approximation.
+ frcpa f10, p6 = f8, f9
+ ;;
+ // 3 Newton-Raphson iterations. farg0/farg1 are the same registers as f8/f9.
+(p6) fma.s1 f11 = farg0, f10, f0 // q0 = a * y0
+(p6) fnma.s1 f12 = farg1, f10, f1 // e0 = 1 - b * y0
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q1 = q0 + e0 * q0
+(p6) fma.s1 f13 = f12, f12, f0 // e1 = e0 * e0
+(p6) fma.s1 f10 = f12, f10, f10 // y1 = y0 + e0 * y0
+ ;;
+(p6) fma.s1 f11 = f13, f11, f11 // q2 = q1 + e1 * q1
+(p6) fma.s1 f12 = f13, f13, f0 // e2 = e1 * e1
+(p6) fma.s1 f10 = f13, f10, f10 // y2 = y1 + e1 * y1
+ ;;
+(p6) fma.s1 f11 = f12, f11, f11 // q3 = q2 + e2 * q2
+(p6) fma.s1 f10 = f12, f10, f10 // y3 = y2 + e2 * y2
+ ;;
+(p6) fnma.s1 f12 = f9, f11, f8 // r = a - b * q3 (kept in f12; f8 is still needed)
+ ;;
+(p6) fma f10 = f12, f10, f11 // final quotient = q3 + r * y3
+ ;;
+ // Truncate quotient to an unsigned integer.
+ fcvt.fxu.trunc f10 = f10
+ ;;
+ // Renormalize: convert the integer quotient back to FP.
+ fcvt.xuf f10 = f10
+ ;;
+ // Compute remainder: a - quotient * b.
+ fnma f8 = f10, f9, f8
+ ;;
+ // Convert remainder to an unsigned integer.
+ fcvt.fxu.trunc f8 = f8
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __umoddi3
+#endif
+
+#ifdef L__divsi3
+// Compute a 32-bit integer quotient.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
+// to get more than the 32 bits of precision that we need for SImode.
+//
+// ??? This is currently not used. It needs to be fixed to be more like the
+// above DImode routines.
+//
+// ??? Check whether the error is less than .5ulp. We may need
+// some adjustment code to get precise enough results.
+//
+// ??? Should probably use max precision for the reciprocal computations.
+//
+// r32/f8 holds the dividend. r33/f9 holds the divisor.
+// f10 holds the value 2.0. f11 holds the reciprocal approximation.
+// f12 is a temporary.
+//
+// The upper 32 bits of the incoming registers are not assumed to hold a
+// sign extension of the SImode values, so both inputs are sign-extended
+// before being moved to the FP registers.
+
+ .text
+ .align 16
+ .global __divsi3
+ .proc __divsi3
+__divsi3:
+ .regstk 2,0,0,0
+ // Sign-extend the 32-bit inputs to 64 bits.
+ sxt4 in0 = in0
+ sxt4 in1 = in1
+ ;;
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, so that they are treated as signed.
+ fcvt.xf f8 = f8
+ fcvt.xf f9 = f9
+ ;;
+ // Compute the reciprocal approximation and the constant 2.0.
+ frcpa f11, p6 = f8, f9
+ fadd f10 = f1, f1
+ ;;
+ // 2 Newton-Raphson iterations: y = y * (2 - divisor * y).
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ // Quotient = dividend * reciprocal.
+ fmpy f8 = f8, f11
+ ;;
+ // Truncate quotient to an integer.
+ fcvt.fx.trunc f8 = f8
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __divsi3
+#endif
+
+#ifdef L__modsi3
+// Compute a 32-bit integer modulus.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
+// to get more than the 32 bits of precision that we need for SImode.
+//
+// ??? This is currently not used. It needs to be fixed to be more like the
+// above DImode routines.
+//
+// ??? Check whether the error is less than .5ulp. We may need
+// some adjustment code to get precise enough results.
+//
+// ??? Should probably use max precision for the reciprocal computations.
+//
+// r32/f8 holds the dividend. r33/f9 holds the divisor.
+// f10 holds the value 2.0, then the quotient. f11 holds the reciprocal
+// approximation. f12 is a temporary.
+//
+// The upper 32 bits of the incoming registers are not assumed to hold a
+// sign extension of the SImode values, so both inputs are sign-extended
+// before being moved to the FP registers. The result is returned in ret0.
+
+ .text
+ .align 16
+ .global __modsi3
+ .proc __modsi3
+__modsi3:
+ .regstk 2,0,0,0
+ // Sign-extend the 32-bit inputs to 64 bits.
+ sxt4 in0 = in0
+ sxt4 in1 = in1
+ ;;
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, so that they are treated as signed.
+ fcvt.xf f8 = f8
+ fcvt.xf f9 = f9
+ ;;
+ // Compute the reciprocal approximation and the constant 2.0.
+ frcpa f11, p6 = f8, f9
+ fadd f10 = f1, f1
+ ;;
+ // 2 Newton-Raphson iterations: y = y * (2 - divisor * y).
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ // Quotient = dividend * reciprocal.
+ fmpy f10 = f8, f11
+ ;;
+ // Truncate quotient to an integer.
+ fcvt.fx.trunc f10 = f10
+ ;;
+ // Renormalize: convert the integer quotient back to FP.
+ fcvt.xf f10 = f10
+ ;;
+ // Compute remainder: dividend - quotient * divisor.
+ fnma f8 = f10, f9, f8
+ ;;
+ // Convert remainder to an integer.
+ fcvt.fx f8 = f8
+ ;;
+ // Transfer result to the return register (ret0, not the input r32).
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __modsi3
+#endif
+
+#ifdef L__udivsi3
+// Compute a 32-bit unsigned integer quotient.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
+// to get more than the 32 bits of precision that we need for SImode.
+//
+// ??? This is currently not used. It needs to be fixed to be more like the
+// above DImode routines.
+//
+// ??? Check whether the error is less than .5ulp. We may need
+// some adjustment code to get precise enough results.
+//
+// ??? Should probably use max precision for the reciprocal computations.
+//
+// r32/f8 holds the dividend. r33/f9 holds the divisor.
+// f10 holds the value 2.0. f11 holds the reciprocal approximation.
+// f12 is a temporary.
+//
+// This is the same as divsi3, except that the inputs are zero-extended and
+// converted with the unsigned fcvt forms. The upper 32 bits of the incoming
+// registers are not assumed to be zero.
+
+ .text
+ .align 16
+ .global __udivsi3
+ .proc __udivsi3
+__udivsi3:
+ .regstk 2,0,0,0
+ // Zero-extend the 32-bit inputs to 64 bits.
+ zxt4 in0 = in0
+ zxt4 in1 = in1
+ ;;
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ fcvt.xuf f8 = f8
+ fcvt.xuf f9 = f9
+ ;;
+ // Compute the reciprocal approximation and the constant 2.0.
+ frcpa f11, p6 = f8, f9
+ fadd f10 = f1, f1
+ ;;
+ // 2 Newton-Raphson iterations: y = y * (2 - divisor * y).
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ // Quotient = dividend * reciprocal.
+ fmpy f8 = f8, f11
+ ;;
+ // Truncate quotient to an unsigned integer.
+ fcvt.fxu.trunc f8 = f8
+ ;;
+ // Transfer result to GP registers.
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __udivsi3
+#endif
+
+#ifdef L__umodsi3
+// Compute a 32-bit unsigned integer modulus.
+//
+// Use reciprocal approximation and Newton-Raphson iteration to compute the
+// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
+// to get more than the 32 bits of precision that we need for SImode.
+//
+// ??? This is currently not used. It needs to be fixed to be more like the
+// above DImode routines.
+//
+// ??? Check whether the error is less than .5ulp. We may need
+// some adjustment code to get precise enough results.
+//
+// ??? Should probably use max precision for the reciprocal computations.
+//
+// r32/f8 holds the dividend. r33/f9 holds the divisor.
+// f10 holds the value 2.0, then the quotient. f11 holds the reciprocal
+// approximation. f12 is a temporary.
+//
+// This is the same as modsi3, except that the inputs are zero-extended and
+// converted with the unsigned fcvt forms. The upper 32 bits of the incoming
+// registers are not assumed to be zero. The result is returned in ret0.
+
+ .text
+ .align 16
+ .global __umodsi3
+ .proc __umodsi3
+__umodsi3:
+ .regstk 2,0,0,0
+ // Zero-extend the 32-bit inputs to 64 bits.
+ zxt4 in0 = in0
+ zxt4 in1 = in1
+ ;;
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ fcvt.xuf f8 = f8
+ fcvt.xuf f9 = f9
+ ;;
+ // Compute the reciprocal approximation and the constant 2.0.
+ frcpa f11, p6 = f8, f9
+ fadd f10 = f1, f1
+ ;;
+ // 2 Newton-Raphson iterations: y = y * (2 - divisor * y).
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ fnma f12 = f9, f11, f10
+ ;;
+ fmpy f11 = f11, f12
+ ;;
+ // Quotient = dividend * reciprocal.
+ fmpy f10 = f8, f11
+ ;;
+ // Truncate quotient to an unsigned integer.
+ fcvt.fxu.trunc f10 = f10
+ ;;
+ // Renormalize: convert the integer quotient back to FP.
+ fcvt.xuf f10 = f10
+ ;;
+ // Compute remainder: dividend - quotient * divisor.
+ fnma f8 = f10, f9, f8
+ ;;
+ // Convert remainder to an unsigned integer.
+ fcvt.fxu f8 = f8
+ ;;
+ // Transfer result to the return register (ret0, not the input r32).
+ getf.sig ret0 = f8
+ br.ret.sptk rp
+ ;;
+ .endp __umodsi3
+#endif
+
+#ifdef L__save_stack_nonlocal
+// Notes on save/restore stack nonlocal: We read ar.bsp but write
+// ar.bspstore. This is because ar.bsp can be read at all times
+// (independent of the RSE mode) but since it is read-only we need to
+// restore the value via ar.bspstore. This is OK because
+// ar.bsp==ar.bspstore after executing "flushrs".
+
+// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
+
+ .text
+ .align 16
+ .global __ia64_save_stack_nonlocal
+ .proc __ia64_save_stack_nonlocal
+__ia64_save_stack_nonlocal:
+ alloc r18=ar.pfs,2,0,0,0 // r18 = ar.pfs; in0 = save_area, in1 = stack_pointer
+ st8 [in0]=in1,8 // save_area[0] = stack_pointer; in0 = save_area + 8
+ mov r19=ar.rsc // r19 = current ar.rsc
+ ;;
+ flushrs // first instruction of its group
+ and r19=0x1c,r19 // keep only bits 2..4 of ar.rsc (low two bits cleared)
+ mov ar.pfs=r18
+ ;;
+ mov ar.rsc=r19 // install the masked ar.rsc before reading ar.rnat
+ mov r16=ar.bsp // r16 = ar.bsp
+ adds r2=16,in0 // r2 = save_area + 24
+ ;;
+ mov r17=ar.rnat // r17 = ar.rnat
+ st8 [in0]=r16,8 // save_area[8] = ar.bsp; in0 = save_area + 16
+ or r19=0x3,r19 // set the low two bits of the ar.rsc value again
+ ;;
+ st8 [in0]=r17 // save_area[16] = ar.rnat
+ mov ar.rsc=r19 // ar.rsc = (entry value & 0x1c) | 0x3
+ st8 [r2]=r18 // save_area[24] = ar.pfs
+ mov ar.pfs=r18
+ br.ret.sptk.few rp
+ ;;
+ .endp __ia64_save_stack_nonlocal
+#endif
+
+#ifdef L__nonlocal_goto
+// void __ia64_nonlocal_goto(void *fp, void *target_label, void *save_area,
+// void *static_chain);
+
+ .text
+ .align 16
+ .global __ia64_nonlocal_goto
+ .proc __ia64_nonlocal_goto
+__ia64_nonlocal_goto:
+ alloc r20=ar.pfs,4,0,0,0 // four inputs: in0..in3 are r32..r35
+ mov r19=ar.rsc // r19 = current ar.rsc
+ adds r2=8,in2 // r2 = save_area + 8
+ ld8 r12=[in2],16 // r12 = save_area[0]; in2 = save_area + 16
+ mov.ret.sptk.few.dc.dc rp = r33, .L0 // rp = target_label, used by the br.ret at .L0
+// ??? flushrs must be first instruction of a group. Gas is unfortunately
+// putting the stop bit before the padding nop instead of after it, making
+// flushrs the first instruction of its bundle, but the second instruction
+// of its group. We explicitly add the nop to avoid this problem.
+ nop.i 0
+ ;;
+ flushrs
+ ld8 r16=[r2],16 // r16 = save_area[8] (saved ar.bsp); r2 = save_area + 24
+ and r19=0x1c,r19 // keep only bits 2..4 of ar.rsc (low two bits cleared)
+ ld8 r17=[in2] // r17 = save_area[16] (saved ar.rnat)
+ ;;
+ ld8 r18=[r2] // r18 = save_area[24] (saved ar.pfs)
+ mov ar.rsc=r19 // install the masked ar.rsc before writing ar.bspstore
+ ;;
+ mov ar.bspstore=r16 // restore the backing store pointer
+ ;;
+ mov ar.rnat=r17 // restore ar.rnat
+ mov ar.pfs=r18 // restore ar.pfs
+ or r19=0x3,r19 // set the low two bits of the ar.rsc value again
+ ;;
+ loadrs
+ invala
+ mov r7=r32 // r7 = fp argument
+.L0: {
+ mov ar.rsc=r19 // ar.rsc = (entry value & 0x1c) | 0x3
+ mov r15=r35 // r15 = static_chain argument
+ br.ret.sptk.few rp // transfer to target_label
+ }
+ ;;
+ .endp __ia64_nonlocal_goto
+#endif