Diffstat (limited to 'libgcc/config/avr/lib1funcs-fixed.S')
 libgcc/config/avr/lib1funcs-fixed.S | 1927 ++++++++++++++++++++++++++++++++++
 1 file changed, 1927 insertions(+), 0 deletions(-)
diff --git a/libgcc/config/avr/lib1funcs-fixed.S b/libgcc/config/avr/lib1funcs-fixed.S
new file mode 100644
index 0000000000..6dd68ee614
--- /dev/null
+++ b/libgcc/config/avr/lib1funcs-fixed.S
@@ -0,0 +1,1927 @@
+/* -*- Mode: Asm -*- */
+;; Copyright (C) 2012-2017 Free Software Foundation, Inc.
+;; Contributed by Sean D'Epagnier (sean@depagnier.com)
+;; Georg-Johann Lay (avr@gjlay.de)
+
+;; This file is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by the
+;; Free Software Foundation; either version 3, or (at your option) any
+;; later version.
+
+;; In addition to the permissions in the GNU General Public License, the
+;; Free Software Foundation gives you unlimited permission to link the
+;; compiled version of this file into combinations with other programs,
+;; and to distribute those combinations without any restriction coming
+;; from the use of this file. (The General Public License restrictions
+;; do apply in other respects; for example, they cover modification of
+;; the file, and distribution when not linked into a combine
+;; executable.)
+
+;; This file is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Fixed point library routines for AVR
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined __AVR_TINY__
+#define __zero_reg__ r17
+#define __tmp_reg__ r16
+#else
+#define __zero_reg__ r1
+#define __tmp_reg__ r0
+#endif
+
+.section .text.libgcc.fixed, "ax", @progbits
+
+#ifndef __AVR_TINY__
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Conversions to float
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined (L_fractqqsf)
+DEFUN __fractqqsf
+ ;; Move in place for SA -> SF conversion
+ clr r22
+ mov r23, r24
+ ;; Sign-extend
+ lsl r24
+ sbc r24, r24
+ mov r25, r24
+ XJMP __fractsasf
+ENDF __fractqqsf
+#endif /* L_fractqqsf */
+
+#if defined (L_fractuqqsf)
+DEFUN __fractuqqsf
+ ;; Move in place for USA -> SF conversion
+ clr r22
+ mov r23, r24
+ ;; Zero-extend
+ clr r24
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuqqsf
+#endif /* L_fractuqqsf */
+
+#if defined (L_fracthqsf)
+DEFUN __fracthqsf
+ ;; Move in place for SA -> SF conversion
+ wmov 22, 24
+ ;; Sign-extend
+ lsl r25
+ sbc r24, r24
+ mov r25, r24
+ XJMP __fractsasf
+ENDF __fracthqsf
+#endif /* L_fracthqsf */
+
+#if defined (L_fractuhqsf)
+DEFUN __fractuhqsf
+ ;; Move in place for USA -> SF conversion
+ wmov 22, 24
+ ;; Zero-extend
+ clr r24
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuhqsf
+#endif /* L_fractuhqsf */
+
+#if defined (L_fracthasf)
+DEFUN __fracthasf
+ ;; Move in place for SA -> SF conversion
+ clr r22
+ mov r23, r24
+ mov r24, r25
+ ;; Sign-extend
+ lsl r25
+ sbc r25, r25
+ XJMP __fractsasf
+ENDF __fracthasf
+#endif /* L_fracthasf */
+
+#if defined (L_fractuhasf)
+DEFUN __fractuhasf
+ ;; Move in place for USA -> SF conversion
+ clr r22
+ mov r23, r24
+ mov r24, r25
+ ;; Zero-extend
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuhasf
+#endif /* L_fractuhasf */
+
+
+#if defined (L_fractsqsf)
+DEFUN __fractsqsf
+ XCALL __floatsisf
+ ;; Divide non-zero results by 2^31 to move the
+	;; binary point into place
+ tst r25
+ breq 0f
+ subi r24, exp_lo (31)
+ sbci r25, exp_hi (31)
+0: ret
+ENDF __fractsqsf
+#endif /* L_fractsqsf */
+
+#if defined (L_fractusqsf)
+DEFUN __fractusqsf
+ XCALL __floatunsisf
+ ;; Divide non-zero results by 2^32 to move the
+	;; binary point into place
+ cpse r25, __zero_reg__
+ subi r25, exp_hi (32)
+ ret
+ENDF __fractusqsf
+#endif /* L_fractusqsf */
+
+#if defined (L_fractsasf)
+DEFUN __fractsasf
+ XCALL __floatsisf
+ ;; Divide non-zero results by 2^15 to move the
+	;; binary point into place
+ tst r25
+ breq 0f
+ subi r24, exp_lo (15)
+ sbci r25, exp_hi (15)
+0: ret
+ENDF __fractsasf
+#endif /* L_fractsasf */
+
+#if defined (L_fractusasf)
+DEFUN __fractusasf
+ XCALL __floatunsisf
+ ;; Divide non-zero results by 2^16 to move the
+	;; binary point into place
+ cpse r25, __zero_reg__
+ subi r25, exp_hi (16)
+ ret
+ENDF __fractusasf
+#endif /* L_fractusasf */
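
The conversions above all follow one pattern: convert the underlying integer
with __floatsisf / __floatunsisf, then scale by 2^-FBIT via the exponent
bytes. A minimal host-side C model of that mapping (function names are
illustrative, not libgcc symbols):

    #include <stdio.h>
    #include <stdint.h>

    static float qq_to_float(int8_t x)    { return (float)x / 128.0f;   }  /* s.7    */
    static float sa_to_float(int32_t x)   { return (float)x / 32768.0f; }  /* s16.15 */
    static float usa_to_float(uint32_t x) { return (float)x / 65536.0f; }  /* u16.16 */

    int main(void)
    {
        printf("%f\n", qq_to_float(-64));          /* -0.50 */
        printf("%f\n", sa_to_float(0x00014000));   /*  2.50 */
        printf("%f\n", usa_to_float(0x00018000));  /*  1.50 */
        return 0;
    }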
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Conversions from float
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined (L_fractsfqq)
+DEFUN __fractsfqq
+ ;; Multiply with 2^{24+7} to get a QQ result in r25
+ subi r24, exp_lo (-31)
+ sbci r25, exp_hi (-31)
+ XCALL __fixsfsi
+ mov r24, r25
+ ret
+ENDF __fractsfqq
+#endif /* L_fractsfqq */
+
+#if defined (L_fractsfuqq)
+DEFUN __fractsfuqq
+ ;; Multiply with 2^{24+8} to get a UQQ result in r25
+ subi r25, exp_hi (-32)
+ XCALL __fixunssfsi
+ mov r24, r25
+ ret
+ENDF __fractsfuqq
+#endif /* L_fractsfuqq */
+
+#if defined (L_fractsfha)
+DEFUN __fractsfha
+ ;; Multiply with 2^{16+7} to get a HA result in r25:r24
+ subi r24, exp_lo (-23)
+ sbci r25, exp_hi (-23)
+ XJMP __fixsfsi
+ENDF __fractsfha
+#endif /* L_fractsfha */
+
+#if defined (L_fractsfuha)
+DEFUN __fractsfuha
+ ;; Multiply with 2^24 to get a UHA result in r25:r24
+ subi r25, exp_hi (-24)
+ XJMP __fixunssfsi
+ENDF __fractsfuha
+#endif /* L_fractsfuha */
+
+#if defined (L_fractsfhq)
+FALIAS __fractsfsq
+
+DEFUN __fractsfhq
+ ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
+ ;; resp. with 2^31 to get a SQ result in r25:r22
+ subi r24, exp_lo (-31)
+ sbci r25, exp_hi (-31)
+ XJMP __fixsfsi
+ENDF __fractsfhq
+#endif /* L_fractsfhq */
+
+#if defined (L_fractsfuhq)
+FALIAS __fractsfusq
+
+DEFUN __fractsfuhq
+ ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
+ ;; resp. with 2^32 to get a USQ result in r25:r22
+ subi r25, exp_hi (-32)
+ XJMP __fixunssfsi
+ENDF __fractsfuhq
+#endif /* L_fractsfuhq */
+
+#if defined (L_fractsfsa)
+DEFUN __fractsfsa
+ ;; Multiply with 2^15 to get a SA result in r25:r22
+ subi r24, exp_lo (-15)
+ sbci r25, exp_hi (-15)
+ XJMP __fixsfsi
+ENDF __fractsfsa
+#endif /* L_fractsfsa */
+
+#if defined (L_fractsfusa)
+DEFUN __fractsfusa
+ ;; Multiply with 2^16 to get a USA result in r25:r22
+ subi r25, exp_hi (-16)
+ XJMP __fixunssfsi
+ENDF __fractsfusa
+#endif /* L_fractsfusa */
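
The reverse direction multiplies by 2^FBIT first (the subi/sbci on the
exponent bytes) and then fixes to an integer. A host-side sketch of the
intended value mapping, with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static int16_t float_to_ha(float f) { return (int16_t)(f * 128.0f);   }  /* s8.7   */
    static int32_t float_to_sa(float f) { return (int32_t)(f * 32768.0f); }  /* s16.15 */

    int main(void)
    {
        printf("%#06x\n", (unsigned)(uint16_t)float_to_ha(1.5f));          /* 0x00c0     */
        printf("%#010lx\n", (unsigned long)(uint32_t)float_to_sa(-2.0f));  /* 0xffff0000 */
        return 0;
    }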
+
+
+;; For multiplication the functions here are called directly from
+;; avr-fixed.md instead of using the standard libcall mechanisms.
+;; This can make better code because GCC knows exactly which
+;; of the call-used registers (not all of them) are clobbered. */
+
+/*******************************************************
+ Fractional Multiplication 8 x 8 without MUL
+*******************************************************/
+
+#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
+;;; R23 = R24 * R25
+;;; Clobbers: __tmp_reg__, R22, R24, R25
+;;; Rounding: ???
+DEFUN __mulqq3
+ XCALL __fmuls
+ ;; TR 18037 requires that (-1) * (-1) does not overflow
+ ;; The only input that can produce -1 is (-1)^2.
+ dec r23
+ brvs 0f
+ inc r23
+0: ret
+ENDF __mulqq3
+#endif /* L_mulqq3 && ! HAVE_MUL */
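
A host-side model of that saturation rule, assuming plain truncation of the
discarded bits (the routine's own rounding is marked "???" above):

    #include <stdio.h>
    #include <stdint.h>

    static int8_t mulqq(int8_t a, int8_t b)
    {
        int16_t p = (int16_t)((int16_t)a * b);  /* Q14 product               */
        int16_t q = (int16_t)(p >> 7);          /* back to Q7                */
        if (q > 127) q = 127;                   /* only (-1)*(-1) reaches +1 */
        return (int8_t)q;
    }

    int main(void)
    {
        printf("%d\n", mulqq(-128, -128));  /* 127 = 1 - 2^-7 */
        printf("%d\n", mulqq(64, 64));      /*  32 = 0.25     */
        return 0;
    }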
+
+/*******************************************************
+ Fractional Multiply .16 x .16 with and without MUL
+*******************************************************/
+
+#if defined (L_mulhq3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulhq3
+ XCALL __mulhisi3
+ ;; Shift result into place
+ lsl r23
+ rol r24
+ rol r25
+ brvs 1f
+ ;; Round
+ sbrc r23, 7
+ adiw r24, 1
+ ret
+1: ;; Overflow. TR 18037 requires (-1)^2 not to overflow
+ ldi r24, lo8 (0x7fff)
+ ldi r25, hi8 (0x7fff)
+ ret
+ENDF __mulhq3
+#endif /* defined (L_mulhq3) */
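
In C terms, the routine computes a Q15 product with round-half-up at the
highest discarded bit; a host model (assuming the usual arithmetic right
shift for negative values):

    #include <stdio.h>
    #include <stdint.h>

    static int16_t mulhq(int16_t a, int16_t b)
    {
        if (a == INT16_MIN && b == INT16_MIN)
            return INT16_MAX;                  /* (-1)*(-1) saturates to 0x7fff */
        int32_t p = 2 * ((int32_t)a * b);      /* Q30 product scaled to Q31     */
        return (int16_t)((p + 0x8000) >> 16);  /* round half up                 */
    }

    int main(void)
    {
        printf("%#x\n", (unsigned)(uint16_t)mulhq(0x4000, 0x4000));  /* 0x2000 = 0.25 */
        printf("%#x\n", (unsigned)(uint16_t)mulhq(-32768, -32768));  /* 0x7fff        */
        return 0;
    }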
+
+#if defined (L_muluhq3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) *= (R23:R22)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB < error <= 0.5 LSB
+DEFUN __muluhq3
+ XCALL __umulhisi3
+ ;; Round
+ sbrc r23, 7
+ adiw r24, 1
+ ret
+ENDF __muluhq3
+#endif /* L_muluhq3 */
+
+
+/*******************************************************
+ Fixed Multiply 8.8 x 8.8 with and without MUL
+*******************************************************/
+
+#if defined (L_mulha3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulha3
+ XCALL __mulhisi3
+ lsl r22
+ rol r23
+ rol r24
+ XJMP __muluha3_round
+ENDF __mulha3
+#endif /* L_mulha3 */
+
+#if defined (L_muluha3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) *= (R23:R22)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB < error <= 0.5 LSB
+DEFUN __muluha3
+ XCALL __umulhisi3
+ XJMP __muluha3_round
+ENDF __muluha3
+#endif /* L_muluha3 */
+
+#if defined (L_muluha3_round)
+DEFUN __muluha3_round
+ ;; Shift result into place
+ mov r25, r24
+ mov r24, r23
+ ;; Round
+ sbrc r22, 7
+ adiw r24, 1
+ ret
+ENDF __muluha3_round
+#endif /* L_muluha3_round */
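
The 8.8 flavor narrows the u16.16 widening product to its middle 16 bits
with the same round-half-up; integral overflow wraps, as in the unsaturated
routines above. A host model with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t muluha(uint16_t a, uint16_t b)
    {
        uint32_t p = (uint32_t)a * b;        /* u16.16 widening product */
        return (uint16_t)((p + 0x80) >> 8);  /* keep 8.8, round half up */
    }

    int main(void)
    {
        printf("%#x\n", muluha(0x0180, 0x0200));  /* 1.5 * 2.0 = 3.0 -> 0x300 */
        return 0;
    }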
+
+
+/*******************************************************
+ Fixed Multiplication 16.16 x 16.16
+*******************************************************/
+
+;; Bits outside the result (below LSB), used in the signed version
+#define GUARD __tmp_reg__
+
+#if defined (__AVR_HAVE_MUL__)
+
+;; Multiplier
+#define A0 16
+#define A1 A0+1
+#define A2 A1+1
+#define A3 A2+1
+
+;; Multiplicand
+#define B0 20
+#define B1 B0+1
+#define B2 B1+1
+#define B3 B2+1
+
+;; Result
+#define C0 24
+#define C1 C0+1
+#define C2 C1+1
+#define C3 C2+1
+
+#if defined (L_mulusa3)
+;;; (C3:C0) = (A3:A0) * (B3:B0)
+DEFUN __mulusa3
+ set
+ ;; Fallthru
+ENDF __mulusa3
+
+;;; Round the last digit iff T = 1
+;;; Return guard bits in GUARD (__tmp_reg__).
+;;; Rounding, T = 0: -1.0 LSB < error <= 0 LSB
+;;; Rounding, T = 1: -0.5 LSB < error <= 0.5 LSB
+DEFUN __mulusa3_round
+ ;; Some of the MUL instructions have LSBs outside the result.
+ ;; Don't ignore these LSBs in order to tame rounding error.
+ ;; Use C2/C3 for these LSBs.
+
+ clr C0
+ clr C1
+ mul A0, B0 $ movw C2, r0
+
+ mul A1, B0 $ add C3, r0 $ adc C0, r1
+ mul A0, B1 $ add C3, r0 $ adc C0, r1 $ rol C1
+
+	;; Round if T = 1.  Store the guard bits outside the result; the signed
+	;; version (function below) uses them for rounding and for its left shift.
+ brtc 0f
+ sbrc C3, 7
+ adiw C0, 1
+0: push C3
+
+ ;; The following MULs don't have LSBs outside the result.
+ ;; C2/C3 is the high part.
+
+ mul A0, B2 $ add C0, r0 $ adc C1, r1 $ sbc C2, C2
+ mul A1, B1 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0
+ mul A2, B0 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0
+ neg C2
+
+ mul A0, B3 $ add C1, r0 $ adc C2, r1 $ sbc C3, C3
+ mul A1, B2 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ mul A2, B1 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ mul A3, B0 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ neg C3
+
+ mul A1, B3 $ add C2, r0 $ adc C3, r1
+ mul A2, B2 $ add C2, r0 $ adc C3, r1
+ mul A3, B1 $ add C2, r0 $ adc C3, r1
+
+ mul A2, B3 $ add C3, r0
+ mul A3, B2 $ add C3, r0
+
+ ;; Guard bits used in the signed version below.
+ pop GUARD
+ clr __zero_reg__
+ ret
+ENDF __mulusa3_round
+#endif /* L_mulusa3 */
+
+#if defined (L_mulsa3)
+;;; (C3:C0) = (A3:A0) * (B3:B0)
+;;; Clobbers: __tmp_reg__, T
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulsa3
+ clt
+ XCALL __mulusa3_round
+ ;; A posteriori sign extension of the operands
+ tst B3
+ brpl 1f
+ sub C2, A0
+ sbc C3, A1
+1: sbrs A3, 7
+ rjmp 2f
+ sub C2, B0
+ sbc C3, B1
+2:
+ ;; Shift 1 bit left to adjust for 15 fractional bits
+ lsl GUARD
+ rol C0
+ rol C1
+ rol C2
+ rol C3
+ ;; Round last digit
+ lsl GUARD
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ ret
+ENDF __mulsa3
+#endif /* L_mulsa3 */
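
The "a posteriori sign extension" rests on a two's-complement identity:
the signed product equals the unsigned product minus y<<32 when x < 0 and
minus x<<32 when y < 0, taken modulo 2^64. A host-side check of the
identity (shown as a full 32x32 -> 64 multiply; the routine applies the
same correction to its 32-bit result window):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    static int64_t signed_via_unsigned(int32_t x, int32_t y)
    {
        uint64_t p = (uint64_t)(uint32_t)x * (uint32_t)y;
        if (x < 0) p -= (uint64_t)(uint32_t)y << 32;  /* correct for x's sign bit */
        if (y < 0) p -= (uint64_t)(uint32_t)x << 32;  /* correct for y's sign bit */
        return (int64_t)p;
    }

    int main(void)
    {
        printf("%" PRId64 "\n", signed_via_unsigned(-3, 7));       /* -21    */
        printf("%" PRId64 "\n", signed_via_unsigned(-40000, -5));  /* 200000 */
        return 0;
    }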
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#else /* __AVR_HAVE_MUL__ */
+
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+
+#define B0 22
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+
+#define C0 22
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+
+;; __tmp_reg__
+#define CC0 0
+;; __zero_reg__
+#define CC1 1
+#define CC2 16
+#define CC3 17
+
+#define AA0 26
+#define AA1 AA0+1
+#define AA2 30
+#define AA3 AA2+1
+
+#if defined (L_mulsa3)
+;;; (R25:R22) *= (R21:R18)
+;;; Clobbers: ABI, called by optabs
+;;; Rounding: -1 LSB <= error <= 1 LSB
+DEFUN __mulsa3
+ push B0
+ push B1
+ push B3
+ clt
+ XCALL __mulusa3_round
+ pop r30
+ ;; sign-extend B
+ bst r30, 7
+ brtc 1f
+ ;; A1, A0 survived in R27:R26
+ sub C2, AA0
+ sbc C3, AA1
+1:
+ pop AA1 ;; B1
+ pop AA0 ;; B0
+
+ ;; sign-extend A. A3 survived in R31
+ bst AA3, 7
+ brtc 2f
+ sub C2, AA0
+ sbc C3, AA1
+2:
+ ;; Shift 1 bit left to adjust for 15 fractional bits
+ lsl GUARD
+ rol C0
+ rol C1
+ rol C2
+ rol C3
+ ;; Round last digit
+ lsl GUARD
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ ret
+ENDF __mulsa3
+#endif /* L_mulsa3 */
+
+#if defined (L_mulusa3)
+;;; (R25:R22) *= (R21:R18)
+;;; Clobbers: ABI, called by optabs
+;;; Rounding: -1 LSB <= error <= 1 LSB
+DEFUN __mulusa3
+ set
+ ;; Fallthru
+ENDF __mulusa3
+
+;;; A[] survives in 26, 27, 30, 31
+;;; Also used by __mulsa3 with T = 0
+;;; Round if T = 1
+;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
+DEFUN __mulusa3_round
+ push CC2
+ push CC3
+ ; clear result
+ clr __tmp_reg__
+ wmov CC2, CC0
+ ; save multiplicand
+ wmov AA0, A0
+ wmov AA2, A2
+ rjmp 3f
+
+ ;; Loop the integral part
+
+1: ;; CC += A * 2^n; n >= 0
+ add CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
+
+2: ;; A <<= 1
+ lsl A0 $ rol A1 $ rol A2 $ rol A3
+
+3: ;; IBIT(B) >>= 1
+ ;; Carry = n-th bit of B; n >= 0
+ lsr B3
+ ror B2
+ brcs 1b
+ sbci B3, 0
+ brne 2b
+
+ ;; Loop the fractional part
+ ;; B2/B3 is 0 now, use as guard bits for rounding
+ ;; Restore multiplicand
+ wmov A0, AA0
+ wmov A2, AA2
+ rjmp 5f
+
+4: ;; CC += A:Guard * 2^n; n < 0
+ add B3,B2 $ adc CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
+5:
+ ;; A:Guard >>= 1
+ lsr A3 $ ror A2 $ ror A1 $ ror A0 $ ror B2
+
+ ;; FBIT(B) <<= 1
+ ;; Carry = n-th bit of B; n < 0
+ lsl B0
+ rol B1
+ brcs 4b
+ sbci B0, 0
+ brne 5b
+
+ ;; Save guard bits and set carry for rounding
+ push B3
+ lsl B3
+ ;; Move result into place
+ wmov C2, CC2
+ wmov C0, CC0
+ clr __zero_reg__
+ brtc 6f
+ ;; Round iff T = 1
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+6:
+ pop GUARD
+ ;; Epilogue
+ pop CC3
+ pop CC2
+ ret
+ENDF __mulusa3_round
+#endif /* L_mulusa3 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef AA0
+#undef AA1
+#undef AA2
+#undef AA3
+#undef CC0
+#undef CC1
+#undef CC2
+#undef CC3
+
+#endif /* __AVR_HAVE_MUL__ */
+
+#undef GUARD
+
+/***********************************************************
+ Fixed unsigned saturated Multiplication 8.8 x 8.8
+***********************************************************/
+
+#define C0 22
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+#define SS __tmp_reg__
+
+#if defined (L_usmuluha3)
+DEFUN __usmuluha3
+ ;; Widening multiply
+#ifdef __AVR_HAVE_MUL__
+ ;; Adjust interface
+ movw R26, R22
+ movw R18, R24
+#endif /* HAVE MUL */
+ XCALL __umulhisi3
+ tst C3
+ brne .Lmax
+ ;; Round, target is in C1..C2
+ lsl C0
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ brcs .Lmax
+ ;; Move result into place
+ mov C3, C2
+ mov C2, C1
+ ret
+.Lmax:
+ ;; Saturate
+ ldi C2, 0xff
+ ldi C3, 0xff
+ ret
+ENDF __usmuluha3
+#endif /* L_usmuluha3 */
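
In C terms: a rounded u8.8 product, clamped to the unsigned maximum when
the integral part does not fit. A host model:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t usmuluha(uint16_t a, uint16_t b)
    {
        uint32_t p = ((uint32_t)a * b + 0x80) >> 8;  /* rounded u8.8 product */
        return p > 0xffff ? 0xffff : (uint16_t)p;    /* unsigned saturation  */
    }

    int main(void)
    {
        printf("%#x\n", usmuluha(0x8000, 0x0200));  /* 128.0 * 2.0 -> 0xffff (sat) */
        printf("%#x\n", usmuluha(0x0280, 0x0200));  /*   2.5 * 2.0 -> 0x500        */
        return 0;
    }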
+
+/***********************************************************
+ Fixed signed saturated Multiplication s8.7 x s8.7
+***********************************************************/
+
+#if defined (L_ssmulha3)
+DEFUN __ssmulha3
+ ;; Widening multiply
+#ifdef __AVR_HAVE_MUL__
+ ;; Adjust interface
+ movw R26, R22
+ movw R18, R24
+#endif /* HAVE MUL */
+ XCALL __mulhisi3
+	;; Adjust binary point
+ lsl C0
+ rol C1
+ rol C2
+ brvs .LsatC3.3
+ ;; The 9 MSBs must be the same
+ rol C3
+ sbc SS, SS
+ cp C3, SS
+ brne .LsatSS
+ ;; Round
+ lsl C0
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ brvs .Lmax
+ ;; Move result into place
+ mov C3, C2
+ mov C2, C1
+ ret
+.Lmax:
+ ;; Load 0x7fff
+ clr C3
+.LsatC3.3:
+ ;; C3 < 0 --> 0x8000
+ ;; C3 >= 0 --> 0x7fff
+ mov SS, C3
+.LsatSS:
+ ;; Load min / max value:
+ ;; SS = -1 --> 0x8000
+ ;; SS = 0 --> 0x7fff
+ ldi C3, 0x7f
+ ldi C2, 0xff
+ sbrc SS, 7
+ adiw C2, 1
+ ret
+ENDF __ssmulha3
+#endif /* L_ssmulha3 */
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef SS
+
+/***********************************************************
+ Fixed unsigned saturated Multiplication 16.16 x 16.16
+***********************************************************/
+
+#define C0 18
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+#define C4 C0+4
+#define C5 C0+5
+#define C6 C0+6
+#define C7 C0+7
+#define SS __tmp_reg__
+
+#if defined (L_usmulusa3)
+;; R22[4] = R22[4] *{usat} R18[4]
+;; Ordinary ABI function
+DEFUN __usmulusa3
+ ;; Widening multiply
+ XCALL __umulsidi3
+ or C7, C6
+ brne .Lmax
+ ;; Round, target is in C2..C5
+ lsl C1
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ adc C4, __zero_reg__
+ adc C5, __zero_reg__
+ brcs .Lmax
+ ;; Move result into place
+ wmov C6, C4
+ wmov C4, C2
+ ret
+.Lmax:
+ ;; Saturate
+ ldi C7, 0xff
+ ldi C6, 0xff
+ wmov C4, C6
+ ret
+ENDF __usmulusa3
+#endif /* L_usmulusa3 */
+
+/***********************************************************
+ Fixed signed saturated Multiplication s16.15 x s16.15
+***********************************************************/
+
+#if defined (L_ssmulsa3)
+;; R22[4] = R22[4] *{ssat} R18[4]
+;; Ordinary ABI function
+DEFUN __ssmulsa3
+ ;; Widening multiply
+ XCALL __mulsidi3
+	;; Adjust binary point
+ lsl C1
+ rol C2
+ rol C3
+ rol C4
+ rol C5
+ brvs .LsatC7.7
+ ;; The 17 MSBs must be the same
+ rol C6
+ rol C7
+ sbc SS, SS
+ cp C6, SS
+ cpc C7, SS
+ brne .LsatSS
+ ;; Round
+ lsl C1
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ adc C4, __zero_reg__
+ adc C5, __zero_reg__
+ brvs .Lmax
+ ;; Move result into place
+ wmov C6, C4
+ wmov C4, C2
+ ret
+
+.Lmax:
+ ;; Load 0x7fffffff
+ clr C7
+.LsatC7.7:
+ ;; C7 < 0 --> 0x80000000
+ ;; C7 >= 0 --> 0x7fffffff
+ lsl C7
+ sbc SS, SS
+.LsatSS:
+ ;; Load min / max value:
+ ;; SS = -1 --> 0x80000000
+ ;; SS = 0 --> 0x7fffffff
+ com SS
+ mov C4, SS
+ mov C5, C4
+ wmov C6, C4
+ subi C7, 0x80
+ ret
+ENDF __ssmulsa3
+#endif /* L_ssmulsa3 */
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef SS
+
+/*******************************************************
+ Fractional Division 8 / 8
+*******************************************************/
+
+#define r_divd r25 /* dividend */
+#define r_quo r24 /* quotient */
+#define r_div r22 /* divisor */
+#define r_sign __tmp_reg__
+
+#if defined (L_divqq3)
+DEFUN __divqq3
+ mov r_sign, r_divd
+ eor r_sign, r_div
+ sbrc r_div, 7
+ neg r_div
+ sbrc r_divd, 7
+ neg r_divd
+ XCALL __divqq_helper
+ lsr r_quo
+ sbrc r_sign, 7 ; negate result if needed
+ neg r_quo
+ ret
+ENDF __divqq3
+#endif /* L_divqq3 */
+
+#if defined (L_udivuqq3)
+DEFUN __udivuqq3
+ cp r_divd, r_div
+ brsh 0f
+ XJMP __divqq_helper
+ ;; Result is out of [0, 1) ==> Return 1 - eps.
+0: ldi r_quo, 0xff
+ ret
+ENDF __udivuqq3
+#endif /* L_udivuqq3 */
+
+
+#if defined (L_divqq_helper)
+DEFUN __divqq_helper
+ clr r_quo ; clear quotient
+ inc __zero_reg__ ; init loop counter, used per shift
+__udivuqq3_loop:
+ lsl r_divd ; shift dividend
+ brcs 0f ; dividend overflow
+ cp r_divd,r_div ; compare dividend & divisor
+ brcc 0f ; dividend >= divisor
+ rol r_quo ; shift quotient (with CARRY)
+ rjmp __udivuqq3_cont
+0:
+ sub r_divd,r_div ; restore dividend
+ lsl r_quo ; shift quotient (without CARRY)
+__udivuqq3_cont:
+ lsl __zero_reg__ ; shift loop-counter bit
+ brne __udivuqq3_loop
+ com r_quo ; complement result
+ ; because C flag was complemented in loop
+ ret
+ENDF __divqq_helper
+#endif /* L_divqq_helper */
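
The helper is a textbook bit-serial restoring division that produces the
8 fraction bits of d/v; the caller guarantees d < v. A host model (the
carry-complement trick above is replaced by a plain quotient-bit update):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t udivuqq(uint8_t d, uint8_t v)  /* requires d < v */
    {
        uint8_t  q   = 0;
        uint16_t rem = d;
        for (int i = 0; i < 8; i++) {
            rem <<= 1;             /* bring down the next (zero) bit  */
            q   <<= 1;
            if (rem >= v) {        /* divisor fits: quotient bit is 1 */
                rem -= v;
                q   |= 1;
            }
        }
        return q;                  /* floor (d * 256 / v) */
    }

    int main(void)
    {
        printf("%#x\n", udivuqq(1, 2));  /* 0x80 = 0.5     */
        printf("%#x\n", udivuqq(1, 3));  /* 0x55 ~ 0.33333 */
        return 0;
    }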
+
+#undef r_divd
+#undef r_quo
+#undef r_div
+#undef r_sign
+
+
+/*******************************************************
+ Fractional Division 16 / 16
+*******************************************************/
+#define r_divdL 26 /* dividend Low */
+#define r_divdH 27 /* dividend High */
+#define r_quoL 24 /* quotient Low */
+#define r_quoH 25 /* quotient High */
+#define r_divL 22 /* divisor Low */
+#define r_divH 23 /* divisor High */
+#define r_cnt 21
+
+#if defined (L_divhq3)
+DEFUN __divhq3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ cp r_divdL, r_divL
+ cpc r_divdH, r_divH
+ breq __divhq3_minus1 ; if equal return -1
+ XCALL __udivuhq3
+ lsr r_quoH
+ ror r_quoL
+ brpl 9f
+ ;; negate result if needed
+ NEG2 r_quoL
+9:
+ ret
+__divhq3_minus1:
+ ldi r_quoH, 0x80
+ clr r_quoL
+ ret
+ENDF __divhq3
+#endif /* defined (L_divhq3) */
+
+#if defined (L_udivuhq3)
+DEFUN __udivuhq3
+ sub r_quoH,r_quoH ; clear quotient and carry
+ ;; FALLTHRU
+ENDF __udivuhq3
+
+DEFUN __udivuha3_common
+ clr r_quoL ; clear quotient
+ ldi r_cnt,16 ; init loop counter
+__udivuhq3_loop:
+ rol r_divdL ; shift dividend (with CARRY)
+ rol r_divdH
+ brcs __udivuhq3_ep ; dividend overflow
+ cp r_divdL,r_divL ; compare dividend & divisor
+ cpc r_divdH,r_divH
+ brcc __udivuhq3_ep ; dividend >= divisor
+ rol r_quoL ; shift quotient (with CARRY)
+ rjmp __udivuhq3_cont
+__udivuhq3_ep:
+ sub r_divdL,r_divL ; restore dividend
+ sbc r_divdH,r_divH
+ lsl r_quoL ; shift quotient (without CARRY)
+__udivuhq3_cont:
+ rol r_quoH ; shift quotient
+ dec r_cnt ; decrement loop counter
+ brne __udivuhq3_loop
+ com r_quoL ; complement result
+ com r_quoH ; because C flag was complemented in loop
+ ret
+ENDF __udivuha3_common
+#endif /* defined (L_udivuhq3) */
+
+/*******************************************************
+ Fixed Division 8.8 / 8.8
+*******************************************************/
+#if defined (L_divha3)
+DEFUN __divha3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ XCALL __udivuha3
+ lsr r_quoH ; adjust to 7 fractional bits
+ ror r_quoL
+ sbrs r0, 7 ; negate result if needed
+ ret
+ NEG2 r_quoL
+ ret
+ENDF __divha3
+#endif /* defined (L_divha3) */
+
+#if defined (L_udivuha3)
+DEFUN __udivuha3
+ mov r_quoH, r_divdL
+ mov r_divdL, r_divdH
+ clr r_divdH
+ lsl r_quoH ; shift quotient into carry
+ XJMP __udivuha3_common ; same as fractional after rearrange
+ENDF __udivuha3
+#endif /* defined (L_udivuha3) */
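
The byte shuffle above amounts to pre-shifting the dividend by 8 bits, so
the 8.8 quotient is the integer quotient (d << 8) / v (truncating; quotient
overflow wraps, as in the asm). A one-line host model:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t udivuha(uint16_t d, uint16_t v)
    {
        return (uint16_t)(((uint32_t)d << 8) / v);  /* truncating u8.8 quotient */
    }

    int main(void)
    {
        printf("%#x\n", udivuha(0x0300, 0x0200));  /* 3.0 / 2.0 = 1.5 -> 0x180 */
        return 0;
    }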
+
+#undef r_divdL
+#undef r_divdH
+#undef r_quoL
+#undef r_quoH
+#undef r_divL
+#undef r_divH
+#undef r_cnt
+
+/*******************************************************
+ Fixed Division 16.16 / 16.16
+*******************************************************/
+
+#define r_arg1L 24 /* arg1 gets passed already in place */
+#define r_arg1H 25
+#define r_arg1HL 26
+#define r_arg1HH 27
+#define r_divdL 26 /* dividend Low */
+#define r_divdH 27
+#define r_divdHL 30
+#define r_divdHH 31 /* dividend High */
+#define r_quoL 22 /* quotient Low */
+#define r_quoH 23
+#define r_quoHL 24
+#define r_quoHH 25 /* quotient High */
+#define r_divL 18 /* divisor Low */
+#define r_divH 19
+#define r_divHL 20
+#define r_divHH 21 /* divisor High */
+#define r_cnt __zero_reg__ /* loop count (0 after the loop!) */
+
+#if defined (L_divsa3)
+DEFUN __divsa3
+ mov r0, r_arg1HH
+ eor r0, r_divHH
+ sbrs r_divHH, 7
+ rjmp 1f
+ NEG4 r_divL
+1:
+ sbrs r_arg1HH, 7
+ rjmp 2f
+ NEG4 r_arg1L
+2:
+ XCALL __udivusa3
+ lsr r_quoHH ; adjust to 15 fractional bits
+ ror r_quoHL
+ ror r_quoH
+ ror r_quoL
+ sbrs r0, 7 ; negate result if needed
+ ret
+	;; negate the whole 32-bit quotient (r_quoHH:r_quoL)
+ XJMP __negsi2
+ENDF __divsa3
+#endif /* defined (L_divsa3) */
+
+#if defined (L_udivusa3)
+DEFUN __udivusa3
+ ldi r_divdHL, 32 ; init loop counter
+ mov r_cnt, r_divdHL
+ clr r_divdHL
+ clr r_divdHH
+ wmov r_quoL, r_divdHL
+ lsl r_quoHL ; shift quotient into carry
+ rol r_quoHH
+__udivusa3_loop:
+ rol r_divdL ; shift dividend (with CARRY)
+ rol r_divdH
+ rol r_divdHL
+ rol r_divdHH
+ brcs __udivusa3_ep ; dividend overflow
+ cp r_divdL,r_divL ; compare dividend & divisor
+ cpc r_divdH,r_divH
+ cpc r_divdHL,r_divHL
+ cpc r_divdHH,r_divHH
+ brcc __udivusa3_ep ; dividend >= divisor
+ rol r_quoL ; shift quotient (with CARRY)
+ rjmp __udivusa3_cont
+__udivusa3_ep:
+ sub r_divdL,r_divL ; restore dividend
+ sbc r_divdH,r_divH
+ sbc r_divdHL,r_divHL
+ sbc r_divdHH,r_divHH
+ lsl r_quoL ; shift quotient (without CARRY)
+__udivusa3_cont:
+ rol r_quoH ; shift quotient
+ rol r_quoHL
+ rol r_quoHH
+ dec r_cnt ; decrement loop counter
+ brne __udivusa3_loop
+ com r_quoL ; complement result
+ com r_quoH ; because C flag was complemented in loop
+ com r_quoHL
+ com r_quoHH
+ ret
+ENDF __udivusa3
+#endif /* defined (L_udivusa3) */
+
+#undef r_arg1L
+#undef r_arg1H
+#undef r_arg1HL
+#undef r_arg1HH
+#undef r_divdL
+#undef r_divdH
+#undef r_divdHL
+#undef r_divdHH
+#undef r_quoL
+#undef r_quoH
+#undef r_quoHL
+#undef r_quoHH
+#undef r_divL
+#undef r_divH
+#undef r_divHL
+#undef r_divHH
+#undef r_cnt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 1 Byte
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 24
+
+#if defined (L_ssabs_1)
+DEFUN __ssabs_1
+ sbrs A0, 7
+ ret
+ neg A0
+ sbrc A0,7
+ dec A0
+ ret
+ENDF __ssabs_1
+#endif /* L_ssabs_1 */
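
The only special case is the two's-complement minimum, whose negation is
not representable; a host model:

    #include <stdio.h>
    #include <stdint.h>

    static int8_t ssabs1(int8_t a)
    {
        if (a >= 0)    return a;
        if (a == -128) return 127;   /* -(-128) does not fit: saturate */
        return (int8_t)-a;
    }

    int main(void)
    {
        printf("%d %d %d\n", ssabs1(-128), ssabs1(-5), ssabs1(17));  /* 127 5 17 */
        return 0;
    }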
+
+#undef A0
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 2 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 24
+#define A1 A0+1
+
+#if defined (L_ssneg_2)
+DEFUN __ssneg_2
+ NEG2 A0
+ brvc 0f
+ sbiw A0, 1
+0: ret
+ENDF __ssneg_2
+#endif /* L_ssneg_2 */
+
+#if defined (L_ssabs_2)
+DEFUN __ssabs_2
+ sbrs A1, 7
+ ret
+ XJMP __ssneg_2
+ENDF __ssabs_2
+#endif /* L_ssabs_2 */
+
+#undef A0
+#undef A1
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 4 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 22
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+
+#if defined (L_ssneg_4)
+DEFUN __ssneg_4
+ XCALL __negsi2
+ brvc 0f
+ ldi A3, 0x7f
+ ldi A2, 0xff
+ ldi A1, 0xff
+ ldi A0, 0xff
+0: ret
+ENDF __ssneg_4
+#endif /* L_ssneg_4 */
+
+#if defined (L_ssabs_4)
+DEFUN __ssabs_4
+ sbrs A3, 7
+ ret
+ XJMP __ssneg_4
+ENDF __ssabs_4
+#endif /* L_ssabs_4 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 8 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+#define A4 A0+4
+#define A5 A0+5
+#define A6 A0+6
+#define A7 A0+7
+
+#if defined (L_clr_8)
+FALIAS __usneguta2
+FALIAS __usneguda2
+FALIAS __usnegudq2
+
+;; Clear Carry and all Bytes
+DEFUN __clr_8
+ ;; Clear Carry and set Z
+ sub A7, A7
+ ;; FALLTHRU
+ENDF __clr_8
+;; Propagate Carry to all Bytes, Carry unaltered
+DEFUN __sbc_8
+ sbc A7, A7
+ sbc A6, A6
+ wmov A4, A6
+ wmov A2, A6
+ wmov A0, A6
+ ret
+ENDF __sbc_8
+#endif /* L_clr_8 */
+
+#if defined (L_ssneg_8)
+FALIAS __ssnegta2
+FALIAS __ssnegda2
+FALIAS __ssnegdq2
+
+DEFUN __ssneg_8
+ XCALL __negdi2
+ brvc 0f
+ ;; A[] = 0x7fffffff
+ sec
+ XCALL __sbc_8
+ ldi A7, 0x7f
+0: ret
+ENDF __ssneg_8
+#endif /* L_ssneg_8 */
+
+#if defined (L_ssabs_8)
+FALIAS __ssabsta2
+FALIAS __ssabsda2
+FALIAS __ssabsdq2
+
+DEFUN __ssabs_8
+ sbrs A7, 7
+ ret
+ XJMP __ssneg_8
+ENDF __ssabs_8
+#endif /* L_ssabs_8 */
+
+;; Second Argument
+#define B0 10
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+#define B4 B0+4
+#define B5 B0+5
+#define B6 B0+6
+#define B7 B0+7
+
+#if defined (L_usadd_8)
+FALIAS __usadduta3
+FALIAS __usadduda3
+FALIAS __usaddudq3
+
+DEFUN __usadd_8
+ XCALL __adddi3
+ brcs 0f
+ ret
+0: ;; A[] = 0xffffffff
+ XJMP __sbc_8
+ENDF __usadd_8
+#endif /* L_usadd_8 */
+
+#if defined (L_ussub_8)
+FALIAS __ussubuta3
+FALIAS __ussubuda3
+FALIAS __ussubudq3
+
+DEFUN __ussub_8
+ XCALL __subdi3
+ brcs 0f
+ ret
+0: ;; A[] = 0
+ XJMP __clr_8
+ENDF __ussub_8
+#endif /* L_ussub_8 */
+
+#if defined (L_ssadd_8)
+FALIAS __ssaddta3
+FALIAS __ssaddda3
+FALIAS __ssadddq3
+
+DEFUN __ssadd_8
+ XCALL __adddi3
+ brvc 0f
+ ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
+ cpi B7, 0x80
+ XCALL __sbc_8
+ subi A7, 0x80
+0: ret
+ENDF __ssadd_8
+#endif /* L_ssadd_8 */
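
The min/max selection reads only B's sign: when signed addition overflows,
A and B necessarily have the same sign, so B's sign alone picks the bound.
A host model of that rule:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t ssadd64(int64_t a, int64_t b)
    {
        uint64_t ua = (uint64_t)a, ub = (uint64_t)b, s = ua + ub;
        /* Overflow iff a and b share a sign and the sum's sign differs. */
        if ((~(ua ^ ub) & (ua ^ s)) >> 63)
            return b < 0 ? INT64_MIN : INT64_MAX;
        return (int64_t)s;
    }

    int main(void)
    {
        printf("%lld\n", (long long)ssadd64(INT64_MAX, 1));   /* saturates high */
        printf("%lld\n", (long long)ssadd64(INT64_MIN, -1));  /* saturates low  */
        printf("%lld\n", (long long)ssadd64(5, -7));          /* -2             */
        return 0;
    }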
+
+#if defined (L_sssub_8)
+FALIAS __sssubta3
+FALIAS __sssubda3
+FALIAS __sssubdq3
+
+DEFUN __sssub_8
+ XCALL __subdi3
+ brvc 0f
+ ;; A = (B < 0) ? INT64_MAX : INT64_MIN
+ ldi A7, 0x7f
+ cp A7, B7
+ XCALL __sbc_8
+ subi A7, 0x80
+0: ret
+ENDF __sssub_8
+#endif /* L_sssub_8 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding Helpers
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_mask1
+
+#define AA 24
+#define CC 25
+
+;; R25 = 1 << (R24 & 7)
+;; CC = 1 << (AA & 7)
+;; Clobbers: None
+DEFUN __mask1
+ ;; CC = 2 ^ AA.1
+ ldi CC, 1 << 2
+ sbrs AA, 1
+ ldi CC, 1 << 0
+ ;; CC *= 2 ^ AA.0
+ sbrc AA, 0
+ lsl CC
+ ;; CC *= 2 ^ AA.2
+ sbrc AA, 2
+ swap CC
+ ret
+ENDF __mask1
+
+#undef AA
+#undef CC
+#endif /* L_mask1 */
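
The five instructions assemble 1 << (n & 7) from the three low bits of n: a
factor of 4 for bit 1, a doubling for bit 0, and a nibble swap (a multiply
by 16) for bit 2. The same decomposition in C, under illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t mask1(uint8_t n)
    {
        uint8_t c = (n & 2) ? 4 : 1;                /* 2 ^ n.1               */
        if (n & 1) c = (uint8_t)(c << 1);           /* times 2 ^ n.0         */
        if (n & 4) c = (uint8_t)(c << 4 | c >> 4);  /* nibble swap: times 16 */
        return c;
    }

    int main(void)
    {
        for (unsigned n = 0; n < 8; n++)
            printf("mask1(%u) = %#x\n", n, mask1((uint8_t)n));  /* 1, 2, ... 0x80 */
        return 0;
    }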
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The rounding point. Any bits smaller than
+;; 2^{-RP} will be cleared.
+#define RP R24
+
+#define A0 22
+#define A1 A0 + 1
+
+#define C0 24
+#define C1 C0 + 1
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 1 Byte
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_roundqq3
+
+;; R24 = round (R22, R24)
+;; Clobbers: R22, __tmp_reg__
+DEFUN __roundqq3
+ mov __tmp_reg__, C1
+ subi RP, __QQ_FBIT__ - 1
+ neg RP
+ ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
+ XCALL __mask1
+ mov C0, C1
+ ;; Add-Saturate 2^{-RP-1}
+ add A0, C0
+ brvc 0f
+ ldi C0, 0x7f
+ rjmp 9f
+0: ;; Mask out bits beyond RP
+ lsl C0
+ neg C0
+ and C0, A0
+9: mov C1, __tmp_reg__
+ ret
+ENDF __roundqq3
+#endif /* L_roundqq3 */
+
+#ifdef L_rounduqq3
+
+;; R24 = round (R22, R24)
+;; Clobbers: R22, __tmp_reg__
+DEFUN __rounduqq3
+ mov __tmp_reg__, C1
+ subi RP, __UQQ_FBIT__ - 1
+ neg RP
+ ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
+ XCALL __mask1
+ mov C0, C1
+ ;; Add-Saturate 2^{-RP-1}
+ add A0, C0
+ brcc 0f
+ ldi C0, 0xff
+ rjmp 9f
+0: ;; Mask out bits beyond RP
+ lsl C0
+ neg C0
+ and C0, A0
+9: mov C1, __tmp_reg__
+ ret
+ENDF __rounduqq3
+#endif /* L_rounduqq3 */
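
Both routines follow the same add-then-mask pattern: add 2^{-RP-1} with
saturation, then clear every bit below 2^{-RP}. A host model of the
unsigned byte case (__UQQ_FBIT__ == 8; names illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t rounduqq(uint8_t x, unsigned rp)  /* keep rp fraction bits, rp < 8 */
    {
        unsigned half = 1u << (8 - 1 - rp);       /* 2^{-rp-1}                */
        unsigned sum  = x + half;                 /* add-saturate ...         */
        if (sum > 0xff) return 0xff;              /* ... to 0xff = 1 - 2^-8   */
        return (uint8_t)(sum & ~(2 * half - 1));  /* clear bits below 2^{-rp} */
    }

    int main(void)
    {
        printf("%#x\n", rounduqq(0x6a, 4));  /* 0x70             */
        printf("%#x\n", rounduqq(0xff, 1));  /* 0xff (saturated) */
        return 0;
    }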
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 2 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_addmask_2
+
+;; [ R25:R24 = 1 << (R24 & 15)
+;; R23:R22 += 1 << (R24 & 15) ]
+;; SREG is set according to the addition
+DEFUN __addmask_2
+ ;; R25 = 1 << (R24 & 7)
+ XCALL __mask1
+ cpi RP, 1 << 3
+ sbc C0, C0
+	;; Swap C0 and C1 if RP.3 is not set
+ and C0, C1
+ eor C1, C0
+ ;; Finally, add the power-of-two: A[] += C[]
+ add A0, C0
+ adc A1, C1
+ ret
+ENDF __addmask_2
+#endif /* L_addmask_2 */
+
+#ifdef L_round_s2
+
+;; R25:R24 = round (R23:R22, R24)
+;; Clobbers: R23, R22
+DEFUN __roundhq3
+ subi RP, __HQ_FBIT__ - __HA_FBIT__
+ENDF __roundhq3
+DEFUN __roundha3
+ subi RP, __HA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R24 = 1 << (FBIT-1 - RP)
+ ;; R23:R22 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_2
+ XJMP __round_s2_const
+ENDF __roundha3
+
+#endif /* L_round_s2 */
+
+#ifdef L_round_u2
+
+;; R25:R24 = round (R23:R22, R24)
+;; Clobbers: R23, R22
+DEFUN __rounduhq3
+ subi RP, __UHQ_FBIT__ - __UHA_FBIT__
+ENDF __rounduhq3
+DEFUN __rounduha3
+ subi RP, __UHA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R24 = 1 << (FBIT-1 - RP)
+ ;; R23:R22 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_2
+ XJMP __round_u2_const
+ENDF __rounduha3
+
+#endif /* L_round_u2 */
+
+
+#ifdef L_round_2_const
+
+;; Helpers for 2 byte wide rounding
+
+DEFUN __round_s2_const
+ brvc 2f
+ ldi C1, 0x7f
+ rjmp 1f
+ ;; FALLTHRU (Barrier)
+ENDF __round_s2_const
+
+DEFUN __round_u2_const
+ brcc 2f
+ ldi C1, 0xff
+1:
+ ldi C0, 0xff
+ rjmp 9f
+2:
+	;; No overflow: no saturation is needed.
+ ;; Currently, we have C[] = 2^{-RP-1}
+ ;; C[] = 2^{-RP}
+ lsl C0
+ rol C1
+	;; C[] = -C[]
+ NEG2 C0
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+9: ret
+ENDF __round_u2_const
+
+#endif /* L_round_2_const */
+
+#undef A0
+#undef A1
+#undef C0
+#undef C1
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 4 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#define A0 18
+#define A1 A0 + 1
+#define A2 A0 + 2
+#define A3 A0 + 3
+
+#define C0 22
+#define C1 C0 + 1
+#define C2 C0 + 2
+#define C3 C0 + 3
+
+#ifdef L_addmask_4
+
+;; [ R25:R22 = 1 << (R24 & 31)
+;; R21:R18 += 1 << (R24 & 31) ]
+;; SREG is set according to the addition
+DEFUN __addmask_4
+ ;; R25 = 1 << (R24 & 7)
+ XCALL __mask1
+ cpi RP, 1 << 4
+ sbc C0, C0
+ sbc C1, C1
+ ;; Swap C2 with C3 if RP.3 is not set
+ cpi RP, 1 << 3
+ sbc C2, C2
+ and C2, C3
+ eor C3, C2
+ ;; Swap C3:C2 with C1:C0 if RP.4 is not set
+ and C0, C2 $ eor C2, C0
+ and C1, C3 $ eor C3, C1
+ ;; Finally, add the power-of-two: A[] += C[]
+ add A0, C0
+ adc A1, C1
+ adc A2, C2
+ adc A3, C3
+ ret
+ENDF __addmask_4
+#endif /* L_addmask_4 */
+
+#ifdef L_round_s4
+
+;; R25:R22 = round (R21:R18, R24)
+;; Clobbers: R18...R21
+DEFUN __roundsq3
+ subi RP, __SQ_FBIT__ - __SA_FBIT__
+ENDF __roundsq3
+DEFUN __roundsa3
+ subi RP, __SA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R22 = 1 << (FBIT-1 - RP)
+ ;; R21:R18 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_4
+ XJMP __round_s4_const
+ENDF __roundsa3
+
+#endif /* L_round_s4 */
+
+#ifdef L_round_u4
+
+;; R25:R22 = round (R21:R18, R24)
+;; Clobbers: R18...R21
+DEFUN __roundusq3
+ subi RP, __USQ_FBIT__ - __USA_FBIT__
+ENDF __roundusq3
+DEFUN __roundusa3
+ subi RP, __USA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R22 = 1 << (FBIT-1 - RP)
+ ;; R21:R18 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_4
+ XJMP __round_u4_const
+ENDF __roundusa3
+
+#endif /* L_round_u4 */
+
+
+#ifdef L_round_4_const
+
+;; Helpers for 4 byte wide rounding
+
+DEFUN __round_s4_const
+ brvc 2f
+ ldi C3, 0x7f
+ rjmp 1f
+ ;; FALLTHRU (Barrier)
+ENDF __round_s4_const
+
+DEFUN __round_u4_const
+ brcc 2f
+ ldi C3, 0xff
+1:
+ ldi C2, 0xff
+ ldi C1, 0xff
+ ldi C0, 0xff
+ rjmp 9f
+2:
+	;; No overflow: no saturation is needed.
+ ;; Currently, we have C[] = 2^{-RP-1}
+ ;; C[] = 2^{-RP}
+ lsl C0
+ rol C1
+ rol C2
+ rol C3
+ XCALL __negsi2
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+ and C2, A2
+ and C3, A3
+9: ret
+ENDF __round_u4_const
+
+#endif /* L_round_4_const */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#undef RP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 8 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#define RP 16
+#define FBITm1 31
+
+#define C0 18
+#define C1 C0 + 1
+#define C2 C0 + 2
+#define C3 C0 + 3
+#define C4 C0 + 4
+#define C5 C0 + 5
+#define C6 C0 + 6
+#define C7 C0 + 7
+
+#define A0 16
+#define A1 17
+#define A2 26
+#define A3 27
+#define A4 28
+#define A5 29
+#define A6 30
+#define A7 31
+
+
+#ifdef L_rounddq3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounddq3
+ ldi FBITm1, __DQ_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __rounddq3
+#endif /* L_rounddq3 */
+
+#ifdef L_roundudq3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundudq3
+ ldi FBITm1, __UDQ_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __roundudq3
+#endif /* L_roundudq3 */
+
+#ifdef L_roundda3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundda3
+ ldi FBITm1, __DA_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __roundda3
+#endif /* L_roundda3 */
+
+#ifdef L_rounduda3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounduda3
+ ldi FBITm1, __UDA_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __rounduda3
+#endif /* L_rounduda3 */
+
+#ifdef L_roundta3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundta3
+ ldi FBITm1, __TA_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __roundta3
+#endif /* L_roundta3 */
+
+#ifdef L_rounduta3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounduta3
+ ldi FBITm1, __UTA_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __rounduta3
+#endif /* L_rounduta3 */
+
+
+#ifdef L_round_x8
+DEFUN __round_x8
+ push r16
+ push r17
+ push r28
+ push r29
+ ;; Compute log2 of addend from rounding point
+ sub RP, FBITm1
+ neg RP
+ ;; Move input to work register A[]
+ push C0
+ mov A1, C1
+ wmov A2, C2
+ wmov A4, C4
+ wmov A6, C6
+ ;; C[] = 1 << (FBIT-1 - RP)
+ XCALL __clr_8
+ inc C0
+ XCALL __ashldi3
+ pop A0
+ ;; A[] += C[]
+ add A0, C0
+ adc A1, C1
+ adc A2, C2
+ adc A3, C3
+ adc A4, C4
+ adc A5, C5
+ adc A6, C6
+ adc A7, C7
+ brts 1f
+ ;; Signed
+ brvc 3f
+ ;; Signed overflow: A[] = 0x7f...
+ brvs 2f
+1: ;; Unsigned
+ brcc 3f
+ ;; Unsigned overflow: A[] = 0xff...
+2: ldi C7, 0xff
+ ldi C6, 0xff
+ wmov C0, C6
+ wmov C2, C6
+ wmov C4, C6
+ bld C7, 7
+ rjmp 9f
+3:
+ ;; C[] = -C[] - C[]
+ push A0
+ ldi r16, 1
+ XCALL __ashldi3
+ pop A0
+ XCALL __negdi2
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+ and C2, A2
+ and C3, A3
+ and C4, A4
+ and C5, A5
+ and C6, A6
+ and C7, A7
+9: ;; Epilogue
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ ret
+ENDF __round_x8
+
+#endif /* L_round_x8 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+
+#undef RP
+#undef FBITm1
+
+
+;; Supply implementations / symbols for the bit-banging functions
+;; __builtin_avr_bitsfx and __builtin_avr_fxbits
+#ifdef L_ret
+DEFUN __ret
+ ret
+ENDF __ret
+#endif /* L_ret */
+
+#endif /* if not __AVR_TINY__ */