Diffstat (limited to 'libgcc/config/avr/lib1funcs-fixed.S')
 libgcc/config/avr/lib1funcs-fixed.S | 1927 ++++++++++++++++++++++++++++++++++
 1 file changed, 1927 insertions(+), 0 deletions(-)
diff --git a/libgcc/config/avr/lib1funcs-fixed.S b/libgcc/config/avr/lib1funcs-fixed.S
new file mode 100644
index 0000000000..6dd68ee614
--- /dev/null
+++ b/libgcc/config/avr/lib1funcs-fixed.S
@@ -0,0 +1,1927 @@
+/* -*- Mode: Asm -*- */
+;; Copyright (C) 2012-2017 Free Software Foundation, Inc.
+;; Contributed by Sean D'Epagnier (sean@depagnier.com)
+;; Georg-Johann Lay (avr@gjlay.de)
+
+;; This file is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by the
+;; Free Software Foundation; either version 3, or (at your option) any
+;; later version.
+
+;; In addition to the permissions in the GNU General Public License, the
+;; Free Software Foundation gives you unlimited permission to link the
+;; compiled version of this file into combinations with other programs,
+;; and to distribute those combinations without any restriction coming
+;; from the use of this file. (The General Public License restrictions
+;; do apply in other respects; for example, they cover modification of
+;; the file, and distribution when not linked into a combine
+;; executable.)
+
+;; This file is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Fixed point library routines for AVR
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined __AVR_TINY__
+#define __zero_reg__ r17
+#define __tmp_reg__ r16
+#else
+#define __zero_reg__ r1
+#define __tmp_reg__ r0
+#endif
+
+.section .text.libgcc.fixed, "ax", @progbits
+
+#ifndef __AVR_TINY__
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Conversions to float
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined (L_fractqqsf)
+DEFUN __fractqqsf
+ ;; Move in place for SA -> SF conversion
+ clr r22
+ mov r23, r24
+ ;; Sign-extend
+ lsl r24
+ sbc r24, r24
+ mov r25, r24
+ XJMP __fractsasf
+ENDF __fractqqsf
+#endif /* L_fractqqsf */
+
+#if defined (L_fractuqqsf)
+DEFUN __fractuqqsf
+ ;; Move in place for USA -> SF conversion
+ clr r22
+ mov r23, r24
+ ;; Zero-extend
+ clr r24
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuqqsf
+#endif /* L_fractuqqsf */
+
+#if defined (L_fracthqsf)
+DEFUN __fracthqsf
+ ;; Move in place for SA -> SF conversion
+ wmov 22, 24
+ ;; Sign-extend
+ lsl r25
+ sbc r24, r24
+ mov r25, r24
+ XJMP __fractsasf
+ENDF __fracthqsf
+#endif /* L_fracthqsf */
+
+#if defined (L_fractuhqsf)
+DEFUN __fractuhqsf
+ ;; Move in place for USA -> SF conversion
+ wmov 22, 24
+ ;; Zero-extend
+ clr r24
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuhqsf
+#endif /* L_fractuhqsf */
+
+#if defined (L_fracthasf)
+DEFUN __fracthasf
+ ;; Move in place for SA -> SF conversion
+ clr r22
+ mov r23, r24
+ mov r24, r25
+ ;; Sign-extend
+ lsl r25
+ sbc r25, r25
+ XJMP __fractsasf
+ENDF __fracthasf
+#endif /* L_fracthasf */
+
+#if defined (L_fractuhasf)
+DEFUN __fractuhasf
+ ;; Move in place for USA -> SF conversion
+ clr r22
+ mov r23, r24
+ mov r24, r25
+ ;; Zero-extend
+ clr r25
+ XJMP __fractusasf
+ENDF __fractuhasf
+#endif /* L_fractuhasf */
+
+
+#if defined (L_fractsqsf)
+DEFUN __fractsqsf
+ XCALL __floatsisf
+ ;; Divide non-zero results by 2^31 to move the
+	;; binary point into place
+ tst r25
+ breq 0f
+ subi r24, exp_lo (31)
+ sbci r25, exp_hi (31)
+0: ret
+ENDF __fractsqsf
+#endif /* L_fractsqsf */
+
+#if defined (L_fractusqsf)
+DEFUN __fractusqsf
+ XCALL __floatunsisf
+ ;; Divide non-zero results by 2^32 to move the
+	;; binary point into place
+ cpse r25, __zero_reg__
+ subi r25, exp_hi (32)
+ ret
+ENDF __fractusqsf
+#endif /* L_fractusqsf */
+
+#if defined (L_fractsasf)
+DEFUN __fractsasf
+ XCALL __floatsisf
+ ;; Divide non-zero results by 2^15 to move the
+	;; binary point into place
+ tst r25
+ breq 0f
+ subi r24, exp_lo (15)
+ sbci r25, exp_hi (15)
+0: ret
+ENDF __fractsasf
+#endif /* L_fractsasf */
+
+#if defined (L_fractusasf)
+DEFUN __fractusasf
+ XCALL __floatunsisf
+ ;; Divide non-zero results by 2^16 to move the
+	;; binary point into place
+ cpse r25, __zero_reg__
+ subi r25, exp_hi (16)
+ ret
+ENDF __fractusasf
+#endif /* L_fractusasf */
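
The conversions above all follow one pattern: convert the underlying integer
with __floatsisf / __floatunsisf, then scale by 2^-FBIT via the exponent
bytes. A minimal host-side C model of that mapping (function names are
illustrative, not libgcc symbols):

    #include <stdio.h>
    #include <stdint.h>

    static float qq_to_float(int8_t x)    { return (float)x / 128.0f;   }  /* s.7    */
    static float sa_to_float(int32_t x)   { return (float)x / 32768.0f; }  /* s16.15 */
    static float usa_to_float(uint32_t x) { return (float)x / 65536.0f; }  /* u16.16 */

    int main(void)
    {
        printf("%f\n", qq_to_float(-64));          /* -0.50 */
        printf("%f\n", sa_to_float(0x00014000));   /*  2.50 */
        printf("%f\n", usa_to_float(0x00018000));  /*  1.50 */
        return 0;
    }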
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Conversions from float
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#if defined (L_fractsfqq)
+DEFUN __fractsfqq
+ ;; Multiply with 2^{24+7} to get a QQ result in r25
+ subi r24, exp_lo (-31)
+ sbci r25, exp_hi (-31)
+ XCALL __fixsfsi
+ mov r24, r25
+ ret
+ENDF __fractsfqq
+#endif /* L_fractsfqq */
+
+#if defined (L_fractsfuqq)
+DEFUN __fractsfuqq
+ ;; Multiply with 2^{24+8} to get a UQQ result in r25
+ subi r25, exp_hi (-32)
+ XCALL __fixunssfsi
+ mov r24, r25
+ ret
+ENDF __fractsfuqq
+#endif /* L_fractsfuqq */
+
+#if defined (L_fractsfha)
+DEFUN __fractsfha
+ ;; Multiply with 2^{16+7} to get a HA result in r25:r24
+ subi r24, exp_lo (-23)
+ sbci r25, exp_hi (-23)
+ XJMP __fixsfsi
+ENDF __fractsfha
+#endif /* L_fractsfha */
+
+#if defined (L_fractsfuha)
+DEFUN __fractsfuha
+ ;; Multiply with 2^24 to get a UHA result in r25:r24
+ subi r25, exp_hi (-24)
+ XJMP __fixunssfsi
+ENDF __fractsfuha
+#endif /* L_fractsfuha */
+
+#if defined (L_fractsfhq)
+FALIAS __fractsfsq
+
+DEFUN __fractsfhq
+ ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
+ ;; resp. with 2^31 to get a SQ result in r25:r22
+ subi r24, exp_lo (-31)
+ sbci r25, exp_hi (-31)
+ XJMP __fixsfsi
+ENDF __fractsfhq
+#endif /* L_fractsfhq */
+
+#if defined (L_fractsfuhq)
+FALIAS __fractsfusq
+
+DEFUN __fractsfuhq
+ ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
+ ;; resp. with 2^32 to get a USQ result in r25:r22
+ subi r25, exp_hi (-32)
+ XJMP __fixunssfsi
+ENDF __fractsfuhq
+#endif /* L_fractsfuhq */
+
+#if defined (L_fractsfsa)
+DEFUN __fractsfsa
+ ;; Multiply with 2^15 to get a SA result in r25:r22
+ subi r24, exp_lo (-15)
+ sbci r25, exp_hi (-15)
+ XJMP __fixsfsi
+ENDF __fractsfsa
+#endif /* L_fractsfsa */
+
+#if defined (L_fractsfusa)
+DEFUN __fractsfusa
+ ;; Multiply with 2^16 to get a USA result in r25:r22
+ subi r25, exp_hi (-16)
+ XJMP __fixunssfsi
+ENDF __fractsfusa
+#endif /* L_fractsfusa */
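
The reverse direction multiplies by 2^FBIT first (the subi/sbci on the
exponent bytes) and then fixes to an integer. A host-side sketch of the
intended value mapping, with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static int16_t float_to_ha(float f) { return (int16_t)(f * 128.0f);   }  /* s8.7   */
    static int32_t float_to_sa(float f) { return (int32_t)(f * 32768.0f); }  /* s16.15 */

    int main(void)
    {
        printf("%#06x\n", (unsigned)(uint16_t)float_to_ha(1.5f));          /* 0x00c0     */
        printf("%#010lx\n", (unsigned long)(uint32_t)float_to_sa(-2.0f));  /* 0xffff0000 */
        return 0;
    }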
+
+
+;; For multiplication the functions here are called directly from
+;; avr-fixed.md instead of using the standard libcall mechanisms.
+;; This can make better code because GCC knows exactly which
+;; of the call-used registers (not all of them) are clobbered. */
+
+/*******************************************************
+ Fractional Multiplication 8 x 8 without MUL
+*******************************************************/
+
+#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
+;;; R23 = R24 * R25
+;;; Clobbers: __tmp_reg__, R22, R24, R25
+;;; Rounding: ???
+DEFUN __mulqq3
+ XCALL __fmuls
+ ;; TR 18037 requires that (-1) * (-1) does not overflow
+ ;; The only input that can produce -1 is (-1)^2.
+ dec r23
+ brvs 0f
+ inc r23
+0: ret
+ENDF __mulqq3
+#endif /* L_mulqq3 && ! HAVE_MUL */
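
A host-side model of that saturation rule, assuming plain truncation of the
discarded bits (the routine's own rounding is marked "???" above):

    #include <stdio.h>
    #include <stdint.h>

    static int8_t mulqq(int8_t a, int8_t b)
    {
        int16_t p = (int16_t)((int16_t)a * b);  /* Q14 product               */
        int16_t q = (int16_t)(p >> 7);          /* back to Q7                */
        if (q > 127) q = 127;                   /* only (-1)*(-1) reaches +1 */
        return (int8_t)q;
    }

    int main(void)
    {
        printf("%d\n", mulqq(-128, -128));  /* 127 = 1 - 2^-7 */
        printf("%d\n", mulqq(64, 64));      /*  32 = 0.25     */
        return 0;
    }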
+
+/*******************************************************
+ Fractional Multiply .16 x .16 with and without MUL
+*******************************************************/
+
+#if defined (L_mulhq3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulhq3
+ XCALL __mulhisi3
+ ;; Shift result into place
+ lsl r23
+ rol r24
+ rol r25
+ brvs 1f
+ ;; Round
+ sbrc r23, 7
+ adiw r24, 1
+ ret
+1: ;; Overflow. TR 18037 requires (-1)^2 not to overflow
+ ldi r24, lo8 (0x7fff)
+ ldi r25, hi8 (0x7fff)
+ ret
+ENDF __mulhq3
+#endif /* defined (L_mulhq3) */
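
In C terms, the routine computes a Q15 product with round-half-up at the
highest discarded bit; a host model (assuming the usual arithmetic right
shift for negative values):

    #include <stdio.h>
    #include <stdint.h>

    static int16_t mulhq(int16_t a, int16_t b)
    {
        if (a == INT16_MIN && b == INT16_MIN)
            return INT16_MAX;                  /* (-1)*(-1) saturates to 0x7fff */
        int32_t p = 2 * ((int32_t)a * b);      /* Q30 product scaled to Q31     */
        return (int16_t)((p + 0x8000) >> 16);  /* round half up                 */
    }

    int main(void)
    {
        printf("%#x\n", (unsigned)(uint16_t)mulhq(0x4000, 0x4000));  /* 0x2000 = 0.25 */
        printf("%#x\n", (unsigned)(uint16_t)mulhq(-32768, -32768));  /* 0x7fff        */
        return 0;
    }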
+
+#if defined (L_muluhq3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) *= (R23:R22)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB < error <= 0.5 LSB
+DEFUN __muluhq3
+ XCALL __umulhisi3
+ ;; Round
+ sbrc r23, 7
+ adiw r24, 1
+ ret
+ENDF __muluhq3
+#endif /* L_muluhq3 */
+
+
+/*******************************************************
+ Fixed Multiply 8.8 x 8.8 with and without MUL
+*******************************************************/
+
+#if defined (L_mulha3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulha3
+ XCALL __mulhisi3
+ lsl r22
+ rol r23
+ rol r24
+ XJMP __muluha3_round
+ENDF __mulha3
+#endif /* L_mulha3 */
+
+#if defined (L_muluha3)
+;;; Same code with and without MUL, but the interfaces differ:
+;;; no MUL: (R25:R24) *= (R23:R22)
+;;; Clobbers: ABI, called by optabs
+;;; MUL: (R25:R24) = (R19:R18) * (R27:R26)
+;;; Clobbers: __tmp_reg__, R22, R23
+;;; Rounding: -0.5 LSB < error <= 0.5 LSB
+DEFUN __muluha3
+ XCALL __umulhisi3
+ XJMP __muluha3_round
+ENDF __muluha3
+#endif /* L_muluha3 */
+
+#if defined (L_muluha3_round)
+DEFUN __muluha3_round
+ ;; Shift result into place
+ mov r25, r24
+ mov r24, r23
+ ;; Round
+ sbrc r22, 7
+ adiw r24, 1
+ ret
+ENDF __muluha3_round
+#endif /* L_muluha3_round */
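
The 8.8 flavor narrows the u16.16 widening product to its middle 16 bits
with the same round-half-up; integral overflow wraps, as in the unsaturated
routines above. A host model with illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t muluha(uint16_t a, uint16_t b)
    {
        uint32_t p = (uint32_t)a * b;        /* u16.16 widening product */
        return (uint16_t)((p + 0x80) >> 8);  /* keep 8.8, round half up */
    }

    int main(void)
    {
        printf("%#x\n", muluha(0x0180, 0x0200));  /* 1.5 * 2.0 = 3.0 -> 0x300 */
        return 0;
    }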
+
+
+/*******************************************************
+ Fixed Multiplication 16.16 x 16.16
+*******************************************************/
+
+;; Bits outside the result (below LSB), used in the signed version
+#define GUARD __tmp_reg__
+
+#if defined (__AVR_HAVE_MUL__)
+
+;; Multiplier
+#define A0 16
+#define A1 A0+1
+#define A2 A1+1
+#define A3 A2+1
+
+;; Multiplicand
+#define B0 20
+#define B1 B0+1
+#define B2 B1+1
+#define B3 B2+1
+
+;; Result
+#define C0 24
+#define C1 C0+1
+#define C2 C1+1
+#define C3 C2+1
+
+#if defined (L_mulusa3)
+;;; (C3:C0) = (A3:A0) * (B3:B0)
+DEFUN __mulusa3
+ set
+ ;; Fallthru
+ENDF __mulusa3
+
+;;; Round the last digit iff T = 1
+;;; Return guard bits in GUARD (__tmp_reg__).
+;;; Rounding, T = 0: -1.0 LSB < error <= 0 LSB
+;;; Rounding, T = 1: -0.5 LSB < error <= 0.5 LSB
+DEFUN __mulusa3_round
+ ;; Some of the MUL instructions have LSBs outside the result.
+ ;; Don't ignore these LSBs in order to tame rounding error.
+ ;; Use C2/C3 for these LSBs.
+
+ clr C0
+ clr C1
+ mul A0, B0 $ movw C2, r0
+
+ mul A1, B0 $ add C3, r0 $ adc C0, r1
+ mul A0, B1 $ add C3, r0 $ adc C0, r1 $ rol C1
+
+	;; Round if T = 1.  Store the guard bits outside the result; the signed
+	;; version (function below) uses them for rounding and for its left shift.
+ brtc 0f
+ sbrc C3, 7
+ adiw C0, 1
+0: push C3
+
+ ;; The following MULs don't have LSBs outside the result.
+ ;; C2/C3 is the high part.
+
+ mul A0, B2 $ add C0, r0 $ adc C1, r1 $ sbc C2, C2
+ mul A1, B1 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0
+ mul A2, B0 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0
+ neg C2
+
+ mul A0, B3 $ add C1, r0 $ adc C2, r1 $ sbc C3, C3
+ mul A1, B2 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ mul A2, B1 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ mul A3, B0 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
+ neg C3
+
+ mul A1, B3 $ add C2, r0 $ adc C3, r1
+ mul A2, B2 $ add C2, r0 $ adc C3, r1
+ mul A3, B1 $ add C2, r0 $ adc C3, r1
+
+ mul A2, B3 $ add C3, r0
+ mul A3, B2 $ add C3, r0
+
+ ;; Guard bits used in the signed version below.
+ pop GUARD
+ clr __zero_reg__
+ ret
+ENDF __mulusa3_round
+#endif /* L_mulusa3 */
+
+#if defined (L_mulsa3)
+;;; (C3:C0) = (A3:A0) * (B3:B0)
+;;; Clobbers: __tmp_reg__, T
+;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
+DEFUN __mulsa3
+ clt
+ XCALL __mulusa3_round
+ ;; A posteriori sign extension of the operands
+ tst B3
+ brpl 1f
+ sub C2, A0
+ sbc C3, A1
+1: sbrs A3, 7
+ rjmp 2f
+ sub C2, B0
+ sbc C3, B1
+2:
+ ;; Shift 1 bit left to adjust for 15 fractional bits
+ lsl GUARD
+ rol C0
+ rol C1
+ rol C2
+ rol C3
+ ;; Round last digit
+ lsl GUARD
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ ret
+ENDF __mulsa3
+#endif /* L_mulsa3 */
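
The "a posteriori sign extension" rests on a two's-complement identity:
the signed product equals the unsigned product minus y<<32 when x < 0 and
minus x<<32 when y < 0, taken modulo 2^64. A host-side check of the
identity (shown as a full 32x32 -> 64 multiply; the routine applies the
same correction to its 32-bit result window):

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    static int64_t signed_via_unsigned(int32_t x, int32_t y)
    {
        uint64_t p = (uint64_t)(uint32_t)x * (uint32_t)y;
        if (x < 0) p -= (uint64_t)(uint32_t)y << 32;  /* correct for x's sign bit */
        if (y < 0) p -= (uint64_t)(uint32_t)x << 32;  /* correct for y's sign bit */
        return (int64_t)p;
    }

    int main(void)
    {
        printf("%" PRId64 "\n", signed_via_unsigned(-3, 7));       /* -21    */
        printf("%" PRId64 "\n", signed_via_unsigned(-40000, -5));  /* 200000 */
        return 0;
    }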
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#else /* __AVR_HAVE_MUL__ */
+
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+
+#define B0 22
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+
+#define C0 22
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+
+;; __tmp_reg__
+#define CC0 0
+;; __zero_reg__
+#define CC1 1
+#define CC2 16
+#define CC3 17
+
+#define AA0 26
+#define AA1 AA0+1
+#define AA2 30
+#define AA3 AA2+1
+
+#if defined (L_mulsa3)
+;;; (R25:R22) *= (R21:R18)
+;;; Clobbers: ABI, called by optabs
+;;; Rounding: -1 LSB <= error <= 1 LSB
+DEFUN __mulsa3
+ push B0
+ push B1
+ push B3
+ clt
+ XCALL __mulusa3_round
+ pop r30
+ ;; sign-extend B
+ bst r30, 7
+ brtc 1f
+ ;; A1, A0 survived in R27:R26
+ sub C2, AA0
+ sbc C3, AA1
+1:
+ pop AA1 ;; B1
+ pop AA0 ;; B0
+
+ ;; sign-extend A. A3 survived in R31
+ bst AA3, 7
+ brtc 2f
+ sub C2, AA0
+ sbc C3, AA1
+2:
+ ;; Shift 1 bit left to adjust for 15 fractional bits
+ lsl GUARD
+ rol C0
+ rol C1
+ rol C2
+ rol C3
+ ;; Round last digit
+ lsl GUARD
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ ret
+ENDF __mulsa3
+#endif /* L_mulsa3 */
+
+#if defined (L_mulusa3)
+;;; (R25:R22) *= (R21:R18)
+;;; Clobbers: ABI, called by optabs
+;;; Rounding: -1 LSB <= error <= 1 LSB
+DEFUN __mulusa3
+ set
+ ;; Fallthru
+ENDF __mulusa3
+
+;;; A[] survives in 26, 27, 30, 31
+;;; Also used by __mulsa3 with T = 0
+;;; Round if T = 1
+;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
+DEFUN __mulusa3_round
+ push CC2
+ push CC3
+ ; clear result
+ clr __tmp_reg__
+ wmov CC2, CC0
+ ; save multiplicand
+ wmov AA0, A0
+ wmov AA2, A2
+ rjmp 3f
+
+ ;; Loop the integral part
+
+1: ;; CC += A * 2^n; n >= 0
+ add CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
+
+2: ;; A <<= 1
+ lsl A0 $ rol A1 $ rol A2 $ rol A3
+
+3: ;; IBIT(B) >>= 1
+ ;; Carry = n-th bit of B; n >= 0
+ lsr B3
+ ror B2
+ brcs 1b
+ sbci B3, 0
+ brne 2b
+
+ ;; Loop the fractional part
+ ;; B2/B3 is 0 now, use as guard bits for rounding
+ ;; Restore multiplicand
+ wmov A0, AA0
+ wmov A2, AA2
+ rjmp 5f
+
+4: ;; CC += A:Guard * 2^n; n < 0
+ add B3,B2 $ adc CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
+5:
+ ;; A:Guard >>= 1
+ lsr A3 $ ror A2 $ ror A1 $ ror A0 $ ror B2
+
+ ;; FBIT(B) <<= 1
+ ;; Carry = n-th bit of B; n < 0
+ lsl B0
+ rol B1
+ brcs 4b
+ sbci B0, 0
+ brne 5b
+
+ ;; Save guard bits and set carry for rounding
+ push B3
+ lsl B3
+ ;; Move result into place
+ wmov C2, CC2
+ wmov C0, CC0
+ clr __zero_reg__
+ brtc 6f
+ ;; Round iff T = 1
+ adc C0, __zero_reg__
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+6:
+ pop GUARD
+ ;; Epilogue
+ pop CC3
+ pop CC2
+ ret
+ENDF __mulusa3_round
+#endif /* L_mulusa3 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef AA0
+#undef AA1
+#undef AA2
+#undef AA3
+#undef CC0
+#undef CC1
+#undef CC2
+#undef CC3
+
+#endif /* __AVR_HAVE_MUL__ */
+
+#undef GUARD
+
+/***********************************************************
+ Fixed unsigned saturated Multiplication 8.8 x 8.8
+***********************************************************/
+
+#define C0 22
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+#define SS __tmp_reg__
+
+#if defined (L_usmuluha3)
+DEFUN __usmuluha3
+ ;; Widening multiply
+#ifdef __AVR_HAVE_MUL__
+ ;; Adjust interface
+ movw R26, R22
+ movw R18, R24
+#endif /* HAVE MUL */
+ XCALL __umulhisi3
+ tst C3
+ brne .Lmax
+ ;; Round, target is in C1..C2
+ lsl C0
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ brcs .Lmax
+ ;; Move result into place
+ mov C3, C2
+ mov C2, C1
+ ret
+.Lmax:
+ ;; Saturate
+ ldi C2, 0xff
+ ldi C3, 0xff
+ ret
+ENDF __usmuluha3
+#endif /* L_usmuluha3 */
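
In C terms: a rounded u8.8 product, clamped to the unsigned maximum when
the integral part does not fit. A host model:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t usmuluha(uint16_t a, uint16_t b)
    {
        uint32_t p = ((uint32_t)a * b + 0x80) >> 8;  /* rounded u8.8 product */
        return p > 0xffff ? 0xffff : (uint16_t)p;    /* unsigned saturation  */
    }

    int main(void)
    {
        printf("%#x\n", usmuluha(0x8000, 0x0200));  /* 128.0 * 2.0 -> 0xffff (sat) */
        printf("%#x\n", usmuluha(0x0280, 0x0200));  /*   2.5 * 2.0 -> 0x500        */
        return 0;
    }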
+
+/***********************************************************
+ Fixed signed saturated Multiplication s8.7 x s8.7
+***********************************************************/
+
+#if defined (L_ssmulha3)
+DEFUN __ssmulha3
+ ;; Widening multiply
+#ifdef __AVR_HAVE_MUL__
+ ;; Adjust interface
+ movw R26, R22
+ movw R18, R24
+#endif /* HAVE MUL */
+ XCALL __mulhisi3
+	;; Adjust binary point
+ lsl C0
+ rol C1
+ rol C2
+ brvs .LsatC3.3
+ ;; The 9 MSBs must be the same
+ rol C3
+ sbc SS, SS
+ cp C3, SS
+ brne .LsatSS
+ ;; Round
+ lsl C0
+ adc C1, __zero_reg__
+ adc C2, __zero_reg__
+ brvs .Lmax
+ ;; Move result into place
+ mov C3, C2
+ mov C2, C1
+ ret
+.Lmax:
+ ;; Load 0x7fff
+ clr C3
+.LsatC3.3:
+ ;; C3 < 0 --> 0x8000
+ ;; C3 >= 0 --> 0x7fff
+ mov SS, C3
+.LsatSS:
+ ;; Load min / max value:
+ ;; SS = -1 --> 0x8000
+ ;; SS = 0 --> 0x7fff
+ ldi C3, 0x7f
+ ldi C2, 0xff
+ sbrc SS, 7
+ adiw C2, 1
+ ret
+ENDF __ssmulha3
+#endif /* L_ssmulha3 */
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef SS
+
+/***********************************************************
+ Fixed unsigned saturated Multiplication 16.16 x 16.16
+***********************************************************/
+
+#define C0 18
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+#define C4 C0+4
+#define C5 C0+5
+#define C6 C0+6
+#define C7 C0+7
+#define SS __tmp_reg__
+
+#if defined (L_usmulusa3)
+;; R22[4] = R22[4] *{usat} R18[4]
+;; Ordinary ABI function
+DEFUN __usmulusa3
+ ;; Widening multiply
+ XCALL __umulsidi3
+ or C7, C6
+ brne .Lmax
+ ;; Round, target is in C2..C5
+ lsl C1
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ adc C4, __zero_reg__
+ adc C5, __zero_reg__
+ brcs .Lmax
+ ;; Move result into place
+ wmov C6, C4
+ wmov C4, C2
+ ret
+.Lmax:
+ ;; Saturate
+ ldi C7, 0xff
+ ldi C6, 0xff
+ wmov C4, C6
+ ret
+ENDF __usmulusa3
+#endif /* L_usmulusa3 */
+
+/***********************************************************
+ Fixed signed saturated Multiplication s16.15 x s16.15
+***********************************************************/
+
+#if defined (L_ssmulsa3)
+;; R22[4] = R22[4] *{ssat} R18[4]
+;; Ordinary ABI function
+DEFUN __ssmulsa3
+ ;; Widening multiply
+ XCALL __mulsidi3
+	;; Adjust binary point
+ lsl C1
+ rol C2
+ rol C3
+ rol C4
+ rol C5
+ brvs .LsatC7.7
+ ;; The 17 MSBs must be the same
+ rol C6
+ rol C7
+ sbc SS, SS
+ cp C6, SS
+ cpc C7, SS
+ brne .LsatSS
+ ;; Round
+ lsl C1
+ adc C2, __zero_reg__
+ adc C3, __zero_reg__
+ adc C4, __zero_reg__
+ adc C5, __zero_reg__
+ brvs .Lmax
+ ;; Move result into place
+ wmov C6, C4
+ wmov C4, C2
+ ret
+
+.Lmax:
+ ;; Load 0x7fffffff
+ clr C7
+.LsatC7.7:
+ ;; C7 < 0 --> 0x80000000
+ ;; C7 >= 0 --> 0x7fffffff
+ lsl C7
+ sbc SS, SS
+.LsatSS:
+ ;; Load min / max value:
+ ;; SS = -1 --> 0x80000000
+ ;; SS = 0 --> 0x7fffffff
+ com SS
+ mov C4, SS
+ mov C5, C4
+ wmov C6, C4
+ subi C7, 0x80
+ ret
+ENDF __ssmulsa3
+#endif /* L_ssmulsa3 */
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#undef SS
+
+/*******************************************************
+ Fractional Division 8 / 8
+*******************************************************/
+
+#define r_divd r25 /* dividend */
+#define r_quo r24 /* quotient */
+#define r_div r22 /* divisor */
+#define r_sign __tmp_reg__
+
+#if defined (L_divqq3)
+DEFUN __divqq3
+ mov r_sign, r_divd
+ eor r_sign, r_div
+ sbrc r_div, 7
+ neg r_div
+ sbrc r_divd, 7
+ neg r_divd
+ XCALL __divqq_helper
+ lsr r_quo
+ sbrc r_sign, 7 ; negate result if needed
+ neg r_quo
+ ret
+ENDF __divqq3
+#endif /* L_divqq3 */
+
+#if defined (L_udivuqq3)
+DEFUN __udivuqq3
+ cp r_divd, r_div
+ brsh 0f
+ XJMP __divqq_helper
+ ;; Result is out of [0, 1) ==> Return 1 - eps.
+0: ldi r_quo, 0xff
+ ret
+ENDF __udivuqq3
+#endif /* L_udivuqq3 */
+
+
+#if defined (L_divqq_helper)
+DEFUN __divqq_helper
+ clr r_quo ; clear quotient
+ inc __zero_reg__ ; init loop counter, used per shift
+__udivuqq3_loop:
+ lsl r_divd ; shift dividend
+ brcs 0f ; dividend overflow
+ cp r_divd,r_div ; compare dividend & divisor
+ brcc 0f ; dividend >= divisor
+ rol r_quo ; shift quotient (with CARRY)
+ rjmp __udivuqq3_cont
+0:
+ sub r_divd,r_div ; restore dividend
+ lsl r_quo ; shift quotient (without CARRY)
+__udivuqq3_cont:
+ lsl __zero_reg__ ; shift loop-counter bit
+ brne __udivuqq3_loop
+ com r_quo ; complement result
+ ; because C flag was complemented in loop
+ ret
+ENDF __divqq_helper
+#endif /* L_divqq_helper */
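
The helper is a textbook bit-serial restoring division that produces the
8 fraction bits of d/v; the caller guarantees d < v. A host model (the
carry-complement trick above is replaced by a plain quotient-bit update):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t udivuqq(uint8_t d, uint8_t v)  /* requires d < v */
    {
        uint8_t  q   = 0;
        uint16_t rem = d;
        for (int i = 0; i < 8; i++) {
            rem <<= 1;             /* bring down the next (zero) bit  */
            q   <<= 1;
            if (rem >= v) {        /* divisor fits: quotient bit is 1 */
                rem -= v;
                q   |= 1;
            }
        }
        return q;                  /* floor (d * 256 / v) */
    }

    int main(void)
    {
        printf("%#x\n", udivuqq(1, 2));  /* 0x80 = 0.5     */
        printf("%#x\n", udivuqq(1, 3));  /* 0x55 ~ 0.33333 */
        return 0;
    }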
+
+#undef r_divd
+#undef r_quo
+#undef r_div
+#undef r_sign
+
+
+/*******************************************************
+ Fractional Division 16 / 16
+*******************************************************/
+#define r_divdL 26 /* dividend Low */
+#define r_divdH 27 /* dividend High */
+#define r_quoL 24 /* quotient Low */
+#define r_quoH 25 /* quotient High */
+#define r_divL 22 /* divisor Low */
+#define r_divH 23 /* divisor High */
+#define r_cnt 21
+
+#if defined (L_divhq3)
+DEFUN __divhq3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ cp r_divdL, r_divL
+ cpc r_divdH, r_divH
+ breq __divhq3_minus1 ; if equal return -1
+ XCALL __udivuhq3
+ lsr r_quoH
+ ror r_quoL
+ brpl 9f
+ ;; negate result if needed
+ NEG2 r_quoL
+9:
+ ret
+__divhq3_minus1:
+ ldi r_quoH, 0x80
+ clr r_quoL
+ ret
+ENDF __divhq3
+#endif /* defined (L_divhq3) */
+
+#if defined (L_udivuhq3)
+DEFUN __udivuhq3
+ sub r_quoH,r_quoH ; clear quotient and carry
+ ;; FALLTHRU
+ENDF __udivuhq3
+
+DEFUN __udivuha3_common
+ clr r_quoL ; clear quotient
+ ldi r_cnt,16 ; init loop counter
+__udivuhq3_loop:
+ rol r_divdL ; shift dividend (with CARRY)
+ rol r_divdH
+ brcs __udivuhq3_ep ; dividend overflow
+ cp r_divdL,r_divL ; compare dividend & divisor
+ cpc r_divdH,r_divH
+ brcc __udivuhq3_ep ; dividend >= divisor
+ rol r_quoL ; shift quotient (with CARRY)
+ rjmp __udivuhq3_cont
+__udivuhq3_ep:
+ sub r_divdL,r_divL ; restore dividend
+ sbc r_divdH,r_divH
+ lsl r_quoL ; shift quotient (without CARRY)
+__udivuhq3_cont:
+ rol r_quoH ; shift quotient
+ dec r_cnt ; decrement loop counter
+ brne __udivuhq3_loop
+ com r_quoL ; complement result
+ com r_quoH ; because C flag was complemented in loop
+ ret
+ENDF __udivuha3_common
+#endif /* defined (L_udivuhq3) */
+
+/*******************************************************
+ Fixed Division 8.8 / 8.8
+*******************************************************/
+#if defined (L_divha3)
+DEFUN __divha3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ XCALL __udivuha3
+ lsr r_quoH ; adjust to 7 fractional bits
+ ror r_quoL
+ sbrs r0, 7 ; negate result if needed
+ ret
+ NEG2 r_quoL
+ ret
+ENDF __divha3
+#endif /* defined (L_divha3) */
+
+#if defined (L_udivuha3)
+DEFUN __udivuha3
+ mov r_quoH, r_divdL
+ mov r_divdL, r_divdH
+ clr r_divdH
+ lsl r_quoH ; shift quotient into carry
+ XJMP __udivuha3_common ; same as fractional after rearrange
+ENDF __udivuha3
+#endif /* defined (L_udivuha3) */
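
The byte shuffle above amounts to pre-shifting the dividend by 8 bits, so
the 8.8 quotient is the integer quotient (d << 8) / v (truncating; quotient
overflow wraps, as in the asm). A one-line host model:

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t udivuha(uint16_t d, uint16_t v)
    {
        return (uint16_t)(((uint32_t)d << 8) / v);  /* truncating u8.8 quotient */
    }

    int main(void)
    {
        printf("%#x\n", udivuha(0x0300, 0x0200));  /* 3.0 / 2.0 = 1.5 -> 0x180 */
        return 0;
    }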
+
+#undef r_divdL
+#undef r_divdH
+#undef r_quoL
+#undef r_quoH
+#undef r_divL
+#undef r_divH
+#undef r_cnt
+
+/*******************************************************
+ Fixed Division 16.16 / 16.16
+*******************************************************/
+
+#define r_arg1L 24 /* arg1 gets passed already in place */
+#define r_arg1H 25
+#define r_arg1HL 26
+#define r_arg1HH 27
+#define r_divdL 26 /* dividend Low */
+#define r_divdH 27
+#define r_divdHL 30
+#define r_divdHH 31 /* dividend High */
+#define r_quoL 22 /* quotient Low */
+#define r_quoH 23
+#define r_quoHL 24
+#define r_quoHH 25 /* quotient High */
+#define r_divL 18 /* divisor Low */
+#define r_divH 19
+#define r_divHL 20
+#define r_divHH 21 /* divisor High */
+#define r_cnt __zero_reg__ /* loop count (0 after the loop!) */
+
+#if defined (L_divsa3)
+DEFUN __divsa3
+ mov r0, r_arg1HH
+ eor r0, r_divHH
+ sbrs r_divHH, 7
+ rjmp 1f
+ NEG4 r_divL
+1:
+ sbrs r_arg1HH, 7
+ rjmp 2f
+ NEG4 r_arg1L
+2:
+ XCALL __udivusa3
+ lsr r_quoHH ; adjust to 15 fractional bits
+ ror r_quoHL
+ ror r_quoH
+ ror r_quoL
+ sbrs r0, 7 ; negate result if needed
+ ret
+	;; negate the whole 32-bit quotient (r_quoHH:r_quoL)
+ XJMP __negsi2
+ENDF __divsa3
+#endif /* defined (L_divsa3) */
+
+#if defined (L_udivusa3)
+DEFUN __udivusa3
+ ldi r_divdHL, 32 ; init loop counter
+ mov r_cnt, r_divdHL
+ clr r_divdHL
+ clr r_divdHH
+ wmov r_quoL, r_divdHL
+ lsl r_quoHL ; shift quotient into carry
+ rol r_quoHH
+__udivusa3_loop:
+ rol r_divdL ; shift dividend (with CARRY)
+ rol r_divdH
+ rol r_divdHL
+ rol r_divdHH
+ brcs __udivusa3_ep ; dividend overflow
+ cp r_divdL,r_divL ; compare dividend & divisor
+ cpc r_divdH,r_divH
+ cpc r_divdHL,r_divHL
+ cpc r_divdHH,r_divHH
+ brcc __udivusa3_ep ; dividend >= divisor
+ rol r_quoL ; shift quotient (with CARRY)
+ rjmp __udivusa3_cont
+__udivusa3_ep:
+ sub r_divdL,r_divL ; restore dividend
+ sbc r_divdH,r_divH
+ sbc r_divdHL,r_divHL
+ sbc r_divdHH,r_divHH
+ lsl r_quoL ; shift quotient (without CARRY)
+__udivusa3_cont:
+ rol r_quoH ; shift quotient
+ rol r_quoHL
+ rol r_quoHH
+ dec r_cnt ; decrement loop counter
+ brne __udivusa3_loop
+ com r_quoL ; complement result
+ com r_quoH ; because C flag was complemented in loop
+ com r_quoHL
+ com r_quoHH
+ ret
+ENDF __udivusa3
+#endif /* defined (L_udivusa3) */
+
+#undef r_arg1L
+#undef r_arg1H
+#undef r_arg1HL
+#undef r_arg1HH
+#undef r_divdL
+#undef r_divdH
+#undef r_divdHL
+#undef r_divdHH
+#undef r_quoL
+#undef r_quoH
+#undef r_quoHL
+#undef r_quoHH
+#undef r_divL
+#undef r_divH
+#undef r_divHL
+#undef r_divHH
+#undef r_cnt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 1 Byte
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 24
+
+#if defined (L_ssabs_1)
+DEFUN __ssabs_1
+ sbrs A0, 7
+ ret
+ neg A0
+ sbrc A0,7
+ dec A0
+ ret
+ENDF __ssabs_1
+#endif /* L_ssabs_1 */
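
The only special case is the two's-complement minimum, whose negation is
not representable; a host model:

    #include <stdio.h>
    #include <stdint.h>

    static int8_t ssabs1(int8_t a)
    {
        if (a >= 0)    return a;
        if (a == -128) return 127;   /* -(-128) does not fit: saturate */
        return (int8_t)-a;
    }

    int main(void)
    {
        printf("%d %d %d\n", ssabs1(-128), ssabs1(-5), ssabs1(17));  /* 127 5 17 */
        return 0;
    }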
+
+#undef A0
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 2 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 24
+#define A1 A0+1
+
+#if defined (L_ssneg_2)
+DEFUN __ssneg_2
+ NEG2 A0
+ brvc 0f
+ sbiw A0, 1
+0: ret
+ENDF __ssneg_2
+#endif /* L_ssneg_2 */
+
+#if defined (L_ssabs_2)
+DEFUN __ssabs_2
+ sbrs A1, 7
+ ret
+ XJMP __ssneg_2
+ENDF __ssabs_2
+#endif /* L_ssabs_2 */
+
+#undef A0
+#undef A1
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 4 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 22
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+
+#if defined (L_ssneg_4)
+DEFUN __ssneg_4
+ XCALL __negsi2
+ brvc 0f
+ ldi A3, 0x7f
+ ldi A2, 0xff
+ ldi A1, 0xff
+ ldi A0, 0xff
+0: ret
+ENDF __ssneg_4
+#endif /* L_ssneg_4 */
+
+#if defined (L_ssabs_4)
+DEFUN __ssabs_4
+ sbrs A3, 7
+ ret
+ XJMP __ssneg_4
+ENDF __ssabs_4
+#endif /* L_ssabs_4 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Saturation, 8 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; First Argument and Return Register
+#define A0 18
+#define A1 A0+1
+#define A2 A0+2
+#define A3 A0+3
+#define A4 A0+4
+#define A5 A0+5
+#define A6 A0+6
+#define A7 A0+7
+
+#if defined (L_clr_8)
+FALIAS __usneguta2
+FALIAS __usneguda2
+FALIAS __usnegudq2
+
+;; Clear Carry and all Bytes
+DEFUN __clr_8
+ ;; Clear Carry and set Z
+ sub A7, A7
+ ;; FALLTHRU
+ENDF __clr_8
+;; Propagate Carry to all Bytes, Carry unaltered
+DEFUN __sbc_8
+ sbc A7, A7
+ sbc A6, A6
+ wmov A4, A6
+ wmov A2, A6
+ wmov A0, A6
+ ret
+ENDF __sbc_8
+#endif /* L_clr_8 */
+
+#if defined (L_ssneg_8)
+FALIAS __ssnegta2
+FALIAS __ssnegda2
+FALIAS __ssnegdq2
+
+DEFUN __ssneg_8
+ XCALL __negdi2
+ brvc 0f
+ ;; A[] = 0x7fffffff
+ sec
+ XCALL __sbc_8
+ ldi A7, 0x7f
+0: ret
+ENDF __ssneg_8
+#endif /* L_ssneg_8 */
+
+#if defined (L_ssabs_8)
+FALIAS __ssabsta2
+FALIAS __ssabsda2
+FALIAS __ssabsdq2
+
+DEFUN __ssabs_8
+ sbrs A7, 7
+ ret
+ XJMP __ssneg_8
+ENDF __ssabs_8
+#endif /* L_ssabs_8 */
+
+;; Second Argument
+#define B0 10
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+#define B4 B0+4
+#define B5 B0+5
+#define B6 B0+6
+#define B7 B0+7
+
+#if defined (L_usadd_8)
+FALIAS __usadduta3
+FALIAS __usadduda3
+FALIAS __usaddudq3
+
+DEFUN __usadd_8
+ XCALL __adddi3
+ brcs 0f
+ ret
+0: ;; A[] = 0xffffffff
+ XJMP __sbc_8
+ENDF __usadd_8
+#endif /* L_usadd_8 */
+
+#if defined (L_ussub_8)
+FALIAS __ussubuta3
+FALIAS __ussubuda3
+FALIAS __ussubudq3
+
+DEFUN __ussub_8
+ XCALL __subdi3
+ brcs 0f
+ ret
+0: ;; A[] = 0
+ XJMP __clr_8
+ENDF __ussub_8
+#endif /* L_ussub_8 */
+
+#if defined (L_ssadd_8)
+FALIAS __ssaddta3
+FALIAS __ssaddda3
+FALIAS __ssadddq3
+
+DEFUN __ssadd_8
+ XCALL __adddi3
+ brvc 0f
+ ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
+ cpi B7, 0x80
+ XCALL __sbc_8
+ subi A7, 0x80
+0: ret
+ENDF __ssadd_8
+#endif /* L_ssadd_8 */
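
The min/max selection reads only B's sign: when signed addition overflows,
A and B necessarily have the same sign, so B's sign alone picks the bound.
A host model of that rule:

    #include <stdio.h>
    #include <stdint.h>

    static int64_t ssadd64(int64_t a, int64_t b)
    {
        uint64_t ua = (uint64_t)a, ub = (uint64_t)b, s = ua + ub;
        /* Overflow iff a and b share a sign and the sum's sign differs. */
        if ((~(ua ^ ub) & (ua ^ s)) >> 63)
            return b < 0 ? INT64_MIN : INT64_MAX;
        return (int64_t)s;
    }

    int main(void)
    {
        printf("%lld\n", (long long)ssadd64(INT64_MAX, 1));   /* saturates high */
        printf("%lld\n", (long long)ssadd64(INT64_MIN, -1));  /* saturates low  */
        printf("%lld\n", (long long)ssadd64(5, -7));          /* -2             */
        return 0;
    }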
+
+#if defined (L_sssub_8)
+FALIAS __sssubta3
+FALIAS __sssubda3
+FALIAS __sssubdq3
+
+DEFUN __sssub_8
+ XCALL __subdi3
+ brvc 0f
+ ;; A = (B < 0) ? INT64_MAX : INT64_MIN
+ ldi A7, 0x7f
+ cp A7, B7
+ XCALL __sbc_8
+ subi A7, 0x80
+0: ret
+ENDF __sssub_8
+#endif /* L_sssub_8 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+#undef B4
+#undef B5
+#undef B6
+#undef B7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding Helpers
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_mask1
+
+#define AA 24
+#define CC 25
+
+;; R25 = 1 << (R24 & 7)
+;; CC = 1 << (AA & 7)
+;; Clobbers: None
+DEFUN __mask1
+ ;; CC = 2 ^ AA.1
+ ldi CC, 1 << 2
+ sbrs AA, 1
+ ldi CC, 1 << 0
+ ;; CC *= 2 ^ AA.0
+ sbrc AA, 0
+ lsl CC
+ ;; CC *= 2 ^ AA.2
+ sbrc AA, 2
+ swap CC
+ ret
+ENDF __mask1
+
+#undef AA
+#undef CC
+#endif /* L_mask1 */
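
The five instructions assemble 1 << (n & 7) from the three low bits of n: a
factor of 4 for bit 1, a doubling for bit 0, and a nibble swap (a multiply
by 16) for bit 2. The same decomposition in C, under illustrative names:

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t mask1(uint8_t n)
    {
        uint8_t c = (n & 2) ? 4 : 1;                /* 2 ^ n.1               */
        if (n & 1) c = (uint8_t)(c << 1);           /* times 2 ^ n.0         */
        if (n & 4) c = (uint8_t)(c << 4 | c >> 4);  /* nibble swap: times 16 */
        return c;
    }

    int main(void)
    {
        for (unsigned n = 0; n < 8; n++)
            printf("mask1(%u) = %#x\n", n, mask1((uint8_t)n));  /* 1, 2, ... 0x80 */
        return 0;
    }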
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The rounding point. Any bits smaller than
+;; 2^{-RP} will be cleared.
+#define RP R24
+
+#define A0 22
+#define A1 A0 + 1
+
+#define C0 24
+#define C1 C0 + 1
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 1 Byte
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_roundqq3
+
+;; R24 = round (R22, R24)
+;; Clobbers: R22, __tmp_reg__
+DEFUN __roundqq3
+ mov __tmp_reg__, C1
+ subi RP, __QQ_FBIT__ - 1
+ neg RP
+ ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
+ XCALL __mask1
+ mov C0, C1
+ ;; Add-Saturate 2^{-RP-1}
+ add A0, C0
+ brvc 0f
+ ldi C0, 0x7f
+ rjmp 9f
+0: ;; Mask out bits beyond RP
+ lsl C0
+ neg C0
+ and C0, A0
+9: mov C1, __tmp_reg__
+ ret
+ENDF __roundqq3
+#endif /* L_roundqq3 */
+
+#ifdef L_rounduqq3
+
+;; R24 = round (R22, R24)
+;; Clobbers: R22, __tmp_reg__
+DEFUN __rounduqq3
+ mov __tmp_reg__, C1
+ subi RP, __UQQ_FBIT__ - 1
+ neg RP
+ ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
+ XCALL __mask1
+ mov C0, C1
+ ;; Add-Saturate 2^{-RP-1}
+ add A0, C0
+ brcc 0f
+ ldi C0, 0xff
+ rjmp 9f
+0: ;; Mask out bits beyond RP
+ lsl C0
+ neg C0
+ and C0, A0
+9: mov C1, __tmp_reg__
+ ret
+ENDF __rounduqq3
+#endif /* L_rounduqq3 */
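
Both routines follow the same add-then-mask pattern: add 2^{-RP-1} with
saturation, then clear every bit below 2^{-RP}. A host model of the
unsigned byte case (__UQQ_FBIT__ == 8; names illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t rounduqq(uint8_t x, unsigned rp)  /* keep rp fraction bits, rp < 8 */
    {
        unsigned half = 1u << (8 - 1 - rp);       /* 2^{-rp-1}                */
        unsigned sum  = x + half;                 /* add-saturate ...         */
        if (sum > 0xff) return 0xff;              /* ... to 0xff = 1 - 2^-8   */
        return (uint8_t)(sum & ~(2 * half - 1));  /* clear bits below 2^{-rp} */
    }

    int main(void)
    {
        printf("%#x\n", rounduqq(0x6a, 4));  /* 0x70             */
        printf("%#x\n", rounduqq(0xff, 1));  /* 0xff (saturated) */
        return 0;
    }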
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 2 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#ifdef L_addmask_2
+
+;; [ R25:R24 = 1 << (R24 & 15)
+;; R23:R22 += 1 << (R24 & 15) ]
+;; SREG is set according to the addition
+DEFUN __addmask_2
+ ;; R25 = 1 << (R24 & 7)
+ XCALL __mask1
+ cpi RP, 1 << 3
+ sbc C0, C0
+	;; Swap C0 and C1 if RP.3 is not set
+ and C0, C1
+ eor C1, C0
+ ;; Finally, add the power-of-two: A[] += C[]
+ add A0, C0
+ adc A1, C1
+ ret
+ENDF __addmask_2
+#endif /* L_addmask_2 */
+
+#ifdef L_round_s2
+
+;; R25:R24 = round (R23:R22, R24)
+;; Clobbers: R23, R22
+DEFUN __roundhq3
+ subi RP, __HQ_FBIT__ - __HA_FBIT__
+ENDF __roundhq3
+DEFUN __roundha3
+ subi RP, __HA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R24 = 1 << (FBIT-1 - RP)
+ ;; R23:R22 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_2
+ XJMP __round_s2_const
+ENDF __roundha3
+
+#endif /* L_round_s2 */
+
+#ifdef L_round_u2
+
+;; R25:R24 = round (R23:R22, R24)
+;; Clobbers: R23, R22
+DEFUN __rounduhq3
+ subi RP, __UHQ_FBIT__ - __UHA_FBIT__
+ENDF __rounduhq3
+DEFUN __rounduha3
+ subi RP, __UHA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R24 = 1 << (FBIT-1 - RP)
+ ;; R23:R22 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_2
+ XJMP __round_u2_const
+ENDF __rounduha3
+
+#endif /* L_round_u2 */
+
+
+#ifdef L_round_2_const
+
+;; Helpers for 2 byte wide rounding
+
+DEFUN __round_s2_const
+ brvc 2f
+ ldi C1, 0x7f
+ rjmp 1f
+ ;; FALLTHRU (Barrier)
+ENDF __round_s2_const
+
+DEFUN __round_u2_const
+ brcc 2f
+ ldi C1, 0xff
+1:
+ ldi C0, 0xff
+ rjmp 9f
+2:
+	;; No overflow: no saturation is needed.
+ ;; Currently, we have C[] = 2^{-RP-1}
+ ;; C[] = 2^{-RP}
+ lsl C0
+ rol C1
+	;; C[] = -C[]
+ NEG2 C0
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+9: ret
+ENDF __round_u2_const
+
+#endif /* L_round_2_const */
+
+#undef A0
+#undef A1
+#undef C0
+#undef C1
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 4 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#define A0 18
+#define A1 A0 + 1
+#define A2 A0 + 2
+#define A3 A0 + 3
+
+#define C0 22
+#define C1 C0 + 1
+#define C2 C0 + 2
+#define C3 C0 + 3
+
+#ifdef L_addmask_4
+
+;; [ R25:R22 = 1 << (R24 & 31)
+;; R21:R18 += 1 << (R24 & 31) ]
+;; SREG is set according to the addition
+DEFUN __addmask_4
+ ;; R25 = 1 << (R24 & 7)
+ XCALL __mask1
+ cpi RP, 1 << 4
+ sbc C0, C0
+ sbc C1, C1
+ ;; Swap C2 with C3 if RP.3 is not set
+ cpi RP, 1 << 3
+ sbc C2, C2
+ and C2, C3
+ eor C3, C2
+ ;; Swap C3:C2 with C1:C0 if RP.4 is not set
+ and C0, C2 $ eor C2, C0
+ and C1, C3 $ eor C3, C1
+ ;; Finally, add the power-of-two: A[] += C[]
+ add A0, C0
+ adc A1, C1
+ adc A2, C2
+ adc A3, C3
+ ret
+ENDF __addmask_4
+#endif /* L_addmask_4 */
+
+#ifdef L_round_s4
+
+;; R25:R22 = round (R21:R18, R24)
+;; Clobbers: R18...R21
+DEFUN __roundsq3
+ subi RP, __SQ_FBIT__ - __SA_FBIT__
+ENDF __roundsq3
+DEFUN __roundsa3
+ subi RP, __SA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R22 = 1 << (FBIT-1 - RP)
+ ;; R21:R18 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_4
+ XJMP __round_s4_const
+ENDF __roundsa3
+
+#endif /* L_round_s4 */
+
+#ifdef L_round_u4
+
+;; R25:R22 = round (R21:R18, R24)
+;; Clobbers: R18...R21
+DEFUN __roundusq3
+ subi RP, __USQ_FBIT__ - __USA_FBIT__
+ENDF __roundusq3
+DEFUN __roundusa3
+ subi RP, __USA_FBIT__ - 1
+ neg RP
+ ;; [ R25:R22 = 1 << (FBIT-1 - RP)
+ ;; R21:R18 += 1 << (FBIT-1 - RP) ]
+ XCALL __addmask_4
+ XJMP __round_u4_const
+ENDF __roundusa3
+
+#endif /* L_round_u4 */
+
+
+#ifdef L_round_4_const
+
+;; Helpers for 4 byte wide rounding
+
+DEFUN __round_s4_const
+ brvc 2f
+ ldi C3, 0x7f
+ rjmp 1f
+ ;; FALLTHRU (Barrier)
+ENDF __round_s4_const
+
+DEFUN __round_u4_const
+ brcc 2f
+ ldi C3, 0xff
+1:
+ ldi C2, 0xff
+ ldi C1, 0xff
+ ldi C0, 0xff
+ rjmp 9f
+2:
+	;; No overflow: no saturation is needed.
+ ;; Currently, we have C[] = 2^{-RP-1}
+ ;; C[] = 2^{-RP}
+ lsl C0
+ rol C1
+ rol C2
+ rol C3
+ XCALL __negsi2
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+ and C2, A2
+ and C3, A3
+9: ret
+ENDF __round_u4_const
+
+#endif /* L_round_4_const */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#undef RP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rounding, 8 Bytes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+#define RP 16
+#define FBITm1 31
+
+#define C0 18
+#define C1 C0 + 1
+#define C2 C0 + 2
+#define C3 C0 + 3
+#define C4 C0 + 4
+#define C5 C0 + 5
+#define C6 C0 + 6
+#define C7 C0 + 7
+
+#define A0 16
+#define A1 17
+#define A2 26
+#define A3 27
+#define A4 28
+#define A5 29
+#define A6 30
+#define A7 31
+
+
+#ifdef L_rounddq3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounddq3
+ ldi FBITm1, __DQ_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __rounddq3
+#endif /* L_rounddq3 */
+
+#ifdef L_roundudq3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundudq3
+ ldi FBITm1, __UDQ_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __roundudq3
+#endif /* L_roundudq3 */
+
+#ifdef L_roundda3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundda3
+ ldi FBITm1, __DA_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __roundda3
+#endif /* L_roundda3 */
+
+#ifdef L_rounduda3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounduda3
+ ldi FBITm1, __UDA_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __rounduda3
+#endif /* L_rounduda3 */
+
+#ifdef L_roundta3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __roundta3
+ ldi FBITm1, __TA_FBIT__ - 1
+ clt
+ XJMP __round_x8
+ENDF __roundta3
+#endif /* L_roundta3 */
+
+#ifdef L_rounduta3
+;; R25:R18 = round (R25:R18, R16)
+;; Clobbers: ABI
+DEFUN __rounduta3
+ ldi FBITm1, __UTA_FBIT__ - 1
+ set
+ XJMP __round_x8
+ENDF __rounduta3
+#endif /* L_rounduta3 */
+
+
+#ifdef L_round_x8
+DEFUN __round_x8
+ push r16
+ push r17
+ push r28
+ push r29
+ ;; Compute log2 of addend from rounding point
+ sub RP, FBITm1
+ neg RP
+ ;; Move input to work register A[]
+ push C0
+ mov A1, C1
+ wmov A2, C2
+ wmov A4, C4
+ wmov A6, C6
+ ;; C[] = 1 << (FBIT-1 - RP)
+ XCALL __clr_8
+ inc C0
+ XCALL __ashldi3
+ pop A0
+ ;; A[] += C[]
+ add A0, C0
+ adc A1, C1
+ adc A2, C2
+ adc A3, C3
+ adc A4, C4
+ adc A5, C5
+ adc A6, C6
+ adc A7, C7
+ brts 1f
+ ;; Signed
+ brvc 3f
+ ;; Signed overflow: A[] = 0x7f...
+ brvs 2f
+1: ;; Unsigned
+ brcc 3f
+ ;; Unsigned overflow: A[] = 0xff...
+2: ldi C7, 0xff
+ ldi C6, 0xff
+ wmov C0, C6
+ wmov C2, C6
+ wmov C4, C6
+ bld C7, 7
+ rjmp 9f
+3:
+ ;; C[] = -C[] - C[]
+ push A0
+ ldi r16, 1
+ XCALL __ashldi3
+ pop A0
+ XCALL __negdi2
+ ;; Clear the bits beyond the rounding point.
+ and C0, A0
+ and C1, A1
+ and C2, A2
+ and C3, A3
+ and C4, A4
+ and C5, A5
+ and C6, A6
+ and C7, A7
+9: ;; Epilogue
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ ret
+ENDF __round_x8
+
+#endif /* L_round_x8 */
+
+#undef A0
+#undef A1
+#undef A2
+#undef A3
+#undef A4
+#undef A5
+#undef A6
+#undef A7
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+
+#undef RP
+#undef FBITm1
+
+
+;; Supply implementations / symbols for the bit-banging functions
+;; __builtin_avr_bitsfx and __builtin_avr_fxbits
+#ifdef L_ret
+DEFUN __ret
+ ret
+ENDF __ret
+#endif /* L_ret */
+
+#endif /* if not __AVR_TINY__ */