From 1a8fa518934840fce207cc6a74256a28477beccd Mon Sep 17 00:00:00 2001
From: Torbjorn Granlund <tg@gmplib.org>
Date: Sun, 17 Jan 2021 22:19:23 +0100
Subject: Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.

---
 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm | 710 +++++++++++++++++++++++++++++++++++
 1 file changed, 710 insertions(+)
 create mode 100644 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm

(limited to 'mpn')

diff --git a/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
new file mode 100644
index 000000000..ff3512422
--- /dev/null
+++ b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
@@ -0,0 +1,710 @@
+dnl  AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl  Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	 ?		 ?
+C AMD zn1	 ?		 ?
+C AMD zn2	 ?		 ?
+C AMD zn3	 ?		 ?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.68		n/a
+C Intel BWL	 1.51	      1.67-1.74
+C Intel SKL	 1.52	      1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Do overlapped software pipelining.
+C  * Reduce register use, i.e., by combining n_neg and n_save.
+C  * Suppress initial store through up, it's always a zero.
+C  * Streamline up and dp setup.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl  mp_limb_t
+dnl  mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl		       mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up',      `%rdi')	C arg 1: U operand pointer
+define(`un',      `%rsi')	C arg 2: U limb count, reused as outer loop count
+define(`dp_param',`%rdx')	C arg 3: D operand pointer as passed in
+define(`dn_param',`%rcx')	C arg 4: D limb count as passed in
+define(`dinv',    `%r8')	C arg 5: inverse used to form each quotient limb
+
+define(`n',       `%rcx')	C inner loop counter, tested with jrcxz
+define(`n_save',  `%rbp')	C dn/8, reloaded into n for every outer iteration
+define(`dp',      `%r14')	C working copy of the D pointer
+define(`n_neg',   `%rbx')	C -8*(dn-1), rewinds up and dp after each row
+define(`q',       `%rdx')	C current quotient limb, implicit mulx operand
+define(`jaddr',   `%rax')	C feed-in address selected by dn mod 8
+
+define(`w0',	`%r12')	C w0-w3: product limbs in flight
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')	C largest dn with a dedicated routine
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)	C entry point: dispatch on dn
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')	C DOS64: fetch 5th argument (dinv) from the stack
+
+	lea	L(atab)(%rip), %r10	C r10 = jump table base
+
+	cmp	$MAX_SPECIAL, dn_param	C small divisor?
+	jbe	L(sma)			C yes: use a size-specific routine; else fall into L(gen)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')	C with MAX_SPECIAL below 8, the unused size labels alias L(gen)
+
+L(gen):	push	%rbx			C general case: save the registers used below
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry passed between outer iterations
+
+	sub	dn_param, un		C outer loop count
+
+	lea	-8(,dn_param,8), n_neg	C 8*(dn-1)
+	neg	n_neg			C n_neg = -8*(dn-1)
+	mov	dn_param, n_save
+	mov	R32(dn_param), R32(%rax)
+	shr	$3, n_save		C loop count
+	and	$7, R32(%rax)		C rax = dn mod 8; clear CF and OF as side-effect
+
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax	C 4-byte table entry relative to the table base
+	lea	(%rax,%r10), jaddr	C jaddr = absolute feed-in address
+',`
+	mov	(%r10,%rax,8), jaddr	C 8-byte absolute table entry
+')
+	mov	(up), q
+	imul	dinv, q			C first quotient limb = up[0] * dinv
+	jmp	L(outer)
+
+L(f0):	mulx(	(dp), w2, w3)		C feed-in for dn mod 8 = 0
+	lea	-1(n), n		C adjust loop count for this entry
+	mulx(	8,(dp), w0, w1)
+	lea	-8(dp), dp		C bias dp for the offsets used at L(b0x)
+	adcx(	w3, w0)
+	adox(	(up), w2)		C low sum is not stored, only its carry is used
+	lea	-8(up), up		C bias up for the offsets used at L(b0x)
+	jmp	L(b0x)
+
+L(f3):	mulx(	(dp), w0, w1)		C feed-in for dn mod 8 = 3
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)		C low sum is not stored, only its carry is used
+	lea	-48(up), up		C bias up, L(b3x) adds 64
+	lea	16(dp), dp
+	jmp	L(b3x)
+
+L(f4):	mulx(	(dp), w2, w3)		C feed-in for dn mod 8 = 4
+	mulx(	8,(dp), w0, w1)
+	lea	24(dp), dp
+	adox(	(up), w2)
+	lea	-40(up), up
+	adcx(	w3, w0)
+	jmp	L(b4x)
+
+L(f5):	mulx(	(dp), w0, w1)		C feed-in for dn mod 8 = 5
+	mulx(	8,(dp), w2, w3)
+	lea	32(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-32(up), up
+	jmp	L(b5x)
+
+L(f6):	mulx(	(dp), w2, w3)		C feed-in for dn mod 8 = 6
+	mulx(	8,(dp), w0, w1)
+	lea	40(dp), dp
+	adox(	(up), w2)
+	lea	-24(up), up
+	adcx(	w3, w0)
+	jmp	L(b6x)
+
+L(f7):	mulx(	(dp), w0, w1)		C feed-in for dn mod 8 = 7
+	mulx(	8,(dp), w2, w3)
+	lea	48(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-16(up), up
+	jmp	L(b7x)
+
+L(f1):	mulx(	(dp), w0, w1)		C feed-in for dn mod 8 = 1
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-1(n), n		C adjust loop count for this entry
+	jmp	L(b1x)
+
+L(f2):	mulx(	(dp), w2, w3)		C feed-in for dn mod 8 = 2
+	mulx(	8,(dp), w0, w1)
+	lea	8(dp), dp
+	adox(	(up), w2)
+	lea	8(up), up
+	adcx(	w3, w0)
+	jmp	L(b2x)
+
+L(end):	adox(	(up), w0)		C row wind-down: add the last low product limb
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (up)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	8(up,n_neg), q		C Compute next quotient early...
+	mulx(	dinv, q, %r12)		C ...(unused in last iteration)
+	bt	$0, R32(%r13)		C CF = carry saved by the previous row
+	adc	w1, 8(up)		C add high limb and saved carry into the next limb
+	setc	R8(%r13)		C save carry-out for the next row or the return value
+	dec	un			C clear OF as side-effect
+	jz	L(done)
+
+	lea	(dp,n_neg), dp		C reset dp to D[]'s beginning
+	lea	8(up,n_neg), up		C point up to U[]'s current beginning
+L(outer):
+	mov	n_save, n		C reload inner loop count
+	test	%eax, %eax		C clear CF and OF
+	jmp	*jaddr			C enter the feed-in code chosen at L(gen)
+
+	ALIGN(16)
+L(top):	adox(	-8,(up), w2)		C 8-way unrolled loop: adcx chains products, adox adds up[]
+	adcx(	w3, w0)
+	mov	w2, -8(up)
+	jrcxz	L(end)			C leave when n (rcx) is 0
+L(b2x):	mulx(	8,(dp), w2, w3)		C entry from L(f2)
+	adox(	(up), w0)
+	lea	-1(n), n		C count down without touching flags
+	mov	w0, (up)
+L(b1x):	adcx(	w1, w2)			C entry from L(f1)
+	mulx(	16,(dp), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(up), w2)
+	mov	w2, 8(up)
+L(b0x):	mulx(	24,(dp), w2, w3)	C entry from L(f0)
+	lea	64(dp), dp		C advance dp by 8 limbs
+	adcx(	w1, w2)
+	adox(	16,(up), w0)
+	mov	w0, 16(up)
+L(b7x):	mulx(	-32,(dp), w0, w1)	C entry from L(f7)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(up)
+L(b6x):	mulx(	-24,(dp), w2, w3)	C entry from L(f6)
+	adcx(	w1, w2)
+	adox(	32,(up), w0)
+	mov	w0, 32(up)
+L(b5x):	mulx(	-16,(dp), w0, w1)	C entry from L(f5)
+	adox(	40,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(up)
+L(b4x):	adox(	48,(up), w0)		C entry from L(f4)
+	mulx(	-8,(dp), w2, w3)
+	mov	w0, 48(up)
+L(b3x):	lea	64(up), up		C entry from L(f3); advance up by 8 limbs
+	adcx(	w1, w2)
+	mulx(	(dp), w0, w1)
+	jmp	L(top)
+
+L(done):mov	%r13, %rax		C return the final carry
+	pop	%r14			C restore the registers pushed at L(gen)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(sma):				C small dn: table slots 8..15 hold L(1)..L(8)
+ifdef(`PIC',
+`	movslq	28(%r10,dn_param,4), %rax	C slot dn+7, 4-byte entry relative to the table base
+	lea	(%rax,%r10), jaddr		C jaddr = absolute address
+',`
+	mov	56(%r10,dn_param,8), jaddr	C slot dn+7, 8-byte absolute entry
+')
+	jmp	*jaddr
+
+L(1):	mov	(dp_param), %r10	C dn = 1: r10 = dp[0]
+	xor	R32(%rax), R32(%rax)	C rax = running carry, initially 0
+	mov	(up), %rdx
+	dec	un			C un - 1 iterations
+	mov	%rdx, %r9		C r9 = current low limb, kept in a register
+L(o1):	mulx(	dinv, %rdx, %r11)	C next quotient
+	lea	8(up), up
+	mulx(	%r10, %rcx, %rdx)	C 0 1
+	add	%r9, %rcx		C 0
+	adc	%rax, %rdx		C 1
+	add	(up), %rdx		C 1
+	setc	R8(%rax)		C 2
+	mov	%rdx, %r9		C 1
+	dec	un
+	jnz	L(o1)
+	mov	%r9, (up)		C write back the limb kept in r9
+
+	FUNC_EXIT()
+	ret				C rax = final carry
+
+ifdef(`VER',,`define(`VER',1)')	C default to variant 1 of the dn = 2 code
+L(2):	push	%r12
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	sub	dn_param, un		C loop count
+	mov	(up), q
+	imul	dinv, q			C first quotient limb = up[0] * dinv
+
+ifelse(VER,0,`
+	xor	R32(%rax), R32(%rax)	C variant 0: limbs updated in memory, rax = carry
+L(o2):	test	%eax, %eax		C clear CF and OF
+	mulx(	(dp), w2, w3)		C 0 1
+	mulx(	8,(dp), %rdx, w1)		C 1 2
+	add	(up), w2		C 0
+	adc	8(up), %rdx		C 1
+	adc	$0, w1			C 2 cannot carry further
+	add	w3, %rdx			C 1
+	mov	%rdx, 8(up)		C 1
+	adc	$0, w1			C 2
+	imul	dinv, q			C next quotient from the new low limb
+	bt	$0, R32(%rax)
+	adc	16(up), w1		C 2
+	mov	w1, 16(up)
+	setc	R8(%rax)
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+')
+ifelse(VER,1,`
+	push	%rbx			C variant 1: adcx/adox chains, two low limbs kept in rax, rbx
+	push	%r13
+	xor	R32(%r13), R32(%r13)	C r13 = carry between iterations
+	mov	(up), %rax
+	mov	8(up), %rbx
+L(o2):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF and OF
+	mulx(	(dp), w2, w3)		C 0 1
+	mulx(	8,(dp), %rdx, w1)	C 1 2
+	adox(	%rax, w2)		C 0
+	adcx(	w3, %rdx)		C 1
+	adox(	%rbx, %rdx)		C 1
+	adox(	%rcx, w1)		C 2 cannot carry further
+	mov	%rdx, %rax		C 1
+	adc	%rcx, w1		C 2
+	imul	dinv, q			C next quotient from the new low limb
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	16(up), w1		C 2
+	mov	w1, %rbx
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%rax, (up)		C write back the two limbs kept in registers
+	mov	%rbx, 8(up)
+	mov	%r13, %rax		C return the final carry
+	pop	%r13
+	pop	%rbx
+')
+ifelse(VER,2,`
+	xor	R32(%rax), R32(%rax)	C variant 2: plain add/adc, limbs kept in r10, r9
+	mov	(up), %r10
+	mov	8(up), %r9
+L(o2):	mulx(	(dp), %r12, %r11)
+	mulx(	8,(dp), %rdx, %rcx)
+	add	%r11, %rdx		C 1
+	adc	$0, %rcx		C 2
+	add	%r10, %r12		C 0  add just to produce carry
+	adc	%r9, %rdx		C 1
+	mov	%rdx, %r10		C 1
+	mulx(	dinv, %rdx, %r12)	C next quotient
+	adc	%rax, %rcx		C 2
+	setc	R8(%rax)		C 3
+	mov	16(up), %r9		C 2
+	add	%rcx, %r9		C 2
+	adc	$0, R32(%rax)		C 3
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%r10, (up)		C write back the two limbs kept in registers
+	mov	%r9, 8(up)
+')
+ifelse(VER,3,`
+	xor	R32(%rax), R32(%rax)	C variant 3: as variant 2 with a different add order
+	mov	(up), %r10
+	mov	8(up), %r9
+L(o2):	mulx(	(dp), %r12, %r11)
+	add	%r10, %r12		C 0  add just to produce carry
+	mulx(	8,(dp), %rdx, %rcx)
+	adc	%r11, %rdx		C 1
+	adc	$0, %rcx		C 2
+	add	%r9, %rdx		C 1
+	mov	%rdx, %r10		C 1
+	mulx(	dinv, %rdx, %r12)	C next quotient
+	adc	%rax, %rcx		C 2
+	setc	R8(%rax)		C 3
+	mov	16(up), %r9		C 2
+	add	%rcx, %r9		C 2
+	adc	$0, R32(%rax)		C 3
+	lea	8(up), up
+	dec	un
+	jnz	L(o2)
+
+	mov	%r10, (up)		C write back the two limbs kept in registers
+	mov	%r9, 8(up)
+')
+	pop	%r14			C common exit for all variants
+	pop	%r12
+	FUNC_EXIT()
+	ret
+
+ifelse(eval(MAX_SPECIAL>=3),1,`
+L(3):	push	%rbx			C dn = 3, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o3):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w0, w1)		C 0 1
+	adox(	%rax, w0)		C 0
+	mulx(	8,(dp), %rax, w3)	C 1 2
+	adcx(	w1, %rax)		C 1
+	adox(	%rbx, %rax)		C 1
+	mulx(	16,(dp), %rbx, w1)	C 2 3
+	mov	dinv, q			C 1
+	mulx(	%rax, q, w0)		C next quotient = dinv * new low limb
+	adcx(	w3, %rbx)		C 2
+	adox(	16,(up), %rbx)		C 2
+	adox(	%rcx, w1)		C 3
+	adc	$0, w1			C 3
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 24(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o3)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+ifelse(eval(MAX_SPECIAL>=4),1,`
+L(4):	push	%rbx			C dn = 4, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o4):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	mov	dinv, q
+	mulx(	%rax, q, w2)		C next quotient = dinv * new low limb
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	adox(	24,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 24(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 32(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o4)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+ifelse(eval(MAX_SPECIAL>=5),1,`
+L(5):	push	%rbx			C dn = 5, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o5):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w0, w1)
+	adox(	%rax, w0)
+	mulx(	8,(dp), %rax, w3)
+	adcx(	w1, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w1)
+	adcx(	w3, %rbx)
+	adox(	16,(up), %rbx)
+	mulx(	24,(dp), w2, w3)
+	adcx(	w1, w2)
+	mulx(	32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)		C next quotient = dinv * new low limb
+	mov	w2, 24(up)
+	adox(	32,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 32(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 40(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o5)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+ifelse(eval(MAX_SPECIAL>=6),1,`
+L(6):	push	%rbx			C dn = 6, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o6):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	adox(	24,(up), w0)
+	mulx(	32,(dp), w2, w3)
+	mov	w0, 24(up)
+	adcx(	w1, w2)
+	mulx(	40,(dp), w0, w1)
+	adox(	32,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)		C next quotient = dinv * new low limb
+	mov	w2, 32(up)
+	adox(	40,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 40(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 48(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o6)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+ifelse(eval(MAX_SPECIAL>=7),1,`
+L(7):	push	%rbx			C dn = 7, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o7):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w0, w1)
+	adox(	%rax, w0)
+	mulx(	8,(dp), %rax, w3)
+	adcx(	w1, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w1)
+	adcx(	w3, %rbx)
+	mulx(	24,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	16,(up), %rbx)
+	mulx(	32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(up)
+	adox(	32,(up), w0)
+	mulx(	40,(dp), w2, w3)
+	mov	w0, 32(up)
+	adcx(	w1, w2)
+	mulx(	48,(dp), w0, w1)
+	adox(	40,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(up)
+	mov	%rax, q
+	mulx(	dinv, q, w2)		C next quotient = new low limb * dinv
+	adox(	48,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 48(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 56(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o7)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+ifelse(eval(MAX_SPECIAL>=8),1,`
+L(8):	push	%rbx			C dn = 8, fully unrolled row
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13		C r13 = carry between iterations
+	sub	dn_param, un		C outer loop count
+	mov	(up), %rax		C rax, rbx hold the two lowest limbs
+	mov	8(up), %rbx
+	mov	%rax, q
+	imul	dinv, q			C first quotient limb
+L(o8):	xor	R32(%rcx), R32(%rcx)	C clear rcx, CF, and OF
+	mulx(	(dp), w2, w3)
+	adox(	%rax, w2)
+	mulx(	8,(dp), %rax, w1)
+	adcx(	w3, %rax)
+	adox(	%rbx, %rax)
+	mulx(	16,(dp), %rbx, w3)
+	adcx(	w1, %rbx)
+	mulx(	24,(dp), w0, w1)
+	adox(	16,(up), %rbx)
+	adcx(	w3, w0)
+	mulx(	32,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	24,(up), w0)
+	mov	w0, 24(up)
+	mulx(	40,(dp), w0, w1)
+	adox(	32,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 32(up)
+	adox(	40,(up), w0)
+	mulx(	48,(dp), w2, w3)
+	mov	w0, 40(up)
+	adcx(	w1, w2)
+	mulx(	56,(dp), w0, w1)
+	adox(	48,(up), w2)
+	adcx(	w3, w0)
+	mov	dinv, q
+	mulx(	%rax, q, w3)		C next quotient = dinv * new low limb
+	mov	w2, 48(up)
+	adox(	56,(up), w0)
+	adox(	%rcx, w1)
+	mov	w0, 56(up)
+	adc	%rcx, w1
+	bt	$0, R32(%r13)		C CF = carry saved by the previous iteration
+	adc	w1, 64(up)
+	setc	R8(%r13)
+	lea	8(up), up
+	dec	un
+	jnz	L(o8)
+	jmp	L(esma)			C store rax, rbx and return the carry
+')
+
+L(esma):mov	%rax, (up)		C shared exit for L(3)..L(8): write back the limbs held in rax, rbx
+	mov	%rbx, 8(up)
+	mov	%r13, %rax		C return the final carry
+	pop	%r14			C restore the registers pushed by L(3)..L(8)
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+
+	JUMPTABSECT
+	ALIGN(8)
+L(atab):JMPENT(	L(f0), L(atab))		C slots 0..7: feed-in entries indexed by dn mod 8
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	JMPENT(	L(1), L(atab))		C slots 8..15: size-specific routines, indexed by dn+7 at L(sma)
+	JMPENT(	L(2), L(atab))
+	JMPENT(	L(3), L(atab))
+	JMPENT(	L(4), L(atab))
+	JMPENT(	L(5), L(atab))
+	JMPENT(	L(6), L(atab))
+	JMPENT(	L(7), L(atab))
+	JMPENT(	L(8), L(atab))
+	TEXT
+EPILOGUE()
-- 
cgit v1.2.1