/* AMD64 (x86_64) sub_n -- Subtract two limb vectors of the same length > 0 and store
 *		   difference in a third limb vector.
 *
 *      Copyright (C) 1992, 1994, 1995, 1998, 
 *                    2001, 2002, 2006 Free Software Foundation, Inc.
 *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 * Note: This code is heavily based on the GNU MP Library.
 *	 Actually it's the same code with only minor changes in the
 *	 way the data is stored; this is to support the abstraction
 *	 of an optional secure memory allocation which may be used
 *	 to avoid revealing of sensitive data due to paging etc.
 */


#include "sysdep.h"
#include "asm-syntax.h"


/*******************
 *  mpi_limb_t
 *  _gcry_mpih_sub_n( mpi_ptr_t res_ptr,	rdi
 *		   mpi_ptr_t s1_ptr,		rsi
 *		   mpi_ptr_t s2_ptr,		rdx
 *		   mpi_size_t size)		rcx
 *
 *  Multi-limb subtraction with borrow propagation:
 *
 *      res_ptr[i] = s1_ptr[i] - s2_ptr[i] - borrow      for i = 0 .. size-1
 *
 *  Limbs are processed as 64-bit quadwords, starting at offset 0 and
 *  moving towards higher addresses.  The borrow out of the last limb
 *  (0 or 1) is returned in %eax.
 *
 *  Structure: the first (size mod 4) limbs are handled by one of the
 *  .Lprehandle1/2/3 paths, the remainder by a loop unrolled four times.
 *  The borrow lives in the carry flag for the whole run, so between the
 *  sub/sbb instructions and the final adc only instructions that leave
 *  CF untouched (mov, lea, dec, jumps) are used.  Do not replace them
 *  with add/sub/xor.
 *
 *  size must be > 0: with size == 0 the code enters .Loop with a zero
 *  counter, which then wraps past zero instead of terminating.
 *
 *  Registers written by the code visible here: rax, rcx, rdx, rsi, rdi,
 *  r9 and the flags.
 *
 *  NOTE(review): FUNC_ENTRY()/FUNC_EXIT() are defined outside this file;
 *  presumably they provide any ABI-specific prologue and the return
 *  instruction -- confirm in the included headers.
 *  NOTE(review): the residue is taken from %ecx, but the loop counter
 *  uses all 64 bits of %rcx; this assumes the caller passes size with
 *  well-defined upper bits -- confirm against the width of mpi_size_t.
 */
	TEXT
	ALIGN(4)
	.globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
C_SYMBOL_NAME(_gcry_mpih_sub_n:)
	FUNC_ENTRY()
	/* r9d = size mod 4; dispatch on the number of limbs that must be
	 * handled before the 4-limbs-per-iteration loop can take over. */
	movl	%ecx, %r9d
	andl	$3, %r9d
	/* Residue 0: go straight to the loop (after clearing the carry). */
	je	.Lprehandle0
	/* Residue is 1, 2 or 3 here: below 2 -> 1, equal -> 2, else fall
	 * through to the residue-3 path. */
	cmpl	$2, %r9d
	jb	.Lprehandle1
	je	.Lprehandle2

/* FIRST_SUB: lowest limb, plain subtract (no incoming borrow).
 * Reads (%rsi) and (%rdx), writes (%rdi), uses %rax as scratch and
 * leaves the borrow in CF. */
#define FIRST_SUB() \
	movq	(%rsi), %rax; \
	subq	(%rdx), %rax; \
	movq	%rax, (%rdi)

/* NEXT_SUB(offset): limb at byte offset OFFSET, subtract with borrow.
 * Consumes the borrow in CF and leaves the new borrow in CF; uses %rax
 * as scratch. */
#define NEXT_SUB(offset) \
	movq	offset(%rsi), %rax; \
	sbbq	offset(%rdx), %rax; \
	movq	%rax, offset(%rdi)

/* Residue 3: handle three limbs.  The counter is reduced by 3 in total
 * (lea -2 plus dec); lea does not touch the flags and dec sets ZF
 * without modifying CF, so the borrow survives the termination test. */
.Lprehandle3:
	leaq	-2(%rcx), %rcx
	FIRST_SUB();
	NEXT_SUB(8);
	NEXT_SUB(16);
	decq	%rcx
	/* No limbs left: size was exactly 3. */
	je	.Lend
	/* Step all three pointers past the 3 limbs (24 bytes) just done;
	 * lea is used so that CF is preserved. */
	leaq	24(%rsi), %rsi
	leaq	24(%rdx), %rdx
	leaq	24(%rdi), %rdi
	jmp	.Loop

	ALIGN(3)
/* Residue 2: handle two limbs; counter reduced by 2 (lea -1 plus dec). */
.Lprehandle2:
	leaq	-1(%rcx), %rcx
	FIRST_SUB();
	NEXT_SUB(8);
	decq	%rcx
	/* No limbs left: size was exactly 2. */
	je	.Lend
	/* Step the pointers past the 2 limbs (16 bytes) just done. */
	leaq	16(%rsi), %rsi
	leaq	16(%rdx), %rdx
	leaq	16(%rdi), %rdi
	jmp	.Loop

	ALIGN(3)
/* Residue 1: handle a single limb; counter reduced by 1. */
.Lprehandle1:
	FIRST_SUB();
	decq	%rcx
	/* No limbs left: size was exactly 1. */
	je	.Lend
	/* Step the pointers past the limb (8 bytes) just done. */
	leaq	8(%rsi), %rsi
	leaq	8(%rdx), %rdx
	leaq	8(%rdi), %rdi
	jmp	.Loop

	ALIGN(3)
/* Residue 0: nothing to pre-handle.  The loop starts with sbb, so the
 * incoming borrow must be cleared explicitly; execution then falls
 * through into .Loop. */
.Lprehandle0:
	clc				/* clear cy */

	ALIGN(4)			/* minimal alignment for claimed speed */
/* Main loop: four limbs per iteration.  On entry %rcx is a non-zero
 * multiple of 4 (given size > 0) and CF holds the incoming borrow.
 * The counter drops by 4 per iteration (lea -3 plus dec); the dec is
 * placed last so that its ZF drives the loop branch while CF still
 * carries the borrow into the next iteration. */
.Loop:	leaq	-3(%rcx), %rcx
	NEXT_SUB(0);
	NEXT_SUB(8);
	NEXT_SUB(16);
	NEXT_SUB(24);
	/* Advance all three pointers by 4 limbs (32 bytes) without
	 * touching the flags. */
	leaq	32(%rsi), %rsi
	leaq	32(%rdx), %rdx
	leaq	32(%rdi), %rdi
	decq	%rcx
	jne	.Loop

	ALIGN(2)
/* Return the final borrow: %eax = 0 + 0 + CF.  The register is zeroed
 * with mov rather than xor because xor would clear CF before the adc
 * can read it. */
.Lend:
	movl	$0, %eax		/* zero %rax */
	adcl	%eax, %eax
	FUNC_EXIT()