Diffstat (limited to 'rts/gmp/mpn')
347 files changed, 50022 insertions, 0 deletions
diff --git a/rts/gmp/mpn/Makefile.am b/rts/gmp/mpn/Makefile.am new file mode 100644 index 0000000000..1c49ccda25 --- /dev/null +++ b/rts/gmp/mpn/Makefile.am @@ -0,0 +1,94 @@ +## Process this file with automake to generate Makefile.in + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +AUTOMAKE_OPTIONS = gnu no-dependencies +SUBDIRS = tests + +CPP = @CPP@ + +# -DOPERATION_$* tells multi-function files which function to produce. +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +GENERIC_SOURCES = mp_bases.c +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +libmpn_la_SOURCES = $(GENERIC_SOURCES) +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \ + mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \ + sparc64 thumb vax x86 z8000 z8000x + +EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST) + +# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP? +COMPILE_FLAGS = \ + $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + +SUFFIXES = .s .S .asm + +# *.s are not preprocessed at all. +.s.o: + $(CCAS) $(COMPILE_FLAGS) $< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<` +.s.lo: + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $< + +# *.S are preprocessed with CPP. +.S.o: + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.S.obj: + $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s + +# We have to rebuild the static object file without passing -DPIC to +# preprocessor. The overhead cost is one extra assemblation. FIXME: +# Teach libtool how to assemble with a preprocessor pass (CPP or m4). + +.S.lo: + $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# *.m4 are preprocessed with m4. 
+.asm.o: + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.lo: + $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s diff --git a/rts/gmp/mpn/Makefile.in b/rts/gmp/mpn/Makefile.in new file mode 100644 index 0000000000..59ee958c92 --- /dev/null +++ b/rts/gmp/mpn/Makefile.in @@ -0,0 +1,472 @@ +# Makefile.in generated automatically by automake 1.4a from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. + +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_FLAG = +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : + +@SET_MAKE@ +build_alias = @build_alias@ +build_triplet = @build@ +host_alias = @host_alias@ +host_triplet = @host@ +target_alias = @target_alias@ +target_triplet = @target@ +AMDEP = @AMDEP@ +AMTAR = @AMTAR@ +AR = @AR@ +AS = @AS@ +AWK = @AWK@ +CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@ +CC = @CC@ +CCAS = @CCAS@ +CPP = @CPP@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +EXEEXT = @EXEEXT@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +M4 = @M4@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +RANLIB = @RANLIB@ +SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@ +STRIP = @STRIP@ +U = @U@ +VERSION = @VERSION@ +gmp_srclinks = @gmp_srclinks@ +install_sh = @install_sh@ +mpn_objects = @mpn_objects@ +mpn_objs_in_libgmp = @mpn_objs_in_libgmp@ + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +AUTOMAKE_OPTIONS = gnu no-dependencies +SUBDIRS = + +CPP = @CPP@ + +# -DOPERATION_$* tells multi-function files which function to produce. +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +GENERIC_SOURCES = mp_bases.c +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +libmpn_la_SOURCES = $(GENERIC_SOURCES) +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \ + mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \ + sparc64 thumb vax x86 z8000 z8000x + + +EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST) + +# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP? +COMPILE_FLAGS = \ + $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + + +SUFFIXES = .s .S .asm +subdir = mpn +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = ../config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) + + +DEFS = @DEFS@ -I. -I$(srcdir) -I.. +CPPFLAGS = @CPPFLAGS@ +LDFLAGS = @LDFLAGS@ +LIBS = @LIBS@ +libmpn_la_LDFLAGS = +am_libmpn_la_OBJECTS = mp_bases.lo +libmpn_la_OBJECTS = $(am_libmpn_la_OBJECTS) +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CFLAGS = @CFLAGS@ +CCLD = $(CC) +LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(libmpn_la_SOURCES) +DIST_COMMON = README Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +GZIP_ENV = --best +depcomp = +SOURCES = $(libmpn_la_SOURCES) +OBJECTS = $(am_libmpn_la_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .asm .c .lo .o .obj .s +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu mpn/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +mostlyclean-noinstLTLIBRARIES: + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + +distclean-noinstLTLIBRARIES: + +maintainer-clean-noinstLTLIBRARIES: + +mostlyclean-compile: + -rm -f *.o core *.core + -rm -f *.$(OBJEXT) + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + +maintainer-clean-libtool: + +libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) + $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS) +.c.o: + $(COMPILE) -c $< +.c.obj: + $(COMPILE) -c `cygpath -w $<` +.c.lo: + $(LTCOMPILE) -c -o $@ $< + +# This directory's subdirectories are mostly independent; you can cd +# into them and run `make' without going through this Makefile. 
+# To change the values of `make' variables: instead of editing Makefiles, +# (1) if the variable is set in `config.status', edit `config.status' +# (which will cause the Makefiles to be regenerated when you run `make'); +# (2) otherwise, pass the desired values on the `make' command line. + +all-recursive install-data-recursive install-exec-recursive \ +installdirs-recursive install-recursive uninstall-recursive \ +check-recursive installcheck-recursive info-recursive dvi-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +mostlyclean-recursive clean-recursive distclean-recursive \ +maintainer-clean-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \ + rev="$$subdir $$rev"; \ + if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \ + done; \ + test "$$dot_seen" = "no" && rev=". $$rev"; \ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $$rev; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done && test -z "$$fail" +tags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . 
|| (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ + done + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -f$$here/ID $$unique $(LISP) + +TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \ + fi; \ + done; \ + list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || etags $(ETAGS_ARGS) $$tags $$unique $(LISP) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pR $$d/$$file $(distdir); \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile $(LTLIBRARIES) +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install +installdirs: installdirs-recursive +installdirs-am: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: + -rm -f Makefile.in +mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \ + mostlyclean-libtool mostlyclean-tags \ + mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \ + clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-noinstLTLIBRARIES distclean-compile \ + distclean-libtool distclean-tags distclean-generic \ + clean-am + -rm -f libtool + +distclean: distclean-recursive + +maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \ + maintainer-clean-compile maintainer-clean-libtool \ + maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-recursive + +.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \ +clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile mostlyclean-libtool distclean-libtool \ +clean-libtool maintainer-clean-libtool install-recursive \ +uninstall-recursive install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all install-strip \ +installdirs-am installdirs mostlyclean-generic distclean-generic \ +clean-generic maintainer-clean-generic clean mostlyclean distclean \ +maintainer-clean + + +# *.s are not preprocessed at all. +.s.o: + $(CCAS) $(COMPILE_FLAGS) $< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<` +.s.lo: + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $< + +# *.S are preprocessed with CPP. +.S.o: + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.S.obj: + $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s + +# We have to rebuild the static object file without passing -DPIC to +# preprocessor. The overhead cost is one extra assemblation. FIXME: +# Teach libtool how to assemble with a preprocessor pass (CPP or m4). + +.S.lo: + $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# *.m4 are preprocessed with m4. +.asm.o: + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.lo: + $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/rts/gmp/mpn/README b/rts/gmp/mpn/README new file mode 100644 index 0000000000..7453c9d03e --- /dev/null +++ b/rts/gmp/mpn/README @@ -0,0 +1,13 @@ +This directory contains all code for the mpn layer of GMP. + +Most subdirectories contain machine-dependent code, written in assembly or C. +The `generic' subdirectory contains default code, used when there is no +machine-dependent replacement for a particular machine. + +There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC +and 64-bit SPARC are very different ISA's, and thus cannot share any code. + +A particular compile will only use code from one subdirectory, and the +`generic' subdirectory. 
The ISA-specific subdirectories contain hierachies of +directories for various architecture variants and implementations; the +top-most level contains code that runs correctly on all variants. diff --git a/rts/gmp/mpn/a29k/add_n.s b/rts/gmp/mpn/a29k/add_n.s new file mode 100644 index 0000000000..e3ee6dfa60 --- /dev/null +++ b/rts/gmp/mpn/a29k/add_n.s @@ -0,0 +1,120 @@ +; 29000 __gmpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + +; The 29k has no addition or subtraction instructions that doesn't +; affect carry, so we need to save and restore that as soon as we +; adjust the pointers. gr116 is used for this purpose. Note that +; gr116==0 means that carry should be set. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_add_n + .word 0x60000 +___gmpn_add_n: + srl gr117,lr5,3 + sub gr118,gr117,1 + jmpt gr118,Ltail + constn gr116,-1 ; init cy reg + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr104,lr4 + add lr4,lr4,32 + + subr gr116,gr116,0 ; restore carry + addc gr96,gr96,gr104 + addc gr97,gr97,gr105 + addc gr98,gr98,gr106 + addc gr99,gr99,gr107 + addc gr100,gr100,gr108 + addc gr101,gr101,gr109 + addc gr102,gr102,gr110 + addc gr103,gr103,gr111 + subc gr116,gr116,gr116 ; gr116 = not(cy) + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +; Code for the last up-to-7 limbs. +; This code might look very strange, but it's hard to write it +; differently without major slowdown. 
+ + and lr5,lr5,(8-1) +Ltail: sub gr118,lr5,1 ; count for CR + jmpt gr118,Lend + sub gr117,lr5,2 ; count for jmpfdec + + mtsr cr,gr118 + loadm 0,0,gr96,lr3 + mtsr cr,gr118 + loadm 0,0,gr104,lr4 + + subr gr116,gr116,0 ; restore carry + + jmpfdec gr117,L1 + addc gr96,gr96,gr104 + jmp Lstore + mtsr cr,gr118 +L1: jmpfdec gr117,L2 + addc gr97,gr97,gr105 + jmp Lstore + mtsr cr,gr118 +L2: jmpfdec gr117,L3 + addc gr98,gr98,gr106 + jmp Lstore + mtsr cr,gr118 +L3: jmpfdec gr117,L4 + addc gr99,gr99,gr107 + jmp Lstore + mtsr cr,gr118 +L4: jmpfdec gr117,L5 + addc gr100,gr100,gr108 + jmp Lstore + mtsr cr,gr118 +L5: jmpfdec gr117,L6 + addc gr101,gr101,gr109 + jmp Lstore + mtsr cr,gr118 +L6: addc gr102,gr102,gr110 + +Lstore: storem 0,0,gr96,lr2 + subc gr116,gr116,gr116 ; gr116 = not(cy) + +Lend: jmpi lr0 + add gr96,gr116,1 diff --git a/rts/gmp/mpn/a29k/addmul_1.s b/rts/gmp/mpn/a29k/addmul_1.s new file mode 100644 index 0000000000..f51b6d7af6 --- /dev/null +++ b/rts/gmp/mpn/a29k/addmul_1.s @@ -0,0 +1,113 @@ +; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and +; add the product to a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_addmul_1 + .word 0x60000 +___gmpn_addmul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + loadm 0,0,gr96,lr2 + + add gr104,gr96,gr104 + addc gr105,gr97,gr105 + addc gr106,gr98,gr106 + addc gr107,gr99,gr107 + addc gr108,gr100,gr108 + addc gr109,gr101,gr109 + addc gr110,gr102,gr110 + addc gr111,gr103,gr111 + addc gr120,gr120,0 + + mtsrim cr,(8-1) + storem 0,0,gr104,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + load 0,0,gr119,lr2 + add gr117,gr117,gr120 + addc gr118,gr118,0 + add gr117,gr117,gr119 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/lshift.s b/rts/gmp/mpn/a29k/lshift.s new file mode 100644 index 0000000000..93e1917127 --- /dev/null +++ b/rts/gmp/mpn/a29k/lshift.s @@ -0,0 +1,93 @@ +; 29000 __gmpn_lshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. 
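For reference, a rough plain-C rendering of what an mpn_lshift routine computes, assuming 32-bit limbs and a shift count strictly between 0 and 32 (the function and argument names here are illustrative, not taken from the file above):

    #include <stdint.h>

    /* Shift {sp, n} left by cnt bits into {rp, n}, working from the most
       significant limb downwards as the assembly does, and return the bits
       shifted out of the top limb. */
    uint32_t
    lshift_ref (uint32_t *rp, const uint32_t *sp, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;            /* complementary shift count */
      uint32_t high = sp[n - 1];
      uint32_t retval = high >> tnc;      /* bits pushed out at the top */
      long i;

      for (i = n - 1; i > 0; i--)
        {
          uint32_t low = sp[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;
      return retval;
    }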
+ + .sect .lit,lit + .text + .align 4 + .global ___gmpn_lshift + .word 0x60000 +___gmpn_lshift: + sll gr116,lr4,2 + add lr3,gr116,lr3 + add lr2,gr116,lr2 + sub lr3,lr3,4 + load 0,0,gr119,lr3 + + subr gr116,lr5,32 + srl gr96,gr119,gr116 ; return value + sub lr4,lr4,1 ; actual loop count is SIZE - 1 + + srl gr117,lr4,3 ; chuck count = (actual count) / 8 + cpeq gr118,gr117,0 + jmpt gr118,Ltail + mtsr fc,lr5 + + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: sub lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr100,lr3 + + extract gr109,gr119,gr107 + extract gr108,gr107,gr106 + extract gr107,gr106,gr105 + extract gr106,gr105,gr104 + extract gr105,gr104,gr103 + extract gr104,gr103,gr102 + extract gr103,gr102,gr101 + extract gr102,gr101,gr100 + + sub lr2,lr2,32 + mtsrim cr,(8-1) + storem 0,0,gr102,lr2 + jmpfdec gr117,Loop + or gr119,gr100,0 + +; Code for the last up-to-7 limbs. + + and lr4,lr4,(8-1) +Ltail: cpeq gr118,lr4,0 + jmpt gr118,Lend + sub lr4,lr4,2 ; count for jmpfdec + +Loop2: sub lr3,lr3,4 + load 0,0,gr116,lr3 + extract gr117,gr119,gr116 + sub lr2,lr2,4 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + or gr119,gr116,0 + +Lend: extract gr117,gr119,0 + sub lr2,lr2,4 + jmpi lr0 + store 0,0,gr117,lr2 diff --git a/rts/gmp/mpn/a29k/mul_1.s b/rts/gmp/mpn/a29k/mul_1.s new file mode 100644 index 0000000000..6bcf7ce0cf --- /dev/null +++ b/rts/gmp/mpn/a29k/mul_1.s @@ -0,0 +1,97 @@ +; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_mul_1 + .word 0x60000 +___gmpn_mul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + storem 0,0,gr104,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + add gr117,gr117,gr120 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/rshift.s b/rts/gmp/mpn/a29k/rshift.s new file mode 100644 index 0000000000..ea163bff2b --- /dev/null +++ b/rts/gmp/mpn/a29k/rshift.s @@ -0,0 +1,89 @@ +; 29000 __gmpn_rshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_rshift + .word 0x60000 +___gmpn_rshift: + load 0,0,gr119,lr3 + add lr3,lr3,4 + + subr gr116,lr5,32 + sll gr96,gr119,gr116 ; return value + sub lr4,lr4,1 ; actual loop count is SIZE - 1 + + srl gr117,lr4,3 ; chuck count = (actual count) / 8 + cpeq gr118,gr117,0 + jmpt gr118,Ltail + mtsr fc,gr116 + + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. 
+Loop: mtsrim cr,(8-1) + loadm 0,0,gr100,lr3 + add lr3,lr3,32 + + extract gr98,gr100,gr119 + extract gr99,gr101,gr100 + extract gr100,gr102,gr101 + extract gr101,gr103,gr102 + extract gr102,gr104,gr103 + extract gr103,gr105,gr104 + extract gr104,gr106,gr105 + extract gr105,gr107,gr106 + + mtsrim cr,(8-1) + storem 0,0,gr98,lr2 + add lr2,lr2,32 + jmpfdec gr117,Loop + or gr119,gr107,0 + +; Code for the last up-to-7 limbs. + + and lr4,lr4,(8-1) +Ltail: cpeq gr118,lr4,0 + jmpt gr118,Lend + sub lr4,lr4,2 ; count for jmpfdec + +Loop2: load 0,0,gr100,lr3 + add lr3,lr3,4 + extract gr117,gr100,gr119 + store 0,0,gr117,lr2 + add lr2,lr2,4 + jmpfdec lr4,Loop2 + or gr119,gr100,0 + +Lend: srl gr117,gr119,lr5 + jmpi lr0 + store 0,0,gr117,lr2 diff --git a/rts/gmp/mpn/a29k/sub_n.s b/rts/gmp/mpn/a29k/sub_n.s new file mode 100644 index 0000000000..c6b64c5bee --- /dev/null +++ b/rts/gmp/mpn/a29k/sub_n.s @@ -0,0 +1,120 @@ +; 29000 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + +; The 29k has no addition or subtraction instructions that doesn't +; affect carry, so we need to save and restore that as soon as we +; adjust the pointers. gr116 is used for this purpose. Note that +; gr116==0 means that carry should be set. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_sub_n + .word 0x60000 +___gmpn_sub_n: + srl gr117,lr5,3 + sub gr118,gr117,1 + jmpt gr118,Ltail + constn gr116,-1 ; init cy reg + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr104,lr4 + add lr4,lr4,32 + + subr gr116,gr116,0 ; restore carry + subc gr96,gr96,gr104 + subc gr97,gr97,gr105 + subc gr98,gr98,gr106 + subc gr99,gr99,gr107 + subc gr100,gr100,gr108 + subc gr101,gr101,gr109 + subc gr102,gr102,gr110 + subc gr103,gr103,gr111 + subc gr116,gr116,gr116 ; gr116 = not(cy) + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +; Code for the last up-to-7 limbs. +; This code might look very strange, but it's hard to write it +; differently without major slowdown. 
+ + and lr5,lr5,(8-1) +Ltail: sub gr118,lr5,1 ; count for CR + jmpt gr118,Lend + sub gr117,lr5,2 ; count for jmpfdec + + mtsr cr,gr118 + loadm 0,0,gr96,lr3 + mtsr cr,gr118 + loadm 0,0,gr104,lr4 + + subr gr116,gr116,0 ; restore carry + + jmpfdec gr117,L1 + subc gr96,gr96,gr104 + jmp Lstore + mtsr cr,gr118 +L1: jmpfdec gr117,L2 + subc gr97,gr97,gr105 + jmp Lstore + mtsr cr,gr118 +L2: jmpfdec gr117,L3 + subc gr98,gr98,gr106 + jmp Lstore + mtsr cr,gr118 +L3: jmpfdec gr117,L4 + subc gr99,gr99,gr107 + jmp Lstore + mtsr cr,gr118 +L4: jmpfdec gr117,L5 + subc gr100,gr100,gr108 + jmp Lstore + mtsr cr,gr118 +L5: jmpfdec gr117,L6 + subc gr101,gr101,gr109 + jmp Lstore + mtsr cr,gr118 +L6: subc gr102,gr102,gr110 + +Lstore: storem 0,0,gr96,lr2 + subc gr116,gr116,gr116 ; gr116 = not(cy) + +Lend: jmpi lr0 + add gr96,gr116,1 diff --git a/rts/gmp/mpn/a29k/submul_1.s b/rts/gmp/mpn/a29k/submul_1.s new file mode 100644 index 0000000000..ef97d8d4e5 --- /dev/null +++ b/rts/gmp/mpn/a29k/submul_1.s @@ -0,0 +1,116 @@ +; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and +; subtract the product from a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
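The borrow handling in the loop below is easier to follow against a plain-C sketch of the per-limb operation, assuming 32-bit limbs and a 64-bit intermediate product where the assembly uses the multiplu/multmu pair (names here are illustrative):

    #include <stdint.h>

    /* Multiply {up, n} by v, subtract the product from {rp, n}, and return
       the final borrow-and-carry limb. */
    uint32_t
    submul_1_ref (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
    {
      uint32_t cy = 0;                    /* carry limb between iterations */
      long i;

      for (i = 0; i < n; i++)
        {
          uint64_t prod = (uint64_t) up[i] * v;
          uint32_t lo = (uint32_t) prod;
          uint32_t hi = (uint32_t) (prod >> 32);
          uint32_t x, d;

          lo += cy;
          hi += lo < cy;                  /* carry from folding in cy */
          x = rp[i];
          d = x - lo;
          rp[i] = d;
          cy = hi + (d > x);              /* borrow from the subtraction */
        }
      return cy;
    }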
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_submul_1 + .word 0x60000 +___gmpn_submul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + loadm 0,0,gr96,lr2 + + sub gr96,gr96,gr104 + subc gr97,gr97,gr105 + subc gr98,gr98,gr106 + subc gr99,gr99,gr107 + subc gr100,gr100,gr108 + subc gr101,gr101,gr109 + subc gr102,gr102,gr110 + subc gr103,gr103,gr111 + + add gr104,gr103,gr111 ; invert carry from previus sub + addc gr120,gr120,0 + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + load 0,0,gr119,lr2 + add gr117,gr117,gr120 + addc gr118,gr118,0 + sub gr119,gr119,gr117 + add gr104,gr119,gr117 ; invert carry from previus sub + store 0,0,gr119,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/udiv.s b/rts/gmp/mpn/a29k/udiv.s new file mode 100644 index 0000000000..fdd53a9a88 --- /dev/null +++ b/rts/gmp/mpn/a29k/udiv.s @@ -0,0 +1,30 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___udiv_qrnnd + .word 0x60000 +___udiv_qrnnd: + mtsr q,lr3 + dividu gr96,lr4,lr5 + mfsr gr116,q + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/rts/gmp/mpn/a29k/umul.s b/rts/gmp/mpn/a29k/umul.s new file mode 100644 index 0000000000..7741981167 --- /dev/null +++ b/rts/gmp/mpn/a29k/umul.s @@ -0,0 +1,29 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___umul_ppmm + .word 0x50000 +___umul_ppmm: + multiplu gr116,lr3,lr4 + multmu gr96,lr3,lr4 + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README new file mode 100644 index 0000000000..744260c7c5 --- /dev/null +++ b/rts/gmp/mpn/alpha/README @@ -0,0 +1,224 @@ +This directory contains mpn functions optimized for DEC Alpha processors. + +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of instruction that needs +special handling by unwinding. It also says whether $27 is really +needed for computing the gp. The `.mask M' pseudo op says which +registers are saved on the stack, and at what offset in the frame. + +Cray code is very very different... + + +RELEVANT OPTIMIZATION ISSUES + +EV4 + +1. This chip has very limited store bandwidth. The on-chip L1 cache is + write-through, and a cache line is transfered from the store buffer to + the off-chip L2 in as much 15 cycles on most systems. This delay hurts + mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. + +2. Pairing is possible between memory instructions and integer arithmetic + instructions. + +3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of + these cycles are pipelined. Thus, multiply instructions can be issued at + a rate of one each 21st cycle. + +EV5 + +1. The memory bandwidth of this chip seems excellent, both for loads and + stores. Even when the working set is larger than the on-chip L1 and L2 + caches, the performance remain almost unaffected. + +2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle. + umulh has a measured latency of 14 cycles and an issue rate of 1 each + 10th cycle. But the exact timing is somewhat confusing. + +3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 + are memory operations. This will take at least + ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles + We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data + cache cycles, which should be completely hidden in the 19 issue cycles. + The computation is inherently serial, with these dependencies: + + ldq ldq + \ /\ + (or) addq | + |\ / \ | + | addq cmpult + \ | | + cmpult | + \ / + or + + I.e., 3 operations are needed between carry-in and carry-out, making 12 + cycles the absolute minimum for the 4 limbs. We could replace the `or' + with a cmoveq/cmovne, which could issue one cycle earlier that the `or', + but that might waste a cycle on EV4. The total depth remain unaffected, + since cmov has a latency of 2 cycles. 
+ + addq + / \ + addq cmpult + | \ + cmpult -> cmovne + +Montgomery has a slightly different way of computing carry that requires one +less instruction, but has depth 4 (instead of the current 3). Since the +code is currently instruction issue bound, Montgomery's idea should save us +1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25 +cycles/limb. Unfortunately, this method will not be good for the EV6. + +EV6 + +Here we have a really parallel pipeline, capable of issuing up to 4 integer +instructions per cycle. One integer multiply instruction can issue each +cycle. To get optimal speed, we need to pretend we are vectorizing the code, +i.e., minimize the iterative dependencies. + +There are two dependencies to watch out for. 1) Address arithmetic +dependencies, and 2) carry propagation dependencies. + +We can avoid serializing due to address arithmetic by unrolling the loop, so +that addresses don't depend heavily on an index variable. Avoiding +serializing because of carry propagation is trickier; the ultimate performance +of the code will be determined of the number of latency cycles it takes from +accepting carry-in to a vector point until we can generate carry-out. + +Most integer instructions can execute in either the L0, U0, L1, or U1 +pipelines. Shifts only execute in U0 and U1, and multiply only in U1. + +CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but +the execute efficiently. But CMOV split the mapping process (see pg 2-26 in +cmpwrgd.pdf), suggesting the CMOV should always be placed as the last +instruction of an aligned 4 instruction block (?). + +Perhaps the most important issue is the latency between the L0/U0 and L1/U1 +clusters; a result obtained on either cluster has an extra cycle of latency +for consumers in the opposite cluster. Because of the dynamic nature of the +implementation, it is hard to predict where an instruction will execute. + +The shift loops need (per limb): + 1 load (Lx pipes) + 1 store (Lx pipes) + 2 shift (Ux pipes) + 1 iaddlog (Lx pipes, Ux pipes) +Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb. + +For mpn_add_n, we currently have + 2 load (Lx pipes) + 1 store (Lx pipes) + 5 iaddlog (Lx pipes, Ux pipes) + +Again, we have a perfect balance and will be limited by carry propagation +delays, currently three cycles. The superoptimizer indicates that ther +might be sequences that--using a final cmov--have a carry propagation delay +of just two. Montgomery's subtraction sequence could perhaps be used, by +complementing some operands. All in all, we should get down to 2 cycles +without much problems. + +For mpn_mul_1, we could do, just like for mpn_add_n: + not newlo,notnewlo + addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout + addq cyout,newhi,cylimb +and get 2-cycle carry propagation. The instructions needed will be + 1 ld (Lx pipes) + 1 st (Lx pipes) + 2 mul (U1 pipe) + 4 iaddlog (Lx pipes, Ux pipes) +issue1: addq not mul ld +issue2: cmpult addq mul st +Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb! + +Last, we have mpn_addmul_1. Almost certainly, we will get down to 3 +cycles/limb, which would be absolutely awesome. 
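For concreteness, the per-limb recurrences whose carry chains are counted above look roughly like this in plain C (64-bit limbs; the hi/lo product pair stands in for mulq/umulh, and the 128-bit host type is only a convenience of the sketch):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* mpn_add_n step: the addq/cmpult/addq/cmpult/bis sequence; cy is the
       only value carried from one limb to the next. */
    static inline mp_limb_t
    add_n_step (mp_limb_t *rp, mp_limb_t a, mp_limb_t b, mp_limb_t cy)
    {
      mp_limb_t s1 = b + cy;
      mp_limb_t c1 = s1 < cy;
      mp_limb_t s2 = a + s1;
      mp_limb_t c2 = s2 < a;
      *rp = s2;
      return c1 | c2;
    }

    /* mpn_mul_1 step: multiply, fold in the carry limb, emit a new one. */
    static inline mp_limb_t
    mul_1_step (mp_limb_t *rp, mp_limb_t u, mp_limb_t v, mp_limb_t cylimb)
    {
      unsigned __int128 p = (unsigned __int128) u * v;
      mp_limb_t lo = (mp_limb_t) p + cylimb;
      mp_limb_t cy = lo < cylimb;
      *rp = lo;
      return (mp_limb_t) (p >> 64) + cy;
    }

    /* mpn_addmul_1 step: as above, plus one more add/compare pair for the
       limb already in the result vector. */
    static inline mp_limb_t
    addmul_1_step (mp_limb_t *rp, mp_limb_t u, mp_limb_t v, mp_limb_t cylimb)
    {
      unsigned __int128 p = (unsigned __int128) u * v;
      mp_limb_t lo = (mp_limb_t) p + cylimb;
      mp_limb_t cy = lo < cylimb;
      mp_limb_t r = *rp + lo;
      cy += r < lo;
      *rp = r;
      return (mp_limb_t) (p >> 64) + cy;
    }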
+ +Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen): + + i + s + s i + u n + e s + d t + r + i u +l n c +i s t +v t i +e r o + u n +v c +a t t +l i y +u o p +e n e +s s s + issue + in + cycle + -1 ldq + / \ + 0 | \ + | \ + 1 | | + | | + 2 | | ldq + | | / \ + 3 | mulq | \ + | \ | \ + 4 umulh \ | | + | | | | + 5 | | | | ldq + | | | | / \ + 4calm 6 | | ldq | mulq | \ + | | / | \ | \ + 4casm 7 | | / umulh \ | | +6 | || | | | | + 3aal 8 | || | | | | ldq +7 | || | | | | / \ + 4calm 9 | || | | ldq | mulq | \ +9 | || | | / | \ | \ + 4casm 10 | || | | / umulh \ | | +9 | || | || | | | | + 3aal 11 | addq | || | | | | ldq +9 | // \ | || | | | | / \ + 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 13 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 14 addq | addq | || | | | | ldq +10 \ | // \ | || | | | | / \ + 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 16 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 17 addq | addq | || | | | | +10 \ | // \ | || | | | | + 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq +13 \ / // \ | || | | / | \ + 4casm 19 addq cmpult stq | || | | / umulh \ +11 \ / | || | || | | + 3aal 20 addq | addq | || | | +10 \ | // \ | || | | + 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq + \ / // \ | || | | / + 22 addq cmpult stq | || | | / + \ / | || | || + 23 addq | addq | || + \ | // \ | || + 24 cy ----> \ cmpult addq<-cy | || + \ / // \ | || + 25 addq cmpult stq | || + \ / | || + 26 addq | addq + \ | // \ + 27 cy ----> \ cmpult addq<-cy + \ / // \ + 28 addq cmpult stq + \ / +As many as 6 consecutive points will be under execution simultaneously, or if we addq +schedule loads even further away, maybe 7 or 8. But the number of live quantities \ +is reasonable, and can easily be satisfied. cy ----> diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm new file mode 100644 index 0000000000..08d6a9f7b8 --- /dev/null +++ b/rts/gmp/mpn/alpha/add_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm new file mode 100644 index 0000000000..4ea900be6b --- /dev/null +++ b/rts/gmp/mpn/alpha/addmul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm new file mode 100644 index 0000000000..febb3b70d9 --- /dev/null +++ b/rts/gmp/mpn/alpha/cntlz.asm @@ -0,0 +1,68 @@ +dnl Alpha auxiliary for longlong.h's count_leading_zeros + +dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl DISCUSSION: + +dnl Other methods have been tried, and using a 128-entry table actually trims +dnl about 10% of the execution time (on a 21164) when the table is in the L1 +dnl cache. But under non-benchmarking conditions, the table will hardly be in +dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables +dnl are also possible, but they require many more instructions than the current +dnl code. (But for count_trailing_zeros, such tricks are beneficial.) +dnl Finally, converting to floating-point and extracting the exponent is much +dnl slower. 
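A plain-C sketch of the successive-halving scheme the code below implements (64-bit operand; each if corresponds to a cmovne pair, and like the assembly it yields 63 for a zero argument, though callers are expected to pass a nonzero value):

    #include <stdint.h>

    int
    count_leading_zeros_ref (uint64_t x)
    {
      int cnt = 63;                  /* partial result, refined step by step */
      uint64_t t;

      t = x >> 32;  if (t != 0) { x = t; cnt = 31; }
      t = x >> 16;  if (t != 0) { x = t; cnt -= 16; }
      t = x >> 8;   if (t != 0) { x = t; cnt -= 8; }
      t = x >> 4;   if (t != 0) { x = t; cnt -= 4; }
      t = x >> 2;   if (t != 0) { x = t; cnt -= 2; }
      return cnt - (int) (x >> 1);   /* bit 1 of the two-bit survivor */
    }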
+ +ASM_START() +PROLOGUE(MPN(count_leading_zeros)) + bis r31,63,r0 C initialize partial result count + + srl r16,32,r1 C shift down 32 steps -> r1 + cmovne r1,r1,r16 C select r1 if non-zero + cmovne r1,31,r0 C if r1 is nonzero choose smaller count + + srl r16,16,r1 C shift down 16 steps -> r1 + subq r0,16,r2 C generate new partial result count + cmovne r1,r1,r16 C choose new r1 if non-zero + cmovne r1,r2,r0 C choose new count if r1 was non-zero + + srl r16,8,r1 + subq r0,8,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,4,r1 + subq r0,4,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,2,r1 + subq r0,2,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,1,r1 C extract bit 1 + subq r0,r1,r0 C subtract it from partial result + + ret r31,(r26),1 +EPILOGUE(MPN(count_leading_zeros)) +ASM_END() diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4 new file mode 100644 index 0000000000..5f4c48dc73 --- /dev/null +++ b/rts/gmp/mpn/alpha/default.m4 @@ -0,0 +1,77 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + ` + .set noreorder + .set noat') + +define(`X',`0x$1') +define(`FLOAT64', + ` + .align 3 +$1: .t_floating $2') + +define(`PROLOGUE', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + .frame r30,0,r26 + .prologue 0') + +define(`PROLOGUE_GP', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + ldgp r29,0(r27) + .frame r30,0,r26 + .prologue 1') + +define(`EPILOGUE', + ` + .end $1') + +dnl Map register names r0, r1, etc, to `$0', `$1', etc. +dnl This is needed on all systems but Unicos +forloop(i,0,31, +`define(`r'i,``$''i)' +) +forloop(i,0,31, +`define(`f'i,``$f''i)' +) + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + +divert diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm new file mode 100644 index 0000000000..716d6404ae --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/add_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + addq r0,r4,r20 C 1st main add + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r20,r0,r25 C compute cy from last add + ldq r7,-8(r17) + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy 
from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm new file mode 100644 index 0000000000..cb181dda66 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/lshift.asm @@ -0,0 +1,169 @@ +dnl Alpha EV5 __gmpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. 
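The parameter list above gives mpn_lshift's interface: shift the size-limb operand at s1_ptr left by cnt bits, store the result at res_ptr, and return the bits shifted out of the most significant limb. A minimal C sketch of that operation, working from the high end as the pipelined code below does (64-bit limbs, 0 < cnt < 64, and the names lshift_ref/limb_t are assumptions for illustration):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;

    static limb_t lshift_ref (limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      limb_t high = up[n - 1];
      limb_t ret = high >> tnc;          /* bits shifted out at the top */
      for (size_t i = n - 1; i > 0; i--)
        {
          limb_t low = up[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;
      return ret;
    }

Working from the high end means each source limb is read before the corresponding destination limb is written, so an in-place shift (res_ptr equal to s1_ptr) is safe; that is why the assembly advances both pointers from the end of the vectors.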
+ +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,-16(r17) + subq r16,8,r16 + sll r4,r19,r5 + subq r17,8,r17 + subq r28,1,r28 + srl r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r28,$Loop0 + +$L0: sll r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) + beq r18,$Lend1 +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) + ret r31,(r26),1 + +$Lend: stq r24,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm new file mode 100644 index 0000000000..9940d83fad --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/rshift.asm @@ -0,0 +1,167 @@ +dnl Alpha EV5 __gmpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + subq r31,r19,r20 + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + sll r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,8(r17) + addq r16,8,r16 + srl r4,r19,r5 + addq r17,8,r17 + subq r28,1,r28 + sll r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r28,$Loop0 + +$L0: srl r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,8(r17) + subq r18,4,r18 + ldq r2,16(r17) + ldq r3,24(r17) + ldq r4,32(r17) + beq r18,$Lend1 +C warm up phase 2 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + ldq r1,40(r17) + srl r2,r19,r22 + ldq r2,48(r17) + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + ldq r3,56(r17) + srl r4,r19,r24 + ldq r4,64(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + + sll r1,r20,r7 + subq r18,4,r18 + srl r1,r19,r21 + unop C ldq r31,-96(r17) + + sll r2,r20,r8 + ldq r1,72(r17) + srl r2,r19,r22 + ldq r2,80(r17) + + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + + sll r3,r20,r5 + unop C ldq r31,-96(r17) + srl r3,r19,r23 + addq r16,32,r16 + + sll r4,r20,r6 + ldq r3,88(r17) + srl r4,r19,r24 + ldq r4,96(r17) + + addq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + sll r3,r20,r5 + srl r3,r19,r23 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 2/2 + stq r7,32(r16) + bis r5,r22,r5 + stq r8,40(r16) + bis r6,r23,r6 + stq r5,48(r16) + stq r6,56(r16) +C cool down phase 2/3 + stq r24,64(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 1/2 + stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + stq r5,16(r16) + stq r6,24(r16) + stq r24,32(r16) + ret r31,(r26),1 + +$Lend: stq r24,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm new file mode 100644 index 0000000000..5248a2aa38 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + subq r4,r0,r20 C 1st main subtract + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r4,r0,r25 C compute cy from last subtract + ldq r7,-8(r17) + subq r5,r1,r28 C 2nd main subtract + addq r18,32,r18 C update s2_ptr + subq r28,r25,r21 C 2nd carry subtract + cmpult r5,r1,r8 C compute cy from last subtract + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r28,r25,r25 C compute cy from last subtract + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two subtracts + ldq r1,8(r18) + subq r6,r2,r28 C 3rd main subtract + ldq r4,0(r17) + subq r28,r25,r22 C 3rd carry subtract + ldq r5,8(r17) + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + subq r5,r1,r28 C 2nd main subtract + stq r23,-8(r16) + subq r28,r25,r21 C 2nd carry subtract + addq r18,32,r18 C update s2_ptr + cmpult r5,r1,r8 C compute cy from last subtract + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + subq r6,r2,r28 C cy add + subq r28,r25,r22 C 3rd main subtract + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C cy add + subq r28,r25,r23 C 4th main subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: subq r4,r0,r28 C main subtract + cmpult r4,r0,r8 C compute cy from last subtract + ldq r0,8(r18) + ldq r4,8(r17) + subq r28,r25,r20 C carry subtract + addq r18,8,r18 + addq r17,8,r17 + stq 
r20,0(r16) + cmpult r28,r25,r25 C compute cy from last subtract + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: subq r4,r0,r28 C main subtract + subq r28,r25,r20 C carry subtract + cmpult r4,r0,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm new file mode 100644 index 0000000000..2f588626a5 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm @@ -0,0 +1,474 @@ +dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and +dnl exactly 3.625 cycles/limb on EV6... + +dnl This code was written in close cooperation with ev6 pipeline expert +dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. +dnl +dnl Register usages for unrolled loop: +dnl 0-3 mul's +dnl 4-7 acc's +dnl 8-15 mul results +dnl 20,21 carry's +dnl 22,23 save for stores + +dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. + +dnl We couldn't pair the loads, because the entangled schedule of the +dnl carry's has to happen on one side {0} of the machine. Note, the total +dnl use of U0, and the total use of L0 (after attending to the stores). +dnl which is part of the reason why.... + +dnl This is a great schedule for the d_cache, a poor schedule for the +dnl b_cache. The lockup on U0 means that any stall can't be recovered +dnl from. Consider a ldq in L1. say that load gets stalled because it +dnl collides with a fill from the b_Cache. On the next cycle, this load +dnl gets priority. If first looks at L0, and goes there. The instruction +dnl we intended for L0 gets to look at L1, which is NOT where we want +dnl it. It either stalls 1, because it can't go in L0, or goes there, and +dnl causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles +dnl back into the code! And, of course, put in prefetches. For the +dnl accumulator, lds, intent to modify. 
For the multiplier, you might +dnl want ldq, evict next, if you're not wanting to use it again soon. Use +dnl 256 ahead of present pointer value. At a place where we have an mt +dnl followed by a bookkeeping, put the bookkeeping in upper, and the +dnl prefetch into lower. + +dnl Note, the usage of physical registers per cycle is smoothed off, as +dnl much as possible. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd +dnl like not to have a ldq or stq to preceded a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later. + +dnl Optimization notes: +dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27? +dnl Reserved regs: r29 r30 r31 +dnl Free caller-saves regs in unrolled code: r24 r25 r28 +dnl We should swap some of the callee-saves regs for some of the free +dnl caller-saves regs, saving some overhead cycles. +dnl Most importantly, we should write fast code for the 0-7 case. +dnl The code we use there are for the 21164, and runs at 7 cycles/limb +dnl on the 21264. Should not be hard, if we write specialized code for +dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just +dnl need a jump table indexed by the low 3 bits of the count argument. + + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpult r18, 8, r1 + beq r1, $Large + + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r18, $Lend0b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r18, $Lend0a C jump if size was == 2 + + ALIGN(8) +$Loop0: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r18, 1, r18 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r18, $Loop0 +$Lend0a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + ret r31, (r26), 1 +$Lend0b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r0, r5, r0 + ret r31, (r26), 1 + +$Large: + lda $30, -240($30) + stq $9, 8($30) + stq $10, 16($30) + stq $11, 24($30) + stq $12, 32($30) + stq $13, 40($30) + stq $14, 48($30) + stq $15, 56($30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r0 + beq r20, $Lunroll + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r20, $Lend1b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r20, 
$Lend1a C jump if size was == 2 + + ALIGN(8) +$Loop1: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r20, 1, r20 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r20, $Loop1 + +$Lend1a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + br r31, $Lunroll +$Lend1b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r0, r5, r0 + +$Lunroll: + lda r17, -16(r17) C L1 bookkeeping + lda r16, -16(r16) C L1 bookkeeping + bis r0, r31, r12 + +C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldq r2, 16(r17) C L1 + ldq r3, 24(r17) C L1 + lda r18, -1(r18) C L1 bookkeeping + ldq r6, 16(r16) C L1 + ldq r7, 24(r16) C L1 + ldq r0, 32(r17) C L1 + mulq r19, r2, r13 C U1 + ldq r1, 40(r17) C L1 + umulh r19, r2, r14 C U1 + mulq r19, r3, r15 C U1 + lda r17, 64(r17) C L1 bookkeeping + ldq r4, 32(r16) C L1 + ldq r5, 40(r16) C L1 + umulh r19, r3, r8 C U1 + ldq r2, -16(r17) C L1 + mulq r19, r0, r9 C U1 + ldq r3, -8(r17) C L1 + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + mulq r19, r1, r11 C U1 + cmpult r6, r13, r20 C L0 lo add => carry + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 0(r17) C L1 + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + mulq r19, r3, r15 C U1 + addq r8, r21, r8 C U0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + ble r18, $Lend C U1 bookkeeping + +C ____ MAIN UNROLLED LOOP ____ + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 0(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + ldq r2, 16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + ldq r3, 24(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C 
U1 mt + lda r18, -1(r18) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, 16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, 24(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 32(r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 40(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C L0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r17, 64(r17) C L1 bookkeeping + addq r4, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 32(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 40(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + ldq r2, -16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + ldq r3, -8(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, 0(r16) C L0 + stq r23, 8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 0(r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + bis r31, r31, r31 C L1 mt + bgt r18, $Loop C U1 bookkeeping + +C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 0(r16) C L1 + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + mulq r19, r1, r11 C U1 + addq r12, r21, r12 C U0 hi mul + carry + cmpult r6, r13, r20 
C L0 lo add => carry + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + bis r31, r31, r31 C L0 st slosh + addq r8, r21, r8 C L0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + stq r22, 0(r16) C L0 + stq r23, 8(r16) C L1 + addq r12, r21, r0 C U0 hi mul + carry + + ldq $9, 8($30) + ldq $10, 16($30) + ldq $11, 24($30) + ldq $12, 32($30) + ldq $13, 40($30) + ldq $14, 48($30) + ldq $15, 56($30) + lda $30, 240($30) + ret r31, (r26), 1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h new file mode 100644 index 0000000000..7ea20577f8 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h @@ -0,0 +1,62 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-08-02. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 47 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 70 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 94 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 101 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 33 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 70 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h new file mode 100644 index 0000000000..054ff2fe5f --- /dev/null +++ b/rts/gmp/mpn/alpha/gmp-mparam.h @@ -0,0 +1,64 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the 21164 family. The 21264 will require + different values, since it has such quick multiplication. */ +/* Generated by tuneup.c, 2000-07-19. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 22 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 53 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 31 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 47 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 64 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 17 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 4 +#endif diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm new file mode 100644 index 0000000000..a921b32b3f --- /dev/null +++ b/rts/gmp/mpn/alpha/invert_limb.asm @@ -0,0 +1,345 @@ +dnl Alpha mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +dnl +dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c. +dnl The ideas are due to Peter L. Montgomery +dnl +dnl The table below uses 4096 bytes. The file mentioned above has an +dnl alternative function that doesn't require the table, but it runs 50% +dnl slower than this. 
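The routine below computes what GMP calls the inverse of a normalized limb: for a divisor d with its high bit set, the value floor((B^2 - 1)/d) - B with B = 2^64, which fits in a single limb. A minimal sketch of that quantity, leaning on the unsigned __int128 extension of GCC/Clang rather than the table-plus-refinement scheme of the assembly (the name invert_limb_ref is illustrative):

    #include <stdint.h>

    /* Requires d >= 2^63 (normalized).  floor((2^128 - 1)/d) - 2^64 equals
       floor((((2^64 - 1 - d) << 64) | (2^64 - 1)) / d), which fits in 64 bits. */
    static uint64_t invert_limb_ref (uint64_t d)
    {
      unsigned __int128 num = ((unsigned __int128)(uint64_t)~d << 64) | ~(uint64_t)0;
      return (uint64_t)(num / d);
    }

The 4096-byte table and the float-assisted refinement above exist precisely to avoid performing such a wide division directly.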
+ +include(`../config.m4') + +ASM_START() + +FLOAT64($C36,9223372036854775808.0) C 2^63 + +PROLOGUE_GP(mpn_invert_limb) + lda r30,-16(r30) + addq r16,r16,r1 + bne r1,$73 + lda r0,-1 + br r31,$Lend +$73: + srl r16,1,r1 + stq r1,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + lda r1,$C36 + ldt f10,0(r1) + divt f10,f1,f10 + lda r2,$invtab-4096 + srl r16,52,r1 + addq r1,r1,r1 + addq r1,r2,r1 + bic r1,6,r2 + ldq r2,0(r2) + bic r1,1,r1 + extwl r2,r1,r2 + sll r2,48,r0 + umulh r16,r0,r1 + addq r16,r1,r3 + stq r3,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + mult f1,f10,f1 + cvttqc f1,f1 + stt f1,0(r30) + ldq r4,0(r30) + subq r0,r4,r0 + umulh r16,r0,r1 + mulq r16,r0,r2 + addq r16,r1,r3 + bge r3,$Loop2 +$Loop1: addq r2,r16,r2 + cmpult r2,r16,r1 + addq r3,r1,r3 + addq r0,1,r0 + blt r3,$Loop1 +$Loop2: cmpult r2,r16,r1 + subq r0,1,r0 + subq r3,r1,r3 + subq r2,r16,r2 + bge r3,$Loop2 +$Lend: + lda r30,16(r30) + ret r31,(r26),1 +EPILOGUE(mpn_invert_limb) +DATASTART(`$invtab',4) + .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41 + .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46 + .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50 + .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d + .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e + .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483 + .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c + .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8 + .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8 + .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb + .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22 + .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d + .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b + .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad + .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2 + .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a + .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056 + .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95 + .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7 + .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d + .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965 + .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1 + .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600 + .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452 + .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7 + .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100 + .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b + .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9 + .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a + .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e + .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5 + .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f + .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb + .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a + .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c + .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111 + .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89 + .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03 + .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80 + .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff + .word 
0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981 + .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806 + .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d + .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516 + .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2 + .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231 + .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2 + .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55 + .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb + .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83 + .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e + .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb + .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a + .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb + .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f + .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445 + .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed + .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197 + .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044 + .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2 + .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3 + .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56 + .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b + .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2 + .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b + .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736 + .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3 + .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3 + .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374 + .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237 + .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc + .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3 + .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b + .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56 + .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23 + .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1 + .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1 + .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893 + .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767 + .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d + .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514 + .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee + .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9 + .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5 + .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084 + .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64 + .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45 + .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29 + .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e + .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5 + .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd + .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7 + .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2 + .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0 + .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e + .word 
0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e + .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370 + .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264 + .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158 + .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f + .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46 + .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40 + .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a + .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37 + .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34 + .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33 + .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934 + .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836 + .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739 + .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e + .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544 + .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b + .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354 + .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e + .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169 + .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076 + .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84 + .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93 + .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4 + .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6 + .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9 + .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add + .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3 + .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a + .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822 + .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b + .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656 + .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571 + .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e + .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac + .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb + .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec + .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d + .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030 + .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54 + .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79 + .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f + .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6 + .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee + .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18 + .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42 + .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e + .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a + .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8 + .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6 + .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626 + .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557 + .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489 + .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc + .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef + .word 
0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224 + .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a + .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091 + .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9 + .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01 + .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b + .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76 + .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1 + .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee + .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b + .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a + .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9 + .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea + .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b + .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d + .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0 + .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4 + .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539 + .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e + .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5 + .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c + .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255 + .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e + .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8 + .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033 + .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e + .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb + .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18 + .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66 + .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5 + .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05 + .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56 + .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7 + .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9 + .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c + .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0 + .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5 + .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a + .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0 + .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7 + .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e + .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7 + .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400 + .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a + .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4 + .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210 + .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c + .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8 + .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026 + .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84 + .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3 + .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42 + .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3 + .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04 + .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65 + .word 
0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8 + .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b + .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f + .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3 + .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958 + .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be + .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824 + .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b + .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3 + .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b + .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4 + .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e + .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498 + .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403 + .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f + .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db + .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247 + .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5 + .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123 + .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091 + .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001 + .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70 + .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1 + .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52 + .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3 + .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35 + .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8 + .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c + .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f + .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04 + .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79 + .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee + .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965 + .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db + .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853 + .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca + .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743 + .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc + .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635 + .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af + .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a + .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5 + .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420 + .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c + .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319 + .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296 + .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214 + .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192 + .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111 + .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090 + .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010 +DATAEND() +ASM_END() diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm new file mode 100644 index 0000000000..87c46f6fe7 --- /dev/null +++ b/rts/gmp/mpn/alpha/lshift.asm @@ -0,0 +1,104 @@ +dnl Alpha mpn_lshift -- Shift a number left. 
+ +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. + +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r17,8,r17 + subq r31,r19,r7 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + srl r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,-8(r17) + subq r16,8,r16 + subq r17,8,r17 + subq r20,1,r20 + sll r4,r19,r5 + srl r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,-8(r17) + subq r16,32,r16 + subq r18,4,r18 + sll r4,r19,r5 + srl r3,r7,r6 + + ldq r4,-16(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,24(r16) + srl r4,r7,r2 + + ldq r3,-24(r17) + sll r4,r19,r5 + bis r1,r2,r8 + stq r8,16(r16) + srl r3,r7,r6 + + ldq r4,-32(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,8(r16) + srl r4,r7,r2 + + subq r17,32,r17 + bis r1,r2,r8 + stq r8,0(r16) + + bgt r18,$Loop + +$Lend: sll r4,r19,r8 + stq r8,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm new file mode 100644 index 0000000000..46b8df34f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/mul_1.asm @@ -0,0 +1,71 @@ +dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. + +ASM_START() +PROLOGUE(mpn_mul_1) + ldq r2,0(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + stq r3,0(r16) + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r16,8,r16 C res_ptr++ + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: stq r3,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm new file mode 100644 index 0000000000..aa25eda54e --- /dev/null +++ b/rts/gmp/mpn/alpha/rshift.asm @@ -0,0 +1,102 @@ +dnl Alpha mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. 
+dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + addq r17,8,r17 + subq r31,r19,r7 + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + sll r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,0(r17) + addq r16,8,r16 + addq r17,8,r17 + subq r20,1,r20 + srl r4,r19,r5 + sll r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,0(r17) + addq r16,32,r16 + subq r18,4,r18 + srl r4,r19,r5 + sll r3,r7,r6 + + ldq r4,8(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-32(r16) + sll r4,r7,r2 + + ldq r3,16(r17) + srl r4,r19,r5 + bis r1,r2,r8 + stq r8,-24(r16) + sll r3,r7,r6 + + ldq r4,24(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-16(r16) + sll r4,r7,r2 + + addq r17,32,r17 + bis r1,r2,r8 + stq r8,-8(r16) + + bgt r18,$Loop + +$Lend: srl r4,r19,r8 + stq r8,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm new file mode 100644 index 0000000000..718f657141 --- /dev/null +++ b/rts/gmp/mpn/alpha/sub_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
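(The rshift code just above is the mirror image of lshift: it traverses the operand from the least significant limb upward and returns the bits shifted out at the bottom.) For the subtraction routine defined in this file, a minimal C sketch of the borrow propagation follows; it is an illustration only, with an invented name ref_sub_n and the 64-bit Alpha limb assumed for mp_limb_t.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* {rp,n} = {up,n} - {vp,n}; return the final borrow, 0 or 1.  The two
   wrap tests per limb play the role of the two cmpult instructions in
   the assembly below. */
mp_limb_t
ref_sub_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
{
  mp_limb_t brw = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      mp_limb_t v = vp[i] + brw;          /* fold the borrow into the subtrahend */
      mp_limb_t b1 = v < brw;             /* wrapped only if vp[i] was all ones */
      mp_limb_t d = up[i] - v;
      mp_limb_t b2 = up[i] < v;           /* the subtraction itself wrapped */
      rp[i] = d;
      brw = b1 | b2;
    }
  return brw;
}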
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm new file mode 100644 index 0000000000..caec1a720b --- /dev/null +++ b/rts/gmp/mpn/alpha/submul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
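Before the scheduled loop, here is a minimal C sketch of the submul_1 operation described above: multiply each source limb by v and subtract the product from the corresponding destination limb, carrying the borrow and the product's high limb along. Illustration only; ref_submul_1 is an invented name, mp_limb_t is assumed 64-bit, and unsigned __int128 (a compiler extension) stands in for mulq/umulh.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* {rp,n} -= {up,n} * v; return the borrow out of the top limb. */
mp_limb_t
ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
      mp_limb_t lo = (mp_limb_t) t;
      mp_limb_t hi = (mp_limb_t) (t >> 64);
      mp_limb_t r = rp[i];

      rp[i] = r - lo;
      cy = hi + (r < lo);                 /* product high limb plus this limb's borrow */
    }
  return cy;
}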
+ +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S new file mode 100644 index 0000000000..53814bbcb0 --- /dev/null +++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S @@ -0,0 +1,151 @@ + # Alpha 21064 __udiv_qrnnd + + # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
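The divide helper in this file is a classic one-bit-at-a-time shift-and-subtract loop. As a reading aid, here is a C sketch of its simple path, taken when the divisor's top bit is clear. Illustration only: the name ref_udiv_qrnnd is invented, mp_limb_t is assumed 64-bit, and the usual udiv_qrnnd precondition n1 < d is assumed so the quotient fits in one limb.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* Divide the two-limb value n1:n0 by d; store the remainder through rem
   and return the quotient.  Quotient bits are shifted into n0 from the
   bottom while the partial remainder is kept in n1. */
mp_limb_t
ref_udiv_qrnnd (mp_limb_t *rem, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
{
  int i;

  for (i = 0; i < 64; i++)                /* 16 unrolled groups of 4 in the asm */
    {
      mp_limb_t topbit = n0 >> 63;        /* cmplt n0,0,tmp */
      n1 = (n1 << 1) | topbit;
      n0 = n0 << 1;
      if (n1 >= d)                        /* cmpule d,n1,qb and the cmovne */
        {
          n1 -= d;
          n0 |= 1;                        /* record a quotient bit */
        }
    }
  *rem = n1;
  return n0;
}

The .Largedivisor path deals with a divisor whose top bit is set by halving the operands, running the same loop, and then correcting the quotient and remainder.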
+ + + .set noreorder + .set noat +.text + .align 3 + .globl __gmpn_udiv_qrnnd + .ent __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd: + .frame $30,0,$26,0 + .prologue 0 +#define cnt $2 +#define tmp $3 +#define rem_ptr $16 +#define n1 $17 +#define n0 $18 +#define d $19 +#define qb $20 + + ldiq cnt,16 + blt d,.Largedivisor + +.Loop1: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop1 + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.Largedivisor: + and n0,1,$4 + + srl n0,1,n0 + sll n1,63,tmp + or tmp,n0,n0 + srl n1,1,n1 + + and d,1,$6 + srl d,1,$5 + addq $5,$6,$5 + +.Loop2: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop2 + + addq n1,n1,n1 + addq $4,n1,n1 + bne $6,.LOdd + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.LOdd: + /* q' in n0. r' in n1 */ + addq n1,n0,n1 + cmpult n1,n0,tmp # tmp := carry from addq + beq tmp,.LLp6 + addq n0,1,n0 + subq n1,d,n1 +.LLp6: cmpult n1,d,tmp + bne tmp,.LLp7 + addq n0,1,n0 + subq n1,d,n1 +.LLp7: + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + + .end __gmpn_udiv_qrnnd diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm new file mode 100644 index 0000000000..44428ed5f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/umul.asm @@ -0,0 +1,39 @@ +dnl Currently unused. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
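Despite the "currently unused" note, this file supplies __umul_ppmm, a full 64x64 to 128 bit multiply: the code below stores the low half (mulq) through the first argument and returns the high half (umulh). A short C sketch of the same operation, illustration only, with mp_limb_t assumed 64-bit and the unsigned __int128 compiler extension standing in for the instruction pair:

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

mp_limb_t
ref_umul_ppmm (mp_limb_t *lo, mp_limb_t u, mp_limb_t v)
{
  unsigned __int128 p = (unsigned __int128) u * v;

  *lo = (mp_limb_t) p;                    /* the mulq result */
  return (mp_limb_t) (p >> 64);           /* the umulh result */
}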
+ + .set noreorder + .set volatile + .set noat + +.text + .align 3 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: +__umul_ppmm..ng: + .frame $30,0,$26,0 + .prologue 0 + mulq $17,$18,$1 + umulh $17,$18,$0 + stq $1,0($16) + ret $31,($26),1 + .end __umul_ppmm diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4 new file mode 100644 index 0000000000..7ff26c090c --- /dev/null +++ b/rts/gmp/mpn/alpha/unicos.m4 @@ -0,0 +1,63 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + `.ident dummy') + +define(`X',`^X$1') +define(`FLOAT64', + `dnl + .psect $1@crud,data +$1: .t_floating $2 + .endp') + +define(`PROLOGUE', + `dnl + .stack 192 ; What does this mean? Only Cray knows. + .psect $1@code,code,cache +$1::') +define(`PROLOGUE_GP', `PROLOGUE($1)') + +define(`EPILOGUE', + `dnl + .endp') + +define(`DATASTART', + `dnl + .psect $1@crud,data +$1:') +define(`DATAEND', + `dnl + .endp') + +define(`ASM_END', + `dnl + .end') + +define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop +define(`cvttqc',`cvttq/c') + +define(`ALIGN',`') ; Unicos assembler seems to align using garbage + +divert + diff --git a/rts/gmp/mpn/arm/add_n.S b/rts/gmp/mpn/arm/add_n.S new file mode 100644 index 0000000000..fb3f8f703b --- /dev/null +++ b/rts/gmp/mpn/arm/add_n.S @@ -0,0 +1,77 @@ +@ ARM mpn_add -- Add two limb vectors of the same length > 0 and store sum in +@ a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. 
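The ARM port uses 32-bit limbs, and the add_n code that follows keeps the running carry in the processor's C flag across its ldm/adcs/stm groups. A minimal C sketch of the operation, illustration only, with an invented name ref_add_n and mp_limb_t assumed to be this port's 32-bit limb:

typedef unsigned int mp_limb_t;           /* 32-bit limb assumed on ARM */

/* {rp,n} = {up,n} + {vp,n}; return the final carry, 0 or 1. */
mp_limb_t
ref_add_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c1 = s < up[i];           /* carry from the limb add */
      mp_limb_t r = s + cy;
      mp_limb_t c2 = r < s;               /* carry from adding the old carry */
      rp[i] = r;
      cy = c1 | c2;
    }
  return cy;
}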
+ +#define s r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_add_n + .type __gmpn_add_n,%function +__gmpn_add_n: + stmfd sp!, { r8, r9, lr } + movs n, n, lsr #1 + bcc skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + adds ip, ip, lr + str ip, [s], #4 +skip1: + tst n, #1 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + adcs r8, r8, ip + adcs r9, r9, lr + stmia s!, { r8, r9 } +skip2: + bics n, n, #1 + beq return + stmfd sp!, { r4, r5, r6, r7 } +add_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + adcs r4, r4, r8 + ldr r8, [s] /* Bring stuff into cache. */ + adcs r5, r5, r9 + adcs r6, r6, ip + adcs r7, r7, lr + stmia s!, { r4, r5, r6, r7 } + sub n, n, #2 + teq n, #0 + bne add_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + adc r0, n, #0 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_add_n, end - __gmpn_add_n diff --git a/rts/gmp/mpn/arm/addmul_1.S b/rts/gmp/mpn/arm/addmul_1.S new file mode 100644 index 0000000000..396fff77a3 --- /dev/null +++ b/rts/gmp/mpn/arm/addmul_1.S @@ -0,0 +1,89 @@ +@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and add the result to a +@ second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. 
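The file that follows defines __gmpn_addmul_1: each source limb is multiplied by v and the product is added into the corresponding destination limb, with the carry propagated; the umlal instruction supplies the 32x32 multiply with 64-bit accumulate. A minimal C sketch, illustration only, with an invented name ref_addmul_1 and 32-bit limbs assumed:

typedef unsigned int mp_limb_t;           /* 32-bit limb assumed on ARM */

/* {rp,n} += {up,n} * v; return the carry out of the top limb. */
mp_limb_t
ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      unsigned long long t = (unsigned long long) up[i] * v + rp[i] + cy;
      rp[i] = (mp_limb_t) t;              /* low half, as umlal leaves it */
      cy = (mp_limb_t) (t >> 32);         /* high half becomes the next carry */
    }
  return cy;
}

The mul_1 variant further on is the same loop without the accumulation: the products are simply stored, so the rp[i] term drops out of the sum above.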
+ +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define z r11 + +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_addmul_1 + .type __gmpn_addmul_1,%function +__gmpn_addmul_1: + stmfd sp!, { r8-r11, lr } + mov z, #0 + mov ip, #0 + movs n, n, lsr #1 + bcc skip1 + ldr lr, [a], #4 + ldr r9, [p] + umlal r9, ip, w, lr + str r9, [p], #4 +skip1: + movs n, n, lsr #1 + bcc skip2 + ldmia p, { r9, r10 } + adds r8, ip, r9 + adc r9, z, #0 + ldmia a!, { ip, lr } + umlal r8, r9, w, ip + adds r9, r9, r10 + adc ip, z, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + teq n, #0 + beq return + stmfd sp!, { r4-r7 } +addmul_loop: + ldmia p, { r5, r6, r7, r8 } + adds r4, ip, r5 + adc r5, z, #0 + ldmia a!, { r9, r10, ip, lr } + umlal r4, r5, w, r9 + adds r5, r5, r6 + adc r6, z, #0 + umlal r5, r6, w, r10 + adds r6, r6, r7 + adc r7, z, #0 + umlal r6, r7, w, ip + adds r7, r7, r8 + adc ip, z, #0 + umlal r7, ip, w, lr + subs n, n, #1 + stmia p!, { r4, r5, r6, r7 } + bne addmul_loop + ldmfd sp!, { r4-r7 } +return: + mov r0, ip + ldmfd sp!, { r8-r11, pc } +end: + .size __gmpn_addmul_1, end - __gmpn_addmul_1 diff --git a/rts/gmp/mpn/arm/gmp-mparam.h b/rts/gmp/mpn/arm/gmp-mparam.h new file mode 100644 index 0000000000..a35b0c7b66 --- /dev/null +++ b/rts/gmp/mpn/arm/gmp-mparam.h @@ -0,0 +1,34 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 21 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 48 +#endif diff --git a/rts/gmp/mpn/arm/mul_1.S b/rts/gmp/mpn/arm/mul_1.S new file mode 100644 index 0000000000..bae526a0f0 --- /dev/null +++ b/rts/gmp/mpn/arm/mul_1.S @@ -0,0 +1,81 @@ +@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and store the result +@ in a second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_mul_1 + .type __gmpn_mul_1,%function +__gmpn_mul_1: + stmfd sp!, { r8, r9, lr } + ands ip, n, #1 + beq skip1 + ldr lr, [a], #4 + umull r9, ip, w, lr + str r9, [p], #4 +skip1: + tst n, #2 + beq skip2 + mov r8, ip + ldmia a!, { ip, lr } + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r6, r7 } +mul_1_loop: + mov r6, ip + ldmia a!, { r8, r9, ip, lr } + ldr r7, [p] /* Bring stuff into cache. */ + mov r7, #0 + umlal r6, r7, w, r8 + mov r8, #0 + umlal r7, r8, w, r9 + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + subs n, n, #4 + stmia p!, { r6, r7, r8, r9 } + bne mul_1_loop + ldmfd sp!, { r6, r7 } +return: + mov r0, ip + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_mul_1, end - __gmpn_mul_1 diff --git a/rts/gmp/mpn/arm/sub_n.S b/rts/gmp/mpn/arm/sub_n.S new file mode 100644 index 0000000000..856505fe21 --- /dev/null +++ b/rts/gmp/mpn/arm/sub_n.S @@ -0,0 +1,79 @@ +@ ARM mpn_sub -- Subtract two limb vectors of the same length > 0 and store +@ difference in a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define d r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_sub_n + .type __gmpn_sub_n,%function +__gmpn_sub_n: + stmfd sp!, { r8, r9, lr } + subs ip, ip, ip + tst n, #1 + beq skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + subs ip, ip, lr + str ip, [d], #4 +skip1: + tst n, #2 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + sbcs r8, r8, ip + sbcs r9, r9, lr + stmia d!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r4, r5, r6, r7 } +sub_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + sbcs r4, r4, r8 + ldr r8, [d] /* Bring stuff into cache. 
*/ + sbcs r5, r5, r9 + sbcs r6, r6, ip + sbcs r7, r7, lr + stmia d!, { r4, r5, r6, r7 } + sub n, n, #4 + teq n, #0 + bne sub_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + sbc r0, r0, r0 + and r0, r0, #1 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_sub_n, end - __gmpn_sub_n diff --git a/rts/gmp/mpn/asm-defs.m4 b/rts/gmp/mpn/asm-defs.m4 new file mode 100644 index 0000000000..aa2024138b --- /dev/null +++ b/rts/gmp/mpn/asm-defs.m4 @@ -0,0 +1,1182 @@ +divert(-1) +dnl +dnl m4 macros for gmp assembly code, shared by all CPUs. +dnl +dnl These macros are designed for use with any m4 and have been used on +dnl GNU, FreeBSD, OpenBSD and SysV. +dnl +dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error +dnl messages. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Macros: +dnl +dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're +dnl m4 expansions. But new defining things like deflit() and defreg() are +dnl named like the builtin define(), and forloop() is named following the +dnl GNU m4 example on which it's based. +dnl +dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that +dnl option isn't going to be used, so there's no conflict or confusion. +dnl +dnl +dnl Comments in output: +dnl +dnl The m4 comment delimiters are left at # and \n, the normal assembler +dnl commenting for most CPUs. m4 passes comment text through without +dnl expanding macros in it, which is generally a good thing since it stops +dnl unexpected expansions and possible resultant errors. +dnl +dnl But note that when a quoted string is being read, a # isn't special, so +dnl apostrophes in comments in quoted strings must be avoided or they'll be +dnl interpreted as a closing quote mark. But when the quoted text is +dnl re-read # will still act like a normal comment, supressing macro +dnl expansion. +dnl +dnl For example, +dnl +dnl # apostrophes in comments that're outside quotes are ok +dnl # and using macro names like PROLOGUE is ok too +dnl ... +dnl ifdef(`PIC',` +dnl # but apostrophes aren't ok inside quotes +dnl # ^--wrong +dnl ... +dnl # though macro names like PROLOGUE are still ok +dnl ... +dnl ') +dnl +dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a +dnl quoted hash symbol), which will turn into # in the .s but get +dnl expansions done on that line. This can make the .s more readable to +dnl humans, but it won't make a blind bit of difference to the assembler. +dnl +dnl All the above applies, mutatis mutandis, when changecom() is used to +dnl select @ ! ; or whatever other commenting. 
+dnl +dnl +dnl Variations in m4 affecting gmp: +dnl +dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $# +dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though +dnl "foo()" sets $# to 1. This is worked around in various places. +dnl +dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to +dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0. +dnl See m4_length() below which works around this. +dnl +dnl translit() - GNU m4 accepts character ranges like A-Z, and the new +dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't. +dnl +dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and +dnl pops each, but GNU m4 only takes one argument. +dnl +dnl push back - BSD m4 has some limits on the amount of text that can be +dnl pushed back. The limit is reasonably big and so long as macros +dnl don't gratuitously duplicate big arguments it isn't a problem. +dnl Normally an error message is given, but sometimes it just hangs. +dnl +dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available, +dnl but BSD m4 doesn't (contrary to what the man page suggests) and +dnl instead ^ is exponentiation. +dnl +dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not +dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5). +dnl +dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31 +dnl (ie. -2147483648) gives "-(". Using -2147483648 within an +dnl expression is ok, it just can't be a final result. "-(" will of +dnl course upset parsing, with all sorts of strange effects. +dnl +dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on +dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See +dnl m4_lshift() and m4_rshift() below for workarounds. +dnl +dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string, +dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See +dnl m4wrap_prepend() below which brings uniformity to this. +dnl +dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and +dnl they're used here to make error messages more informative. GNU m4 +dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked +dnl around. +dnl +dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the +dnl filename in __file__, so care should be taken that no macro has +dnl the same name as a file, or an unwanted expansion will occur when +dnl printing an error or warning. +dnl +dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9 +dnl in eval(), making it pretty much unusable. This bug is confined +dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7). +dnl +dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features, +dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(), +dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be +dnl available, and "configure" will reject /usr/bin/m4 in favour of +dnl /usr/5bin/m4 (if necessary). +dnl +dnl The sparc code actually has modest m4 requirements currently and +dnl could manage with /usr/bin/m4, but there's no reason to put our +dnl macros through contortions when /usr/5bin/m4 is available or GNU +dnl m4 can be installed. + + +ifdef(`__ASM_DEFS_M4_INCLUDED__', +`m4_error(`asm-defs.m4 already included, dont include it twice +')m4exit(1)') +define(`__ASM_DEFS_M4_INCLUDED__') + + +dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4. 
+ +ifelse(eval(89),89,, +`errprint( +`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable. +This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7, +or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget +to configure with M4=/wherever/m4 if you install one of these in a directory +not in $PATH. +')m4exit(1)') + + +dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4. +dnl +dnl Unfortunately this test doesn't work when m4 is run in the normal way +dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4 +dnl takes "-" in "-D..." to mean read stdin, so it will look like it just +dnl hangs. But running "m4 asm-defs.m4" to try it out will work. +dnl +dnl We'd like to abort immediately on finding a problem, but unfortunately +dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill +dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment +dnl out some closing parentheses and kill it with "m4: arg stack overflow". + +define(m4_dollarhash_works_test,``$#'') +ifelse(m4_dollarhash_works_test(x),1,, +`errprint( +`This m4 doesnt support $# and cant be used for GMP asm processing. +If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that +or can get it, otherwise install GNU m4. Dont forget to configure with +M4=/wherever/m4 if you install in a directory not in $PATH. +')') +undefine(`m4_dollarhash_works_test') + + +dnl -------------------------------------------------------------------------- +dnl Basic error handling things. + + +dnl Usage: m4_dollarhash_1_if_noparen_p +dnl +dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU +dnl and SysV m4 give). + +define(m4_dollarhash_1_if_noparen_test,`$#') +define(m4_dollarhash_1_if_noparen_p, +eval(m4_dollarhash_1_if_noparen_test==1)) +undefine(`m4_dollarhash_1_if_noparen_test') + + +dnl Usage: m4wrap_prepend(string) +dnl +dnl Prepend the given string to what will be exapanded under m4wrap at the +dnl end of input. +dnl +dnl This macro exists to work around variations in m4wrap() behaviour in +dnl the various m4s (notes at the start of this file). Don't use m4wrap() +dnl directly since it will interfere with this scheme. + +define(m4wrap_prepend, +m4_assert_numargs(1) +`define(`m4wrap_string',`$1'defn(`m4wrap_string'))') + +m4wrap(`m4wrap_string') +define(m4wrap_string,`') + + +dnl Usage: m4_file_and_line +dnl +dnl Expand to the current file and line number, if the GNU m4 extensions +dnl __file__ and __line__ are available. +dnl +dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded, +dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to +dnl print. If m4_file_seen() has been called to note the last file seen, +dnl then that file at a big line number is used, otherwise "end of input" +dnl is used (although "end of input" won't parse as an error message). + +define(m4_file_and_line, +`ifdef(`__file__', +`ifelse(__file__`'__line__,`NONE0', +`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')', +`__file__: __line__: ')')') + + +dnl Usage: m4_errprint_commas(arg,...) +dnl +dnl The same as errprint(), but commas are printed between arguments +dnl instead of spaces. + +define(m4_errprint_commas, +`errprint(`$1')dnl +ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')') + + +dnl Usage: m4_error(args...) +dnl m4_warning(args...) 
+dnl +dnl Print an error message, using m4_errprint_commas, prefixed with the +dnl current filename and line number (if available). m4_error sets up to +dnl give an error exit at the end of processing, m4_warning just prints. +dnl These macros are the recommended way to print errors. +dnl +dnl The arguments here should be quoted in the usual way to prevent them +dnl being expanded when the macro call is read. (m4_error takes care not +dnl to do any further expansion.) +dnl +dnl For example, +dnl +dnl m4_error(`some error message +dnl ') +dnl +dnl which prints +dnl +dnl foo.asm:123: some error message +dnl +dnl or if __file__ and __line__ aren't available +dnl +dnl some error message +dnl +dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so +dnl emacs and other editors will recognise it in their normal error message +dnl parsing. + +define(m4_warning, +`m4_errprint_commas(m4_file_and_line`'$@)') + +define(m4_error, +`define(`m4_error_occurred',1)m4_warning($@)') + +define(`m4_error_occurred',0) + +dnl This m4wrap_prepend() is first, so it'll be executed last. +m4wrap_prepend( +`ifelse(m4_error_occurred,1, +`m4_error(`Errors occurred during m4 processing +')m4exit(1)')') + + +dnl Usage: m4_assert_numargs(num) +dnl +dnl Put this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check that num many arguments get passed +dnl to the macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs(2) +dnl `something `$1' and `$2' blah blah') +dnl +dnl Then a call like foo(one,two,three) will provoke an error like +dnl +dnl file:10: foo expected 2 arguments, got 3 arguments +dnl +dnl Here are some calls and how many arguments they're interpreted as passing. +dnl +dnl foo(abc,def) 2 +dnl foo(xyz) 1 +dnl foo() 0 +dnl foo -1 +dnl +dnl The -1 for no parentheses at all means a macro that's meant to be used +dnl that way can be checked with m4_assert_numargs(-1). For example, +dnl +dnl define(SPECIAL_SUFFIX, +dnl m4_assert_numargs(-1) +dnl `ifdef(`FOO',`_foo',`_bar')') +dnl +dnl But as an alternative see also deflit() below where parenthesized +dnl expressions following a macro are passed through to the output. +dnl +dnl Note that in BSD m4 there's no way to differentiate calls "foo" and +dnl "foo()", so in BSD m4 the distinction between the two isn't enforced. +dnl (In GNU and SysV m4 it can be checked, and is.) + + +dnl m4_assert_numargs is able to check its own arguments by calling +dnl assert_numargs_internal directly. +dnl +dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand +dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is +dnl done in other assert macros. +dnl +dnl $`#' leaves $# in the new macro being defined, and stops # being +dnl interpreted as a comment character. +dnl +dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is +dnl used. The space means that if there is a dnl it'll still work. 
+ +dnl Usage: m4_doublequote(x) expands to ``x'' +define(m4_doublequote, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''') + +define(m4_assert_numargs, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl +`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1')) +define(m4_assert_numargs_internal, +`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))') + +dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be +dnl distinguished then it's allowed to pass. +dnl +define(m4_assert_numargs_internal_check, +`ifelse(eval($2 == $3 + || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0, +`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3') +)')') + +dnl Called: m4_numargs_count($#,len(`$1')) +dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise +dnl $# args. +define(m4_numargs_count, +`ifelse($1,0, -1, +`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')') + +dnl Usage: m4_Narguments(N) +dnl "$1 argument" or "$1 arguments" with the plural according to $1. +define(m4_Narguments, +`$1 argument`'ifelse(`$1',1,,s)') + + +dnl -------------------------------------------------------------------------- +dnl Additional error checking things. + + +dnl Usage: m4_file_seen() +dnl +dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text. +dnl The basic __file__ macro comes out quoted, like `foo.asm', and +dnl m4_file_seen_last is defined like that too. +dnl +dnl This only needs to be used with something that could generate an error +dnl message in m4wrap text. The x86 PROLOGUE is the only such at the +dnl moment (at end of input its m4wrap checks for missing EPILOGUE). A few +dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE +dnl in the same file as the PROLOGUE. + +define(m4_file_seen, +m4_assert_numargs(0) +`ifelse(__file__,`NONE',, +`define(`m4_file_seen_last',m4_doublequote(__file__))')') + + +dnl Usage: m4_assert_onearg() +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that one argument is passed to the macro, but with that +dnl argument allowed to be empty. For example, +dnl +dnl define(foo, +dnl m4_assert_onearg() +dnl `blah blah $1 blah blah') +dnl +dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails. +dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV +dnl m4 can). + +define(m4_assert_onearg, +m4_assert_numargs(0) +`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ') + +dnl Called: m4_assert_onearg(`macroname',$#) +define(m4_assert_onearg_internal, +`ifelse($2,1,, +`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2') +)')') + + +dnl Usage: m4_assert_numargs_range(low,high) +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that between low and high many arguments get passed to the +dnl macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs_range(3,5) +dnl `mandatory $1 $2 $3 optional $4 $5 end') +dnl +dnl See m4_assert_numargs() for more info. 
+ +define(m4_assert_numargs_range, +m4_assert_numargs(2) +``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1')) +define(m4_assert_numargs_range_internal, +m4_assert_numargs(5) +`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))') + +dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. To ensure a `high' of -1 works, a fudge is applied to +dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished. +dnl +define(m4_assert_numargs_range_check, +m4_assert_numargs(4) +`ifelse(eval($2 <= $4 && + ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0, +`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4') +)')') + + +dnl Usage: m4_assert_defined(symbol) +dnl +dnl Put this unquoted on a line of its own at the start of a macro +dnl definition to add some code to check that the given symbol is defined +dnl when the macro is used. For example, +dnl +dnl define(foo, +dnl m4_assert_defined(`FOO_PREFIX') +dnl `FOO_PREFIX whatever') +dnl +dnl This is a convenient way to check that the user or ./configure or +dnl whatever has defined the things needed by a macro, as opposed to +dnl silently generating garbage. + +define(m4_assert_defined, +m4_assert_numargs(1) +``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '') + +dnl Called: m4_assert_defined_internal(`macroname',`define_required') +define(m4_assert_defined_internal, +m4_assert_numargs(2) +`ifdef(`$2',, +`m4_error(`$1 needs $2 defined +')')') + + +dnl Usage: m4_not_for_expansion(`SYMBOL') +dnl define_not_for_expansion(`SYMBOL') +dnl +dnl m4_not_for_expansion turns SYMBOL, if defined, into something which +dnl will give an error if expanded. For example, +dnl +dnl m4_not_for_expansion(`PIC') +dnl +dnl define_not_for_expansion is the same, but always makes a definition. +dnl +dnl These are for symbols that should be tested with ifdef(`FOO',...) +dnl rather than be expanded as such. They guard against accidentally +dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only +dnl catches this when FOO is defined, so be sure to test code both with and +dnl without each definition. + +define(m4_not_for_expansion, +m4_assert_numargs(1) +`ifdef(`$1',`define_not_for_expansion(`$1')')') + +define(define_not_for_expansion, +m4_assert_numargs(1) +`ifelse(defn(`$1'),,, +`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion() +')')dnl +define(`$1',`m4_not_for_expansion_internal(`$1')')') + +define(m4_not_for_expansion_internal, +`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)' +')') + + +dnl -------------------------------------------------------------------------- +dnl Various generic m4 things. + + +dnl Usage: m4_ifdef_anyof_p(`symbol',...) +dnl +dnl Expand to 1 if any of the symbols in the argument list are defined, or +dnl to 0 if not. + +define(m4_ifdef_anyof_p, +`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0, +`ifdef(`$1', 1, +`m4_ifdef_anyof_p(shift($@))')')') + + +dnl Usage: m4_length(string) +dnl +dnl Determine the length of a string. This is the same as len(), but +dnl always expands to a number, working around the BSD len() which +dnl evaluates to nothing given an empty argument. 
+ +define(m4_length, +m4_assert_onearg() +`eval(len(`$1')-0)') + + +dnl Usage: m4_stringequal_p(x,y) +dnl +dnl Expand to 1 or 0 according as strings x and y are equal or not. + +define(m4_stringequal_p, +`ifelse(`$1',`$2',1,0)') + + +dnl Usage: m4_incr_or_decr(n,last) +dnl +dnl Do an incr(n) or decr(n), whichever is in the direction of "last". +dnl Both n and last must be numbers of course. + +define(m4_incr_or_decr, +m4_assert_numargs(2) +`ifelse(eval($1<$2),1,incr($1),decr($1))') + + +dnl Usage: forloop(i, first, last, statement) +dnl +dnl Based on GNU m4 examples/forloop.m4, but extended. +dnl +dnl statement is expanded repeatedly, with i successively defined as +dnl +dnl first, first+1, ..., last-1, last +dnl +dnl Or if first > last, then it's +dnl +dnl first, first-1, ..., last+1, last +dnl +dnl If first == last, then one expansion is done. +dnl +dnl A pushdef/popdef of i is done to preserve any previous definition (or +dnl lack of definition). first and last are eval()ed and so can be +dnl expressions. +dnl +dnl forloop_first is defined to 1 on the first iteration, 0 on the rest. +dnl forloop_last is defined to 1 on the last iteration, 0 on the others. +dnl Nested forloops are allowed, in which case forloop_first and +dnl forloop_last apply to the innermost loop that's open. +dnl +dnl A simple example, +dnl +dnl forloop(i, 1, 2*2+1, `dnl +dnl iteration number i ... ifelse(forloop_first,1,FIRST) +dnl ') + + +dnl "i" and "statement" are carefully quoted, but "first" and "last" are +dnl just plain numbers once eval()ed. + +define(`forloop', +m4_assert_numargs(4) +`pushdef(`$1',eval(`$2'))dnl +pushdef(`forloop_first',1)dnl +pushdef(`forloop_last',0)dnl +forloop_internal(`$1',eval(`$3'),`$4')`'dnl +popdef(`forloop_first')dnl +popdef(`forloop_last')dnl +popdef(`$1')') + +dnl Called: forloop_internal(`var',last,statement) +define(`forloop_internal', +m4_assert_numargs(3) +`ifelse($1,$2, +`define(`forloop_last',1)$3', +`$3`'dnl +define(`forloop_first',0)dnl +define(`$1',m4_incr_or_decr($1,$2))dnl +forloop_internal(`$1',$2,`$3')')') + + +dnl Usage: m4_toupper(x) +dnl m4_tolower(x) +dnl +dnl Convert the argument string to upper or lower case, respectively. +dnl Only one argument accepted. +dnl +dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet +dnl is written out. + +define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz') +define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +define(m4_toupper, +m4_assert_onearg() +`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)') + +define(m4_tolower, +m4_assert_onearg() +`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)') + + +dnl Usage: m4_empty_if_zero(x) +dnl +dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an +dnl expression. +dnl +dnl This is useful for x86 addressing mode displacements since forms like +dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as +dnl foo(%ebx) could be defined with the following so it'll be empty if the +dnl expression comes out zero. +dnl +dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)') +dnl +dnl Naturally this shouldn't be done if, say, a computed jump depends on +dnl the code being a particular size. + +define(m4_empty_if_zero, +m4_assert_onearg() +`ifelse(eval($1),0,,eval($1))') + + +dnl Usage: m4_log2(x) +dnl +dnl Calculate a logarithm to base 2. +dnl x must be an integral power of 2, between 2**0 and 2**30. +dnl x is eval()ed, so it can be an expression. +dnl An error results if x is invalid. 
+dnl +dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range +dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval() +dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-(' +dnl means tests like eval(1<<31==(x)) would be necessary, but that then +dnl gives an unattractive explosion of eval() error messages if x isn't +dnl numeric. + +define(m4_log2, +m4_assert_numargs(1) +`m4_log2_internal(0,1,eval(`$1'))') + +dnl Called: m4_log2_internal(n,2**n,target) +define(m4_log2_internal, +m4_assert_numargs(3) +`ifelse($2,$3,$1, +`ifelse($1,30, +`m4_error(`m4_log2() argument too big or not a power of two: $3 +')', +`m4_log2_internal(incr($1),eval(2*$2),$3)')')') + + +dnl Usage: m4_div2_towards_zero +dnl +dnl m4 division is probably whatever a C signed division is, and C doesn't +dnl specify what rounding gets used on negatives, so this expression forces +dnl a rounding towards zero. + +define(m4_div2_towards_zero, +m4_assert_numargs(1) +`eval((($1) + ((($1)<0) & ($1))) / 2)') + + +dnl Usage: m4_lshift(n,count) +dnl m4_rshift(n,count) +dnl +dnl Calculate n shifted left or right by count many bits. Both n and count +dnl are eval()ed and so can be expressions. +dnl +dnl Negative counts are allowed and mean a shift in the opposite direction. +dnl Negative n is allowed and right shifts will be arithmetic (meaning +dnl divide by 2**count, rounding towards zero, also meaning the sign bit is +dnl duplicated). +dnl +dnl Use these macros instead of << and >> in eval() since the basic ccs +dnl SysV m4 doesn't have those operators. + +define(m4_rshift, +m4_assert_numargs(2) +`m4_lshift(`$1',-(`$2'))') + +define(m4_lshift, +m4_assert_numargs(2) +`m4_lshift_internal(eval(`$1'),eval(`$2'))') + +define(m4_lshift_internal, +m4_assert_numargs(2) +`ifelse(eval($2-0==0),1,$1, +`ifelse(eval($2>0),1, +`m4_lshift_internal(eval($1*2),decr($2))', +`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')') + + +dnl Usage: deflit(name,value) +dnl +dnl Like define(), but "name" expands like a literal, rather than taking +dnl arguments. For example "name(%eax)" expands to "value(%eax)". +dnl +dnl Limitations: +dnl +dnl $ characters in the value part must have quotes to stop them looking +dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See +dnl defreg() below for handling simple register definitions like $7 etc. +dnl +dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an +dnl error is generated when this happens, but in BSD m4 it will happen +dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or +dnl "name()", so there's no way to differentiate them. Because we want +dnl plain "name" to turn into plain "value", we end up with "name()" +dnl turning into plain "value" too. +dnl +dnl "name(foo)" will lose any whitespace after commas in "foo", for example +dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)". +dnl +dnl These parentheses oddities shouldn't matter in assembler text, but if +dnl they do the suggested workaround is to write "name ()" or "name (foo)" +dnl to stop the parentheses looking like a macro argument list. If a space +dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)". +dnl The `' is stripped when read, but again stops the parentheses looking +dnl like parameters. + +dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The +dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro +dnl created, not in deflit. 
+define(deflit, +m4_assert_numargs(2) +`define(`$1', +`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl +$2`'dnl +ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')') + +dnl Called: deflit_emptyargcheck(macroname,$#,`$1') +define(deflit_emptyargcheck, +`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1, +`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-incl.m4 for more information) +')')') + + +dnl Usage: m4_assert(`expr') +dnl +dnl Test a compile-time requirement with an m4 expression. The expression +dnl should be quoted, and will be eval()ed and expected to be non-zero. +dnl For example, +dnl +dnl m4_assert(`FOO*2+6 < 14') + +define(m4_assert, +m4_assert_numargs(1) +`ifelse(eval($1),1,, +`m4_error(`assertion failed: $1 +')')') + + +dnl -------------------------------------------------------------------------- +dnl Various assembler things, not specific to any particular CPU. +dnl + + +dnl Usage: include_mpn(`filename') +dnl +dnl Like include(), but adds a path to the mpn source directory. For +dnl example, +dnl +dnl include_mpn(`sparc64/addmul_1h.asm') + +define(include_mpn, +m4_assert_numargs(1) +m4_assert_defined(`CONFIG_TOP_SRCDIR') +`include(CONFIG_TOP_SRCDIR`/mpn/$1')') + + +dnl Usage: C comment ... +dnl +dnl "C" works like a FORTRAN-style comment character. This can be used for +dnl comments to the right of assembly instructions, where just dnl would +dnl remove the linefeed, and concatenate adjacent lines. +dnl +dnl "C" and/or "dnl" are useful when an assembler doesn't support comments, +dnl or where different assemblers for a particular CPU have different +dnl comment styles. The intermediate ".s" files will end up with no +dnl comments, just code. +dnl +dnl Using "C" is not intended to cause offence to anyone who doesn't like +dnl FORTRAN; but if that happens it's an unexpected bonus. + +define(C, ` +dnl') + + +dnl Various possible defines passed from the Makefile that are to be tested +dnl with ifdef() rather than be expanded. + +m4_not_for_expansion(`PIC') + +dnl aors_n +m4_not_for_expansion(`OPERATION_add_n') +m4_not_for_expansion(`OPERATION_sub_n') + +dnl aorsmul_n +m4_not_for_expansion(`OPERATION_addmul_1') +m4_not_for_expansion(`OPERATION_submul_1') + +dnl logops_n +m4_not_for_expansion(`OPERATION_and_n') +m4_not_for_expansion(`OPERATION_andn_n') +m4_not_for_expansion(`OPERATION_nand_n') +m4_not_for_expansion(`OPERATION_ior_n') +m4_not_for_expansion(`OPERATION_iorn_n') +m4_not_for_expansion(`OPERATION_nior_n') +m4_not_for_expansion(`OPERATION_xor_n') +m4_not_for_expansion(`OPERATION_xnor_n') + +dnl popham +m4_not_for_expansion(`OPERATION_popcount') +m4_not_for_expansion(`OPERATION_hamdist') + + +dnl Usage: m4_config_gmp_mparam(`symbol') +dnl +dnl Check that `symbol' is defined. If it isn't, issue an error and +dnl terminate immediately. The error message explains that the symbol +dnl should be in config.m4, copied from gmp-mparam.h. +dnl +dnl Processing is terminated immediately since missing something like +dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error +dnl messages. + +define(m4_config_gmp_mparam, +m4_assert_numargs(1) +`ifdef(`$1',, +`m4_error(`$1 is not defined. + "configure" should have extracted this from gmp-mparam.h and put it + in config.m4, but somehow this has failed. +')m4exit(1)')') + + +dnl Usage: defreg(name,reg) +dnl +dnl Give a name to a $ style register. 
For example, +dnl +dnl defreg(foo,$12) +dnl +dnl defreg() inserts an extra pair of quotes after the $ so that it's not +dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4 +dnl strips those quotes when foo is expanded. +dnl +dnl deflit() is used to make the new definition, so it will expand +dnl literally even if followed by parentheses ie. foo(99) will become +dnl $12(99). (But there's nowhere that would be used is there?) +dnl +dnl When making further definitions from existing defreg() macros, remember +dnl to use defreg() again to protect the $ in the new definitions too. For +dnl example, +dnl +dnl defreg(a0,$4) +dnl defreg(a1,$5) +dnl ... +dnl +dnl defreg(PARAM_DST,a0) +dnl +dnl This is only because a0 is expanding at the time the PARAM_DST +dnl definition is made, leaving a literal $4 that must be re-quoted. On +dnl the other hand in something like the following ra is only expanded when +dnl ret is used and its $`'31 protection will have its desired effect at +dnl that time. +dnl +dnl defreg(ra,$31) +dnl ... +dnl define(ret,`j ra') +dnl +dnl Note that only $n forms are meant to be used here, and something like +dnl 128($30) doesn't get protected and will come out wrong. + +define(defreg, +m4_assert_numargs(2) +`deflit(`$1', +substr(`$2',0,1)``''substr(`$2',1))') + + +dnl Usage: m4_instruction_wrapper(num) +dnl +dnl Put this, unquoted, on a line on its own, at the start of a macro +dnl that's a wrapper around an assembler instruction. It adds code to give +dnl a descriptive error message if the macro is invoked without arguments. +dnl +dnl For example, suppose jmp needs to be wrapped, +dnl +dnl define(jmp, +dnl m4_instruction_wrapper() +dnl m4_assert_numargs(1) +dnl `.byte 0x42 +dnl .long $1 +dnl nop') +dnl +dnl The point of m4_instruction_wrapper is to get a better error message +dnl than m4_assert_numargs would give if jmp is accidentally used as plain +dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no +dnl argument also provokes the error message. +dnl +dnl m4_instruction_wrapper should only be used with wrapped instructions +dnl that take arguments, since obviously something meant to be used as +dnl plain "ret", say, doesn't want to give an error when used that way. + +define(m4_instruction_wrapper, +m4_assert_numargs(0) +``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl +m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl +$`#',m4_doublequote($`'1))`dnl'') + +dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1) +define(m4_instruction_wrapper_internal, +`ifelse(eval($3<=1 && m4_length(`$4')==0),1, +`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details +')')') + + +dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES +dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES +dnl +dnl When code supports a variable amount of loop unrolling, the convention +dnl is to define UNROLL_COUNT to the number of limbs processed per loop. +dnl When testing code this can be varied to see how much the loop overhead +dnl is costing. For example, +dnl +dnl deflit(UNROLL_COUNT, 32) +dnl +dnl If the forloop() generating the unrolled loop has a pattern processing +dnl more than one limb, the convention is to express this with CHUNK_COUNT. +dnl For example, +dnl +dnl deflit(CHUNK_COUNT, 2) +dnl +dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT +dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow +dnl the new definition automatically. 
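+dnl
+dnl  (With the example UNROLL_COUNT of 32 above, these come out as
+dnl  UNROLL_LOG2 = 5, UNROLL_MASK = 31 and UNROLL_BYTES =
+dnl  32*BYTES_PER_MP_LIMB.)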
+dnl +dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as +dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes +dnl processed in each unrolled loop. +dnl +dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It +dnl exists only so the BYTES definitions here can be common to all CPUs. +dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be +dnl used because the code is only for a particular CPU, it doesn't need to +dnl be general. +dnl +dnl Note that none of these macros do anything except give conventional +dnl names to commonly used things. You still have to write your own +dnl expressions for a forloop() and the resulting address displacements. +dnl Something like the following would be typical for 4 bytes per limb. +dnl +dnl forloop(`i',0,UNROLL_COUNT-1,` +dnl deflit(`disp',eval(i*4)) +dnl ... +dnl ') +dnl +dnl Or when using CHUNK_COUNT, +dnl +dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,` +dnl deflit(`disp0',eval(i*CHUNK_COUNT*4)) +dnl deflit(`disp1',eval(disp0+4)) +dnl ... +dnl ') +dnl +dnl Clearly `i' can be run starting from 1, or from high to low or whatever +dnl best suits. + +deflit(UNROLL_LOG2, +m4_assert_defined(`UNROLL_COUNT') +`m4_log2(UNROLL_COUNT)') + +deflit(UNROLL_MASK, +m4_assert_defined(`UNROLL_COUNT') +`eval(UNROLL_COUNT-1)') + +deflit(UNROLL_BYTES, +m4_assert_defined(`UNROLL_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)') + +deflit(CHUNK_LOG2, +m4_assert_defined(`CHUNK_COUNT') +`m4_log2(CHUNK_COUNT)') + +deflit(CHUNK_MASK, +m4_assert_defined(`CHUNK_COUNT') +`eval(CHUNK_COUNT-1)') + +deflit(CHUNK_BYTES, +m4_assert_defined(`CHUNK_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)') + + +dnl Usage: MPN(name) +dnl +dnl Add MPN_PREFIX to a name. +dnl MPN_PREFIX defaults to "__gmpn_" if not defined. + +ifdef(`MPN_PREFIX',, +`define(`MPN_PREFIX',`__gmpn_')') + +define(MPN, +m4_assert_numargs(1) +`MPN_PREFIX`'$1') + + +dnl Usage: mpn_add_n, etc +dnl +dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each +dnl function that might be implemented in assembler is here. 
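+dnl
+dnl  For instance, with the default MPN_PREFIX above, mpn_add_n defined
+dnl  below expands to MPN(`add_n'), i.e. __gmpn_add_n, matching the
+dnl  corresponding #define in gmp.h.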
+ +define(define_mpn, +m4_assert_numargs(1) +`define(`mpn_$1',`MPN(`$1')')') + +define_mpn(add) +define_mpn(add_1) +define_mpn(add_n) +define_mpn(add_nc) +define_mpn(addmul_1) +define_mpn(addmul_1c) +define_mpn(addsub_n) +define_mpn(addsub_nc) +define_mpn(and_n) +define_mpn(andn_n) +define_mpn(bdivmod) +define_mpn(cmp) +define_mpn(com_n) +define_mpn(copyd) +define_mpn(copyi) +define_mpn(divexact_by3c) +define_mpn(divrem) +define_mpn(divrem_1) +define_mpn(divrem_1c) +define_mpn(divrem_2) +define_mpn(divrem_classic) +define_mpn(divrem_newton) +define_mpn(dump) +define_mpn(gcd) +define_mpn(gcd_1) +define_mpn(gcdext) +define_mpn(get_str) +define_mpn(hamdist) +define_mpn(invert_limb) +define_mpn(ior_n) +define_mpn(iorn_n) +define_mpn(kara_mul_n) +define_mpn(kara_sqr_n) +define_mpn(lshift) +define_mpn(lshiftc) +define_mpn(mod_1) +define_mpn(mod_1c) +define_mpn(mul) +define_mpn(mul_1) +define_mpn(mul_1c) +define_mpn(mul_basecase) +define_mpn(mul_n) +define_mpn(perfect_square_p) +define_mpn(popcount) +define_mpn(preinv_mod_1) +define_mpn(nand_n) +define_mpn(nior_n) +define_mpn(random) +define_mpn(random2) +define_mpn(rshift) +define_mpn(rshiftc) +define_mpn(scan0) +define_mpn(scan1) +define_mpn(set_str) +define_mpn(sqr_basecase) +define_mpn(sub_n) +define_mpn(sqrtrem) +define_mpn(sub) +define_mpn(sub_1) +define_mpn(sub_n) +define_mpn(sub_nc) +define_mpn(submul_1) +define_mpn(submul_1c) +define_mpn(toom3_mul_n) +define_mpn(toom3_sqr_n) +define_mpn(umul_ppmm) +define_mpn(udiv_qrnnd) +define_mpn(xnor_n) +define_mpn(xor_n) + +define(`ASM_START', + `') + +define(`PROLOGUE', + ` + TEXT + ALIGN(4) + GLOBL GSYM_PREFIX`$1' + TYPE(GSYM_PREFIX`$1',`function') +GSYM_PREFIX`$1':') + +define(`EPILOGUE', + ` + SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')') + +dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L +dnl will expand as the L macro, an infinite recursion. +define(`L',`defn(`LSYM_PREFIX')$1') + +define(`INT32', + ` + ALIGN(4) +$1: + W32 $2 + ') + +define(`INT64', + ` + ALIGN(8) +$1: + W32 $2 + W32 $3 + ') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. The alignment is specified in bytes, and +dnl will normally need to be a power of 2. The actual ".align" generated +dnl is either bytes or logarithmic according to what ./configure detects. +dnl +dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should +dnl be appended (this is for x86). + +define(ALIGN, +m4_assert_numargs(1) +m4_assert_defined(`ALIGN_LOGARITHMIC') +`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl +ifelse(ALIGN_FILL_0x90,yes,`, 0x90')') + + +dnl Usage: MULFUNC_PROLOGUE(function function...) +dnl +dnl A dummy macro which is grepped for by ./configure to know what +dnl functions a multi-function file is providing. Use this if there aren't +dnl explicit PROLOGUE()s for each possible function. +dnl +dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function +dnl names separated by spaces. + +define(`MULFUNC_PROLOGUE', +m4_assert_numargs(1) +`') + + +divert`'dnl diff --git a/rts/gmp/mpn/clipper/add_n.s b/rts/gmp/mpn/clipper/add_n.s new file mode 100644 index 0000000000..538a1caed0 --- /dev/null +++ b/rts/gmp/mpn/clipper/add_n.s @@ -0,0 +1,48 @@ +; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_add_n +___gmpn_add_n: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry-save register + +.Loop: loadw (r1),r4 + loadw (r2),r5 + addwc r6,r6 ; restore carry from r6 + addwc r5,r4 + storw r4,(r0) + subwc r6,r6 ; save carry in r6 + addq $4,r0 + addq $4,r1 + addq $4,r2 + subq $1,r3 + brne .Loop + + negw r6,r0 + loadw (sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/clipper/mul_1.s b/rts/gmp/mpn/clipper/mul_1.s new file mode 100644 index 0000000000..c0c756488c --- /dev/null +++ b/rts/gmp/mpn/clipper/mul_1.s @@ -0,0 +1,47 @@ +; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry limb + +.Loop: loadw (r1),r4 + mulwux r3,r4 + addw r6,r4 ; add old carry limb into low product limb + loadq $0,r6 + addwc r5,r6 ; propagate cy into high product limb + storw r4,(r0) + addq $4,r0 + addq $4,r1 + subq $1,r2 + brne .Loop + + movw r6,r0 + loadw 0(sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/clipper/sub_n.s b/rts/gmp/mpn/clipper/sub_n.s new file mode 100644 index 0000000000..44d8797289 --- /dev/null +++ b/rts/gmp/mpn/clipper/sub_n.s @@ -0,0 +1,48 @@ +; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. 
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_sub_n +___gmpn_sub_n: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry-save register + +.Loop: loadw (r1),r4 + loadw (r2),r5 + addwc r6,r6 ; restore carry from r6 + subwc r5,r4 + storw r4,(r0) + subwc r6,r6 ; save carry in r6 + addq $4,r0 + addq $4,r1 + addq $4,r2 + subq $1,r3 + brne .Loop + + negw r6,r0 + loadw (sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/cray/README b/rts/gmp/mpn/cray/README new file mode 100644 index 0000000000..8195c67e21 --- /dev/null +++ b/rts/gmp/mpn/cray/README @@ -0,0 +1,14 @@ +The (poorly optimized) code in this directory was originally written for a +j90 system, but finished on a c90. It should work on all Cray vector +computers. For the T3E and T3D systems, the `alpha' subdirectory at the +same level as the directory containing this file, is much better. + +* `+' seems to be faster than `|' when combining carries. + +* It is possible that the best multiply performance would be achived by + storing only 24 bits per element, and using lazy carry propagation. Before + calling i24mult, full carry propagation would be needed. + +* Supply tasking versions of the C loops. + + diff --git a/rts/gmp/mpn/cray/add_n.c b/rts/gmp/mpn/cray/add_n.c new file mode 100644 index 0000000000..1fdb394993 --- /dev/null +++ b/rts/gmp/mpn/cray/add_n.c @@ -0,0 +1,96 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + For Cray vector processors. + + Copyright (C) 1996, 2000 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_add_n (c, a, b, n) + mp_ptr c; + mp_srcptr a, b; + mp_size_t n; +{ + mp_size_t i; + mp_size_t nm1 = n - 1; + int more_carries = 0; + int carry_out; + + /* For small operands the non-vector code is faster. 
*/ + if (n < 16) + goto sequential; + + if (a == c || b == c) + { + TMP_DECL (marker); + TMP_MARK (marker); + if (c == a) + { + /* allocate temp space for a */ + mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (ax, a, n); + a = (mp_srcptr) ax; + } + if (c == b) + { + /* allocate temp space for b */ + mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (bx, b, n); + b = (mp_srcptr) bx; + } + carry_out = mpn_add_n (c, a, b, n); + TMP_FREE (marker); + return carry_out; + } + + carry_out = a[nm1] + b[nm1] < a[nm1]; + +#pragma _CRI ivdep /* Cray PVP systems */ + for (i = nm1; i > 0; i--) + { + int cy_in; + cy_in = a[i - 1] + b[i - 1] < a[i - 1]; + c[i] = a[i] + b[i] + cy_in; + more_carries += c[i] < cy_in; + } + c[0] = a[0] + b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai + t; + cy += ci < ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/rts/gmp/mpn/cray/addmul_1.c b/rts/gmp/mpn/cray/addmul_1.c new file mode 100644 index 0000000000..031b4e8e8d --- /dev/null +++ b/rts/gmp/mpn/cray/addmul_1.c @@ -0,0 +1,46 @@ +/* mpn_addmul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1, tp; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_add_n (tp, rp, p0, n); + rp[0] = tp[0]; + cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/cray/gmp-mparam.h b/rts/gmp/mpn/cray/gmp-mparam.h new file mode 100644 index 0000000000..14f7b8e05b --- /dev/null +++ b/rts/gmp/mpn/cray/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 64 +#define BITS_PER_SHORTINT 32 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/cray/mul_1.c b/rts/gmp/mpn/cray/mul_1.c new file mode 100644 index 0000000000..0c8750b4ac --- /dev/null +++ b/rts/gmp/mpn/cray/mul_1.c @@ -0,0 +1,44 @@ +/* mpn_mul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + rp[0] = p0[0]; + cy_limb = mpn_add_n (rp + 1, p0 + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/cray/mulww.f b/rts/gmp/mpn/cray/mulww.f new file mode 100644 index 0000000000..99507c1e44 --- /dev/null +++ b/rts/gmp/mpn/cray/mulww.f @@ -0,0 +1,54 @@ +c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +c Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +c This file is part of the GNU MP Library. + +c The GNU MP Library is free software; you can redistribute it and/or +c modify it under the terms of the GNU Lesser General Public License as +c published by the Free Software Foundation; either version 2.1 of the +c License, or (at your option) any later version. + +c The GNU MP Library is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +c Lesser General Public License for more details. + +c You should have received a copy of the GNU Lesser General Public +c License along with the GNU MP Library; see the file COPYING.LIB. If +c not, write to the Free Software Foundation, Inc., 59 Temple Place - +c Suite 330, Boston, MA 02111-1307, USA. 
+ +c p1[] = hi(a[]*s); the upper limbs of each product +c p0[] = low(a[]*s); the corresponding lower limbs +c n is number of limbs in the vectors + + subroutine gmpn_mulww(p1,p0,a,n,s) + integer*8 p1(0:*),p0(0:*),a(0:*),s + integer n + + integer*8 a0,a1,a2,s0,s1,s2,c + integer*8 ai,t0,t1,t2,t3,t4 + + s0 = shiftl(and(s,4194303),24) + s1 = shiftl(and(shiftr(s,22),4194303),24) + s2 = shiftl(and(shiftr(s,44),4194303),24) + + do i = 0,n-1 + ai = a(i) + a0 = shiftl(and(ai,4194303),24) + a1 = shiftl(and(shiftr(ai,22),4194303),24) + a2 = shiftl(and(shiftr(ai,44),4194303),24) + + t0 = i24mult(a0,s0) + t1 = i24mult(a0,s1)+i24mult(a1,s0) + t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0) + t3 = i24mult(a1,s2)+i24mult(a2,s1) + t4 = i24mult(a2,s2) + + p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0 + c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+ + $ shiftl(and(t2,1048575),22),42) + p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c + end do + end diff --git a/rts/gmp/mpn/cray/mulww.s b/rts/gmp/mpn/cray/mulww.s new file mode 100644 index 0000000000..890cdcf94d --- /dev/null +++ b/rts/gmp/mpn/cray/mulww.s @@ -0,0 +1,245 @@ +* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +* Copyright (C) 1996, 2000 Free Software Foundation, Inc. +* This file is generated from mulww.f in this same directory. + +* This file is part of the GNU MP Library. + +* The GNU MP Library is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public License as +* published by the Free Software Foundation; either version 2.1 of the +* License, or (at your option) any later version. + +* The GNU MP Library is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. + +* You should have received a copy of the GNU Lesser General Public +* License along with the GNU MP Library; see the file COPYING.LIB. If +* not, write to the Free Software Foundation, Inc., 59 Temple Place - +* Suite 330, Boston, MA 02111-1307, USA. 
+ + IDENT GMPN_MULWW +********************************************** +* Assemble with Cal Version 2.0 * +* * +* Generated by CFT77 6.0.4.19 * +* on 06/27/00 at 04:34:13 * +* * +********************************************** +* ALLOW UNDERSCORES IN IDENTIFIERS + EDIT OFF + FORMAT NEW +@DATA SECTION DATA,CM +@DATA = W.* + CON O'0000000000040000000000 + CON O'0435152404713723252514 ;GMPN_MUL 1 + CON O'0535270000000000000000 ;WW 1 + CON O'0000000000000001200012 ;trbk tbl 1 + VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1 + CON O'0014003000000000001416 ;trbk tbl 1 + CON O'0000000000000000000011 ;trbk tbl 1 + CON O'0000000000000000000215 ;trbk tbl 1 + BSSZ 1 ;trbk tbl 1 +@CODE SECTION CODE +@CODE = P.* +L3 = P.* ; 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;stk top 1 + B01 A6 ;arg base 1 + A7 P.L4 ;ofrn rtn 1 + B00 A7 ;return 1 + A6 @DATA ; 1 + J $STKOFEN ;$STKOFEN 1 +GMPN_MULWW = P.* ; 1 + A0 @DATA+3 ;(trbk) 1 + B77 A0 ;(trbk) 1 + A1 13 ;num Bs 1 + A0 B66 ;stk top 1 + A2 B66 ;stk tmp 1 + A4 B67 ;stk limt 1 + 0,A0 B77,A1 ; 1 + A7 782 ;stk size 1 + A3 A2+A7 ; 1 + A0 A4-A3 ; 1 + JAM L3 ;overflow 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;new top 1 + B01 A6 ;arg base 1 +L4 = P.* ;ofrn rtn 1 + A7 B07 ;regs 14 + S7 0,A7 ; 14 + A6 B10 ;regs 9 + S6 0,A6 ; 9 + S5 1 ; 14 + S4 <22 ; 9 + S7 S7-S5 ; 14 + S5 #S7 ; 14 + T00 S6 ;regs 10 + S6 S6>22 ; 10 + S7 T00 ;regs 11 + S7 S7>44 ; 11 + S3 T00 ;regs 9 + S3 S3&S4 ; 9 + S6 S6&S4 ; 10 + S7 S7&S4 ; 11 + S3 S3<24 ; 9 + S6 S6<24 ; 10 + S7 S7<24 ; 11 + S0 S5 ;regs 14 + S4 S5 ;regs 14 + S1 S6 ;regs 14 + S2 S3 ;regs 14 + S3 S7 ;regs 14 + JSP L5 ; 14 +L6 = P.* ; 14 + S7 -S4 ; 14 + A2 S7 ;regs 14 + VL A2 ;regs 14 + A3 B06 ;s_bt_sp 14 + A5 B05 ;s_bt_sp 14 + A4 B04 ;s_bt_sp 14 + A1 VL ; 14 + A2 S4 ;regs 14 +L7 = P.* ; 14 + A0 A3 ;regs 15 + VL A1 ;regs 15 + V7 ,A0,1 ; 15 + B11 A5 ;s_bt_sp 15 + A7 22 ; 17 + B12 A4 ;s_bt_sp 17 + V6 V7>A7 ; 17 + B13 A3 ;s_bt_sp 17 + S7 <22 ; 17 + A3 B02 ;s_bt_sp 17 + V5 S7&V6 ; 17 + A6 24 ; 17 + V4 V5<A6 ; 17 + V3 S1*FV4 ; 22 + V2 S7&V7 ; 16 + V1 V2<A6 ; 16 + V0 S3*FV1 ; 22 + V6 V0+V3 ; 22 + A5 44 ; 18 + V5 V7>A5 ; 18 + V2 S1*FV1 ; 21 + V3 S7&V5 ; 18 + A0 14 ; 34 + B77 A0 ;regs 34 + A4 B77 ;regs 34 + A0 A4+A3 ; 34 + ,A0,1 V2 ;v_ld_str 34 + V0 V3<A6 ; 18 + V7 S2*FV1 ; 20 + A4 142 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V7 ;v_ld_str 34 + V5 V7>A7 ; 28 + V2 S2*FV0 ; 22 + V3 V6+V2 ; 22 + S7 <20 ; 28 + V1 S7&V3 ; 28 + A4 270 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + A4 14 ; 34 + A0 A4+A3 ; 34 + V7 ,A0,1 ;v_ld_str 34 + V6 V1<A7 ; 28 + V2 S2*FV4 ; 21 + V0 V7+V2 ; 21 + S7 <42 ; 28 + V1 S7&V0 ; 28 + A4 398 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + V7 S3*FV4 ; 23 + V2 V5+V1 ; 28 + V0 V3<A5 ; 26 + A5 526 ; 34 + A0 A5+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + A5 270 ; 34 + A0 A5+A3 ; 34 + V4 ,A0,1 ;v_ld_str 34 + V5 V2+V6 ; 28 + A5 20 ; 32 + V1 V3>A5 ; 32 + V0 S1*FV4 ; 23 + A5 654 ; 34 + A0 A5+A3 ; 34 + ,A0,1 V1 ;v_ld_str 34 + V6 V7+V0 ; 23 + A5 2 ; 32 + V2 V6<A5 ; 32 + V3 S3*FV4 ; 24 + A5 142 ; 34 + A0 A5+A3 ; 34 + V1 ,A0,1 ;v_ld_str 34 + A5 526 ; 34 + A0 A5+A3 ; 34 + V7 ,A0,1 ;v_ld_str 34 + V0 V1+V7 ; 26 + V6 V3<A6 ; 32 + V4 V6+V2 ; 32 + A6 42 ; 28 + V7 V5>A6 ; 28 + A5 654 ; 34 + CPW ;cmr_vrsp 34 + A0 A5+A3 ; 34 + V1 ,A0,1 ;v_ld_str 34 + A5 398 ; 34 + A0 A5+A3 ; 34 + V3 ,A0,1 ;v_ld_str 34 + V6 V4+V1 ; 32 + V2 V3>A6 ; 32 + V5 V6+V2 ; 32 + A6 B12 ;s_bt_sp 32 + V4 V3<A7 
; 26 + A7 B13 ;regs 34 + A3 A7+A1 ; 34 + A7 B11 ;regs 34 + A5 A7+A1 ; 34 + A4 A6+A1 ; 34 + A7 A2+A1 ; 34 + A0 A2+A1 ; 34 + A2 128 ; 34 + B13 A0 ;s_bt_sp 34 + V1 V0+V4 ; 26 + A0 B11 ;regs 31 + ,A0,1 V1 ; 31 + V6 V5+V7 ; 33 + A0 A6 ;regs 33 + ,A0,1 V6 ; 33 + A0 B13 ;regs 34 + A1 A2 ;regs 34 + A2 A7 ;regs 34 + JAN L7 ; 34 +L8 = P.* ; 34 +L5 = P.* ; 34 + S1 0 ; 35 + A0 B02 ; 35 + A2 B02 ; 35 + A1 13 ;num Bs 35 + B66 A0 ; 35 + B77,A1 0,A0 ; 35 + A0 A2+A1 ; 35 + A1 1 ;num Ts 35 + T00,A1 0,A0 ; 35 + J B00 ; 35 + EXT $STKOFEN:p + ENTRY GMPN_MULWW + END diff --git a/rts/gmp/mpn/cray/sub_n.c b/rts/gmp/mpn/cray/sub_n.c new file mode 100644 index 0000000000..902e07a727 --- /dev/null +++ b/rts/gmp/mpn/cray/sub_n.c @@ -0,0 +1,97 @@ +/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + For Cray vector processors. + + Copyright (C) 1996, 2000 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_sub_n (c, a, b, n) + mp_ptr c; + mp_srcptr a, b; + mp_size_t n; +{ + mp_size_t i; + mp_size_t nm1 = n - 1; + int more_carries = 0; + int carry_out; + + /* For small operands the non-vector code is faster. */ + if (n < 16) + goto sequential; + + if (a == c || b == c) + { + TMP_DECL (marker); + TMP_MARK (marker); + if (c == a) + { + /* allocate temp space for a */ + mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (ax, a, n); + a = (mp_srcptr) ax; + } + if (c == b) + { + /* allocate temp space for b */ + mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (bx, b, n); + b = (mp_srcptr) bx; + } + carry_out = mpn_sub_n (c, a, b, n); + TMP_FREE (marker); + return carry_out; + } + + carry_out = a[nm1] < b[nm1]; + +#pragma _CRI ivdep /* Cray PVP systems */ + for (i = nm1; i > 0; i--) + { + int cy_in; mp_limb_t t; + cy_in = a[i - 1] < b[i - 1]; + t = a[i] - b[i]; + more_carries += t < cy_in; + c[i] = t - cy_in; + } + c[0] = a[0] - b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai - t; + cy += ci > ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/rts/gmp/mpn/cray/submul_1.c b/rts/gmp/mpn/cray/submul_1.c new file mode 100644 index 0000000000..4d2fb13c62 --- /dev/null +++ b/rts/gmp/mpn/cray/submul_1.c @@ -0,0 +1,46 @@ +/* mpn_submul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1, tp; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_sub_n (tp, rp, p0, n); + rp[0] = tp[0]; + cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c new file mode 100644 index 0000000000..5fcb7e4835 --- /dev/null +++ b/rts/gmp/mpn/generic/add_n.c @@ -0,0 +1,62 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_add_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. 
*/ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to one addend */ + cy = (y < cy); /* get out carry from that addition */ + y = x + y; /* add other addend */ + cy = (y < x) + cy; /* get out carry from that add, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c new file mode 100644 index 0000000000..746ae31307 --- /dev/null +++ b/rts/gmp/mpn/generic/addmul_1.c @@ -0,0 +1,65 @@ +/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, add the S1_SIZE least significant limbs of the product to the + limb vector pointed to by RES_PTR. Return the most significant limb of + the product, adjusted for carry-out from the addition. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x + prod_low; + cy_limb += (prod_low < x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c new file mode 100644 index 0000000000..c9bab3ef60 --- /dev/null +++ b/rts/gmp/mpn/generic/addsub_n.c @@ -0,0 +1,167 @@ +/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifndef L1_CACHE_SIZE +#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */ +#endif + +#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6) + + +/* mpn_addsub_n. + r1[] = s1[] + s2[] + r2[] = s1[] - s2[] + All operands have n limbs. + In-place operations allowed. */ +mp_limb_t +#if __STDC__ +mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n) +#else +mpn_addsub_n (r1p, r2p, s1p, s2p, n) + mp_ptr r1p, r2p; + mp_srcptr s1p, s2p; + mp_size_t n; +#endif +{ + mp_limb_t acyn, acyo; /* carry for add */ + mp_limb_t scyn, scyo; /* carry for subtract */ + mp_size_t off; /* offset in operands */ + mp_size_t this_n; /* size of current chunk */ + + /* We alternatingly add and subtract in chunks that fit into the (L1) + cache. Since the chunks are several hundred limbs, the function call + overhead is insignificant, but we get much better locality. */ + + /* We have three variant of the inner loop, the proper loop is chosen + depending on whether r1 or r2 are the same operand as s1 or s2. */ + + if (r1p != s1p && r1p != s2p) + { + /* r1 is not identical to either input operand. We can therefore write + to r1 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + } + } + else if (r2p != s1p && r2p != s2p) + { + /* r2 is not identical to either input operand. We can therefore write + to r2 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif + } + } + else + { + /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa) + Need temporary storage. 
*/ + mp_limb_t tp[PART_SIZE]; + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + MPN_COPY (r1p + off, tp, this_n); + } + } + + return 2 * acyo + scyo; +} + +#ifdef MAIN +#include <stdlib.h> +#include <stdio.h> +#include "timing.h" + +long cputime (); + +int +main (int argc, char **argv) +{ + mp_ptr r1p, r2p, s1p, s2p; + double t; + mp_size_t n; + + n = strtol (argv[1], 0, 0); + + r1p = malloc (n * BYTES_PER_MP_LIMB); + r2p = malloc (n * BYTES_PER_MP_LIMB); + s1p = malloc (n * BYTES_PER_MP_LIMB); + s2p = malloc (n * BYTES_PER_MP_LIMB); + TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); + printf (" separate add and sub: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n)); + printf ("combined addsub separate variables: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r1 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r2 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n)); + printf (" combined addsub in-place: %.3f\n", t); + + return 0; +} +#endif diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c new file mode 100644 index 0000000000..c4bcb414e6 --- /dev/null +++ b/rts/gmp/mpn/generic/bdivmod.c @@ -0,0 +1,120 @@ +/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d). + + Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and + returns the high d%BITS_PER_MP_LIMB bits of Q as the result. + + Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the + low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows + the limb vectors at qp to overwrite the low limbs at up, provided qp <= up. + + Preconditions: + 1. V is odd. + 2. usize * BITS_PER_MP_LIMB >= d. + 3. If Q and U overlap, qp <= up. 
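+
+   Sketch of the idea, as implemented below: for d == BITS_PER_MP_LIMB the
+   single quotient limb is q = up[0] * v_inv mod 2^BITS_PER_MP_LIMB, with
+   v_inv the inverse of vp[0] obtained from modlimb_invert; for larger d
+   the main loop repeats this limb by limb, subtracting q*V from U with
+   mpn_submul_1 as it goes.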
+ + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + References: + T. Jebelean, An algorithm for exact division, Journal of Symbolic + Computation, v. 15, 1993, pp. 169-180. + + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize, unsigned long int d) +#else +mpn_bdivmod (qp, up, usize, vp, vsize, d) + mp_ptr qp; + mp_ptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; + unsigned long int d; +#endif +{ + mp_limb_t v_inv; + + /* 1/V mod 2^BITS_PER_MP_LIMB. */ + modlimb_invert (v_inv, vp[0]); + + /* Fast code for two cases previously used by the accel part of mpn_gcd. + (Could probably remove this now it's inlined there.) */ + if (usize == 2 && vsize == 2 && + (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB)) + { + mp_limb_t hi, lo; + mp_limb_t q = up[0] * v_inv; + umul_ppmm (hi, lo, q, vp[0]); + up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q; + if (d == 2*BITS_PER_MP_LIMB) + q = up[1] * v_inv, up[1] = 0, qp[1] = q; + return 0; + } + + /* Main loop. */ + while (d >= BITS_PER_MP_LIMB) + { + mp_limb_t q = up[0] * v_inv; + mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + d -= BITS_PER_MP_LIMB; + up += 1, usize -= 1; + *qp++ = q; + } + + if (d) + { + mp_limb_t b; + mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1); + if (q <= 1) + { + if (q == 0) + return 0; + else + b = mpn_sub_n (up, up, vp, MIN (usize, vsize)); + } + else + b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + return q; + } + + return 0; +} diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c new file mode 100644 index 0000000000..d234b22af5 --- /dev/null +++ b/rts/gmp/mpn/generic/bz_divrem_n.c @@ -0,0 +1,153 @@ +/* mpn_bz_divrem_n and auxilliary routines. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +/* +[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler, + Technical report MPI-I-98-1-022, october 1998. + http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz +*/ + +static mp_limb_t mpn_bz_div_3_halves_by_2 + _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)); + + +/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than + div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way, + i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */ +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +#if 0 +static +unused_mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +{ + /* This might be useful: */ + if (qxn != 0) + { + mp_limb_t c; + mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_COPY (tp + qxn - nn, np, nn); + MPN_ZERO (tp, qxn); + c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn); + /* Maybe copy proper part of tp to np? Documentation is unclear about + the returned np value when qxn != 0 */ + return c; + } +} +#endif + + +/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n) + by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n). + Returns most significant limb of the quotient, which is 0 or 1. + Requires that the most significant bit of the divisor is set. */ + +mp_limb_t +#if __STDC__ +mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_divrem_n (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_limb_t qhl, cc; + + if (n % 2 != 0) + { + qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1); + cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]); + cc = mpn_sub_1 (np + n, np + n, 1, cc); + if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]); + while (cc) + { + qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1); + cc -= mpn_add_n (np + 1, np + 1, dp, n); + } + qhl += mpn_add_1 (qp + 1, qp + 1, n - 1, + mpn_sb_divrem_mn (qp, np, n + 1, dp, n)); + } + else + { + mp_size_t n2 = n/2; + qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2); + qhl += mpn_add_1 (qp + n2, qp + n2, n2, + mpn_bz_div_3_halves_by_2 (qp, np, dp, n2)); + } + return qhl; +} + + +/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n), + the remainder in (np, 2n) */ + +static mp_limb_t +#if __STDC__ +mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_div_3_halves_by_2 (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_size_t twon = n + n; + mp_limb_t qhl, cc; + mp_ptr tmp; + TMP_DECL (marker); + + TMP_MARK (marker); + if (n < BZ_THRESHOLD) + qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n); + else + qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n); + tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB); + mpn_mul_n (tmp, qp, dp, n); + cc = mpn_sub_n (np, np, tmp, twon); + TMP_FREE (marker); + if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n); + while (cc) + { + qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1); + cc -= mpn_add_n (np, np, dp, twon); + } + return qhl; +} diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c new file mode 100644 index 0000000000..8e9792f54e --- /dev/null +++ b/rts/gmp/mpn/generic/cmp.c @@ -0,0 +1,56 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE. + There are no restrictions on the relative sizes of + the two arguments. + Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */ + +int +#if __STDC__ +mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size) +#else +mpn_cmp (op1_ptr, op2_ptr, size) + mp_srcptr op1_ptr; + mp_srcptr op2_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + mp_limb_t op1_word, op2_word; + + for (i = size - 1; i >= 0; i--) + { + op1_word = op1_ptr[i]; + op2_word = op2_ptr[i]; + if (op1_word != op2_word) + goto diff; + } + return 0; + diff: + /* This can *not* be simplified to + op2_word - op2_word + since that expression might give signed overflow. */ + return (op1_word > op2_word) ? 1 : -1; +} diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c new file mode 100644 index 0000000000..a2fb552bfa --- /dev/null +++ b/rts/gmp/mpn/generic/diveby3.c @@ -0,0 +1,77 @@ +/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#include "gmp.h" +#include "gmp-impl.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + + +/* The "c += ..."s are adding the high limb of 3*l to c. That high limb + will be 0, 1 or 2. Doing two separate "+="s seems to turn out better + code on gcc (as of 2.95.2 at least). + + When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a + limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3 + gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of + only 0 or 1 respectively. Hence the carry out of each stage and for the + return value is always only 0, 1 or 2. 
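+
+   As a quick sanity check of INVERSE_3: 3 * 0xAAAAAAAB = 0x200000001, which
+   is 1 mod 2^32, so for an exact multiple of 3 such as 21 the product
+   21 * 0xAAAAAAAB is 7 mod 2^32: the quotient appears directly, with no
+   division instruction needed.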
*/ + +mp_limb_t +#if __STDC__ +mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c) +#else +mpn_divexact_by3c (dst, src, size, c) + mp_ptr dst; + mp_srcptr src; + mp_size_t size; + mp_limb_t c; +#endif +{ + mp_size_t i; + + ASSERT (size >= 1); + + i = 0; + do + { + mp_limb_t l, s; + + s = src[i]; + l = s - c; + c = (l > s); + + l *= INVERSE_3; + dst[i] = l; + + c += (l > MP_LIMB_T_MAX/3); + c += (l > (MP_LIMB_T_MAX/3)*2); + } + while (++i < size); + + return c; +} diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c new file mode 100644 index 0000000000..30673e76d9 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem.c @@ -0,0 +1,101 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. This is now just a middle layer for calling the new + internal mpn_tdiv_qr. + +Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +#else +mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + if (dn == 1) + { + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; + + TMP_FREE (marker); + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr rp, q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + if (qxn != 0) + { + mp_ptr n2p; + n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn); + MPN_COPY (np, rp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn); + MPN_COPY (np, rp, dn); /* overwrite np area with remainder */ + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE (marker); + return qhl; + } +} diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c new file mode 100644 index 0000000000..e93f241c9d --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_1.c @@ -0,0 +1,248 @@ +/* 
mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. + +Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + + +/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb) + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. */ + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +static mp_limb_t +#if __STDC__ +__gmpn_divmod_1_internal (mp_ptr quot_ptr, + mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb) + mp_ptr quot_ptr; + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* ??? Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... 
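+
+	     (Presumably: when the bits shifted out of the top limb are
+	     zero and the first shifted dividend limb is already smaller
+	     than the shifted divisor, the top quotient limb is known to
+	     be zero, so the first call to udiv_qrnnd_preinv could be
+	     skipped.)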
*/ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i], r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. */ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb); + } + return r; + } +} + + + +mp_limb_t +#if __STDC__ +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, + mp_limb_t d) +#else +mpn_divrem_1 (qp, qxn, np, nn, d) + mp_ptr qp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_limb_t d; +#endif +{ + mp_limb_t rlimb; + mp_size_t i; + + /* Develop integer part of quotient. */ + rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d); + + /* Develop fraction part of quotient. This is not as fast as it should; + the preinvert stuff from __gmpn_divmod_1_internal ought to be used here + too. */ + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, d); + if (normalization_steps != 0) + { + d <<= normalization_steps; + rlimb <<= normalization_steps; + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb >> normalization_steps; + } + else + /* fall out */ + ; + } + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb; +} diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c new file mode 100644 index 0000000000..0bc31ae2e7 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_2.c @@ -0,0 +1,151 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. 
+ + +Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/2) and write + the NSIZE-2 least significant quotient limbs at QP + and the 2 long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= 2. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + 2 >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */ + +mp_limb_t +#if __STDC__ +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp) +#else +mpn_divrem_2 (qp, qxn, np, nsize, dp) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t n1, n0, n2; + mp_limb_t d1, d0; + mp_limb_t d1inv; + int have_preinv; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if (n1 >= d1 && (n1 > d1 || n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + /* If multiplication is much faster than division, preinvert the most + significant divisor limb before entering the loop. */ + if (UDIV_TIME > 2 * UMUL_TIME + 6) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME) + { + invert_limb (d1inv, d1); + have_preinv = 1; + } + } + + for (i = qxn + nsize - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t r; + + if (i >= qxn) + np--; + else + np[0] = 0; + + if (n1 == d1) + { + /* Q should be either 111..111 or 111..110. Need special treatment + of this rare case as normal division would give overflow. */ + q = ~(mp_limb_t) 0; + + r = n0 + d1; + if (r < d1) /* Carry in the addition? */ + { + add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0); + n0 = -d0; + } + else + { + if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv) + udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv); + else + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + + q_test: + if (n1 > r || (n1 == r && n0 > n2)) + { + /* The estimated Q was too large. */ + q--; + + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if (r >= d1) /* If not carry, test Q again. 
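+	     With the divisor normalized (precondition 1 above), the
+	     estimate q exceeds the true quotient limb by at most 2, so
+	     this correction executes at most twice.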
*/ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c new file mode 100644 index 0000000000..66f375c74b --- /dev/null +++ b/rts/gmp/mpn/generic/dump.c @@ -0,0 +1,76 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpn_dump (mp_srcptr ptr, mp_size_t size) +#else +mpn_dump (ptr, size) + mp_srcptr ptr; + mp_size_t size; +#endif +{ + MPN_NORMALIZE (ptr, size); + + if (size == 0) + printf ("0\n"); + else + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0) + { + printf ("%lX", + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%lX", (unsigned long) ptr[size]); + } + else + printf ("%lX", ptr[size]); + + while (size) + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); + } + printf ("\n"); + } +} diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c new file mode 100644 index 0000000000..059e219a06 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd.c @@ -0,0 +1,414 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +/* Integer greatest common divisor of two unsigned integers, using + the accelerated algorithm (see reference below). + + mp_size_t mpn_gcd (up, usize, vp, vsize). + + Preconditions [U = (up, usize) and V = (vp, vsize)]: + + 1. V is odd. + 2. numbits(U) >= numbits(V). + + Both U and V are destroyed by the operation. The result is left at vp, + and its size is returned. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + Refer to + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated + algorithm is used, otherwise the binary algorithm is used. This may be + adjusted for different architectures. */ +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif + +/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated + algorithm reduces using the bmod operation. Otherwise, the k-ary reduction + is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */ +enum + { + BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 + }; + + +/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. + Both U and V must be odd. */ +static __gmp_inline mp_size_t +#if __STDC__ +gcd_2 (mp_ptr vp, mp_srcptr up) +#else +gcd_2 (vp, up) + mp_ptr vp; + mp_srcptr up; +#endif +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t vsize; + + u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1]; + + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0), u0 -= v0; + count_trailing_zeros (r, u0); + u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r; + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0), v0 -= u0; + count_trailing_zeros (r, v0); + v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r; + v1 >>= r; + } + } + + vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return vsize; + + v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; + vp[0] = mpn_gcd_1 (vp, vsize, v0); + + return 1; +} + +/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists + 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB). + In the reference article, D was computed along with N, but it is better to + compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating + the result as a twos' complement signed integer. + + Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference + article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use + 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double + precision. If N2 > N1 initially, the first iteration of the while loop + will swap them. In all other situations, N1 >= N2 is maintained. */ + +static +#if ! defined (__i386__) +__gmp_inline /* don't inline this for the x86 */ +#endif +mp_limb_t +#if __STDC__ +find_a (mp_srcptr cp) +#else +find_a (cp) + mp_srcptr cp; +#endif +{ + unsigned long int leading_zero_bits = 0; + + mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. 
*/ + mp_limb_t n1_h = cp[1]; + + mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */ + mp_limb_t n2_h = ~n1_h; + + /* Main loop. */ + while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ + { + /* N1 <-- N1 % N2. */ + if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0) + { + unsigned long int i; + count_leading_zeros (i, n2_h); + i -= leading_zero_bits, leading_zero_bits += i; + n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i; + do + { + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1; + i -= 1; + } + while (i); + } + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + + MP_LIMB_T_SWAP (n1_h, n2_h); + MP_LIMB_T_SWAP (n1_l, n2_l); + } + + return n2_l; +} + +mp_size_t +#if __STDC__ +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, usize, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t usize; + mp_ptr vp; + mp_size_t vsize; +#endif +{ + mp_ptr orig_vp = vp; + mp_size_t orig_vsize = vsize; + int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + TMP_DECL (marker); + + TMP_MARK (marker); + + /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. + Two EXTRA limbs for U and V are required for kary reduction. */ + if (vsize >= GCD_ACCEL_THRESHOLD) + { + unsigned long int vbitsize, d; + mp_ptr orig_up = up; + mp_size_t orig_usize = usize; + mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); + + MPN_COPY (anchor_up, orig_up, usize); + up = anchor_up; + + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + /* Use bmod reduction to quickly discover whether V divides U. */ + up[usize++] = 0; /* Insert leading zero. */ + mpn_bdivmod (up, up, usize, vp, vsize, d); + + /* Now skip U/V mod 2^d and any low zero limbs. */ + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + while (usize != 0 && up[0] == 0) + up++, usize--; + + if (usize == 0) /* GCD == ORIG_V. */ + goto done; + + vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); + MPN_COPY (vp, orig_vp, vsize); + + do /* Main loop. */ + { + /* mpn_com_n can't be used here because anchor_up and up may + partially overlap */ + if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */ + { + mp_size_t i; + anchor_up[0] = -up[0]; + for (i = 1; i < usize; i++) + anchor_up[i] = ~up[i]; + up = anchor_up; + } + + MPN_NORMALIZE_NOT_ZERO (up, usize); + + if ((up[0] & 1) == 0) /* Result even; remove twos. */ + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (anchor_up, up, usize, r); + usize -= (anchor_up[usize-1] == 0); + } + else if (anchor_up != up) + MPN_COPY_INCR (anchor_up, up, usize); + + MPN_PTR_SWAP (anchor_up,usize, vp,vsize); + up = anchor_up; + + if (vsize <= 2) /* Kary can't handle < 2 limbs and */ + break; /* isn't efficient for == 2 limbs. */ + + d = vbitsize; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + if (d > BMOD_THRESHOLD) /* Bmod reduction. */ + { + up[usize++] = 0; + mpn_bdivmod (up, up, usize, vp, vsize, d); + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + } + else /* Kary reduction. */ + { + mp_limb_t bp[2], cp[2]; + + /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). 
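+		   (This is a two-limb Hensel division: u_inv is the
+		   inverse of up[0] mod 2^BITS_PER_MP_LIMB, cp[0] =
+		   vp[0] * u_inv is the low limb of the quotient, and
+		   cp[1] corrects the high limb using the high half of
+		   cp[0] * up[0] and the cross product cp[0] * up[1].)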
*/ + { + mp_limb_t u_inv, hi, lo; + modlimb_invert (u_inv, up[0]); + cp[0] = vp[0] * u_inv; + umul_ppmm (hi, lo, cp[0], up[0]); + cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv; + } + + /* U <-- find_a (C) * U. */ + up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); + usize++; + + /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). + bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 + + Like V/U above, but simplified because only the low bit of + bp[1] is wanted. */ + { + mp_limb_t v_inv, hi, lo; + modlimb_invert (v_inv, vp[0]); + bp[0] = up[0] * v_inv; + umul_ppmm (hi, lo, bp[0], vp[0]); + bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1; + } + + up[usize++] = 0; + if (bp[1]) /* B < 0: U <-- U + (-B) * V. */ + { + mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]); + mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); + } + else /* B >= 0: U <-- U - B * V. */ + { + mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + } + + up += 2, usize -= 2; /* At least two low limbs are zero. */ + } + + /* Must remove low zero limbs before complementing. */ + while (usize != 0 && up[0] == 0) + up++, usize--; + } + while (usize); + + /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */ + up = orig_up, usize = orig_usize; + binary_gcd_ctr = 2; + } + else + binary_gcd_ctr = 1; + + /* Finish up with the binary algorithm. Executes once or twice. */ + for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) + { + if (usize > 2) /* First make U close to V in size. */ + { + unsigned long int vbitsize, d; + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize - 1; + if (d != -(unsigned long int)1 && d > 2) + { + mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ + d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d; + } + } + + /* Start binary GCD. */ + do + { + mp_size_t zeros; + + /* Make sure U is odd. */ + MPN_NORMALIZE (up, usize); + while (up[0] == 0) + up += 1, usize -= 1; + if ((up[0] & 1) == 0) + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (up, up, usize, r); + usize -= (up[usize-1] == 0); + } + + /* Keep usize >= vsize. */ + if (usize < vsize) + MPN_PTR_SWAP (up, usize, vp, vsize); + + if (usize <= 2) /* Double precision. */ + { + if (vsize == 1) + vp[0] = mpn_gcd_1 (up, usize, vp[0]); + else + vsize = gcd_2 (vp, up); + break; /* Binary GCD done. */ + } + + /* Count number of low zero limbs of U - V. */ + for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) + continue; + + /* If U < V, swap U and V; in any case, subtract V from U. */ + if (zeros == vsize) /* Subtract done. */ + up += zeros, usize -= zeros; + else if (usize == vsize) + { + mp_size_t size = vsize; + do + size--; + while (up[size] == vp[size]); + if (up[size] < vp[size]) /* usize == vsize. */ + MP_PTR_SWAP (up, vp); + up += zeros, usize = size + 1 - zeros; + mpn_sub_n (up, up, vp + zeros, usize); + } + else + { + mp_size_t size = vsize - zeros; + up += zeros, usize -= zeros; + if (mpn_sub_n (up, up, vp + zeros, size)) + { + while (up[size] == 0) /* Propagate borrow. */ + up[size++] = -(mp_limb_t)1; + up[size] -= 1; + } + } + } + while (usize); /* End binary GCD. 
*/ + } + +done: + if (vp != gp) + MPN_COPY (gp, vp, vsize); + TMP_FREE (marker); + return vsize; +} diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c new file mode 100644 index 0000000000..1832636636 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd_1.c @@ -0,0 +1,77 @@ +/* mpn_gcd_1 -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Does not work for U == 0 or V == 0. It would be tough to make it work for + V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */ + +mp_limb_t +#if __STDC__ +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +#else +mpn_gcd_1 (up, size, vlimb) + mp_srcptr up; + mp_size_t size; + mp_limb_t vlimb; +#endif +{ + mp_limb_t ulimb; + unsigned long int u_low_zero_bits, v_low_zero_bits; + + if (size > 1) + { + ulimb = mpn_mod_1 (up, size, vlimb); + if (ulimb == 0) + return vlimb; + } + else + ulimb = up[0]; + + /* Need to eliminate low zero bits. */ + count_trailing_zeros (u_low_zero_bits, ulimb); + ulimb >>= u_low_zero_bits; + + count_trailing_zeros (v_low_zero_bits, vlimb); + vlimb >>= v_low_zero_bits; + + while (ulimb != vlimb) + { + if (ulimb > vlimb) + { + ulimb -= vlimb; + do + ulimb >>= 1; + while ((ulimb & 1) == 0); + } + else /* vlimb > ulimb. */ + { + vlimb -= ulimb; + do + vlimb >>= 1; + while ((vlimb & 1) == 0); + } + } + + return ulimb << MIN (u_low_zero_bits, v_low_zero_bits); +} diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c new file mode 100644 index 0000000000..fe22d779a6 --- /dev/null +++ b/rts/gmp/mpn/generic/gcdext.c @@ -0,0 +1,700 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 17 +#endif + +#ifndef EXTEND +#define EXTEND 1 +#endif + +#if STAT +int arr[BITS_PER_MP_LIMB]; +#endif + + +/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) + + Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the + greatest common divisor at GP (unless it is 0), and the first cofactor at + SP. Write the size of the cofactor through the pointer SSIZE. Return the + size of the value at GP. Note that SP might be a negative number; this is + denoted by storing the negative of the size through SSIZE. + + {UP,USIZE} and {VP,VSIZE} are both clobbered. + + The space allocation for all four areas needs to be USIZE+1. + + Preconditions: 1) U >= V. + 2) V > 0. */ + +/* We use Lehmer's algorithm. The idea is to extract the most significant + bits of the operands, and compute the continued fraction for them. We then + apply the gathered cofactors to the full operands. + + Idea 1: After we have performed a full division, don't shift operands back, + but instead account for the extra factors-of-2 thus introduced. + Idea 2: Simple generalization to use divide-and-conquer would give us an + algorithm that runs faster than O(n^2). + Idea 3: The input numbers need less space as the computation progresses, + while the s0 and s1 variables need more space. To save memory, we + could make them share space, and have the latter variables grow + into the former. + Idea 4: We should not do double-limb arithmetic from the start. Instead, + do things in single-limb arithmetic until the quotients differ, + and then switch to double-limb arithmetic. */ + + +/* Division optimized for small quotients. If the quotient is more than one limb, + store 1 in *qh and return 0. 
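+   The division is done by plain binary shift-and-subtract; in Lehmer's
+   loop below the quotients are almost always very small, so this is
+   cheaper than a general two-limb division.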
*/ +static mp_limb_t +#if __STDC__ +div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0) +#else +div2 (qh, n1, n0, d1, d0) + mp_limb_t *qh; + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d1; + mp_limb_t d0; +#endif +{ + if (d1 == 0) + { + *qh = 1; + return 0; + } + + if ((mp_limb_signed_t) n1 < 0) + { + mp_limb_t q; + int cnt; + for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + cnt--; + } + + *qh = 0; + return q; + } + else + { + mp_limb_t q; + int cnt; + for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + cnt--; + } + + *qh = 0; + return q; + } +} + +mp_size_t +#if EXTEND +#if __STDC__ +mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize) + mp_ptr gp; + mp_ptr s0p; + mp_size_t *s0size; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#else +#if __STDC__ +mpn_gcd (mp_ptr gp, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, size, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#endif +{ + mp_limb_t A, B, C, D; + int cnt; + mp_ptr tp, wp; +#if RECORD + mp_limb_t max = 0; +#endif +#if EXTEND + mp_ptr s1p; + mp_ptr orig_s0p = s0p; + mp_size_t ssize; + int sign = 1; +#endif + int use_double_flag; + TMP_DECL (mark); + + TMP_MARK (mark); + + use_double_flag = (size >= GCDEXT_THRESHOLD); + + tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); +#if EXTEND + s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + + MPN_ZERO (s0p, size); + MPN_ZERO (s1p, size); + + s0p[0] = 1; + s1p[0] = 0; + ssize = 1; +#endif + + if (size > vsize) + { + /* Normalize V (and shift up U the same amount). */ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + /* This is really what it boils down to in this case... */ + s0p[0] = 0; + s1p[0] = 1; + sign = -sign; +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + + for (;;) + { + mp_limb_t asign; + /* Figure out exact size of V. */ + vsize = size; + MPN_NORMALIZE (vp, vsize); + if (vsize <= 1) + break; + + if (use_double_flag) + { + mp_limb_t uh, vh, ul, vl; + /* Let UH,UL be the most significant limbs of U, and let VH,VL be + the corresponding bits from V. 
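+	     Working with two leading limbs of each operand (rather than
+	     one, as in the single-limb branch below) gives more accurate
+	     partial quotients, so the inner loop can run longer before
+	     the cofactors A, B, C, D risk overflowing a limb.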
*/ + uh = up[size - 1]; + vh = vp[size - 1]; + ul = up[size - 2]; + vl = vp[size - 2]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt)); + vl <<= cnt; + ul <<= cnt; + if (size >= 3) + { + ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + } + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t T; + mp_limb_t qh, q1, q2; + mp_limb_t nh, nl, dh, dl; + mp_limb_t t1, t0; + mp_limb_t Th, Tl; + + sub_ddmmss (dh, dl, vh, vl, 0, C); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + add_ssaaaa (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + sub_ddmmss (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + + add_ssaaaa (dh, dl, vh, vl, 0, C); + sub_ddmmss (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + sub_ddmmss (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + else /* Same, but using single-limb calculations. */ + { + mp_limb_t uh, vh; + /* Make UH be the most significant limb of U, and make VH be + corresponding bits from V. */ + uh = up[size - 1]; + vh = vp[size - 1]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t q, T; + if (vh - C == 0 || vh + D == 0) + break; + + q = (uh + A) / (vh - C); + if (q != (uh - B) / (vh + D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + + if (vh - D == 0) + break; + + q = (uh - A) / (vh + C); + if (q != (uh + B) / (vh - D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + +#if RECORD + max = MAX (A, max); max = MAX (B, max); + max = MAX (C, max); max = MAX (D, max); +#endif + + if (B == 0) + { + mp_limb_t qh; + mp_size_t i; + /* This is quite rare. I.e., optimize something else! */ + + /* Normalize V (and shift up U the same amount). 
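+	     Shifting both operands by the same amount leaves the
+	     quotient unchanged while giving the divisor a set high bit;
+	     the shift is undone again right after the division.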
*/ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + qh = mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + MPN_COPY (tp, s0p, ssize); + { + mp_size_t qsize; + + qsize = size - vsize; /* size of stored quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]); + tp[qsize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + abort (); + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]); + tp[ssize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + { + tp[qsize + ssize] = cy; + s1p[qsize + ssize] = 0; + ssize++; + } + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + } + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + else + { +#if EXTEND + mp_size_t tsize, wsize; +#endif + /* T = U*A + V*B + W = U*C + V*D + U = T + V = W */ + +#if STAT + { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); + arr[BITS_PER_MP_LIMB - cnt]++; } +#endif + if (A == 0) + { + /* B == 1 and C == 1 (D is arbitrary) */ + mp_limb_t cy; + MPN_COPY (tp, vp, size); + MPN_COPY (wp, up, size); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + MPN_COPY (tp, s1p, ssize); + tsize = ssize; + tp[ssize] = 0; /* must zero since wp might spill below */ + MPN_COPY (wp, s0p, ssize); + cy = mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + if (asign) + { + mp_limb_t cy; + mpn_mul_1 (tp, vp, size, B); + mpn_submul_1 (tp, up, size, A); + mpn_mul_1 (wp, up, size, C); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s1p, ssize, B); + cy += mpn_addmul_1 (tp, s0p, ssize, A); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s0p, ssize, C); + cy += mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + mp_limb_t cy; + mpn_mul_1 (tp, up, size, A); + mpn_submul_1 (tp, vp, size, B); + mpn_mul_1 (wp, vp, size, D); + mpn_submul_1 (wp, up, size, C); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s0p, ssize, A); + cy += mpn_addmul_1 (tp, s1p, ssize, B); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s1p, ssize, D); + cy += mpn_addmul_1 (wp, s0p, ssize, C); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + } + + size -= up[size - 1] == 0; + } + } + +#if RECORD + printf ("max: %lx\n", max); +#endif + +#if STAT + {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);} +#endif + + if (vsize == 0) + { + if (gp != up 
&& gp != 0) + MPN_COPY (gp, up, size); +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return size; + } + else + { + mp_limb_t vl, ul, t; +#if EXTEND + mp_size_t qsize, i; +#endif + vl = vp[0]; +#if EXTEND + t = mpn_divmod_1 (wp, up, size, vl); + + MPN_COPY (tp, s0p, ssize); + + qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); + tp[qsize + i] = cy; + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); + tp[ssize + i] = cy; + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = mpn_mod_1 (up, size, vl); +#endif + ul = vl; + vl = t; + while (vl != 0) + { + mp_limb_t t; +#if EXTEND + mp_limb_t q; + q = ul / vl; + t = ul - q * vl; + + MPN_COPY (tp, s0p, ssize); + + MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ + + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp, s1p, ssize, q); + tp[ssize] = cy; + } + + ssize += 1; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = ul % vl; +#endif + ul = vl; + vl = t; + } + if (gp != 0) + gp[0] = ul; +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return 1; + } +} diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c new file mode 100644 index 0000000000..a713b61825 --- /dev/null +++ b/rts/gmp/mpn/generic/get_str.c @@ -0,0 +1,216 @@ +/* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR + to a printable string in STR in base BASE. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Convert the limb vector pointed to by MPTR and MSIZE long to a + char array, using base BASE for the result array. Store the + result in the character array STR. STR must point to an array with + space for the largest possible number represented by a MSIZE long + limb vector + 1 extra character. + + The result is NOT in Ascii, to convert it to printable format, add + '0' or 'A' depending on the base and range. + + Return the number of digits in the result string. + This may include some leading zeros. 
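+
+   For instance (assuming 32-bit limbs and base 10), big_base below is
+   10^9 and chars_per_limb is 9, so the general-base loop peels off nine
+   decimal digits of the result per division of the operand by big_base.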
+ + The limb vector pointed to by MPTR is clobbered. */ + +size_t +#if __STDC__ +mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize) +#else +mpn_get_str (str, base, mptr, msize) + unsigned char *str; + int base; + mp_ptr mptr; + mp_size_t msize; +#endif +{ + mp_limb_t big_base; +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + int normalization_steps; +#endif +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb_t big_base_inverted; +#endif + unsigned int dig_per_u; + mp_size_t out_len; + register unsigned char *s; + + big_base = __mp_bases[base].big_base; + + s = str; + + /* Special case zero, as the code below doesn't handle it. */ + if (msize == 0) + { + s[0] = 0; + return 1; + } + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Make conversion from most + significant side. */ + mp_limb_t n1, n0; + register int bits_per_digit = big_base; + register int x; + register int bit_pos; + register int i; + + n1 = mptr[msize - 1]; + count_leading_zeros (x, n1); + + /* BIT_POS should be R when input ends in least sign. nibble, + R + bits_per_digit * n when input ends in n:th least significant + nibble. */ + + { + int bits; + + bits = BITS_PER_MP_LIMB * msize - x; + x = bits % bits_per_digit; + if (x != 0) + bits += bits_per_digit - x; + bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB; + } + + /* Fast loop for bit output. */ + i = msize - 1; + for (;;) + { + bit_pos -= bits_per_digit; + while (bit_pos >= 0) + { + *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1); + bit_pos -= bits_per_digit; + } + i--; + if (i < 0) + break; + n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1); + n1 = mptr[i]; + bit_pos += BITS_PER_MP_LIMB; + *s++ = n0 | (n1 >> bit_pos); + } + + *s = 0; + + return s - str; + } + else + { + /* General case. The base is not a power of 2. Make conversion + from least significant end. */ + + /* If udiv_qrnnd only handles divisors with the most significant bit + set, prepare BIG_BASE for being a divisor by shifting it to the + left exactly enough to set the most significant bit. */ +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + count_leading_zeros (normalization_steps, big_base); + big_base <<= normalization_steps; +#if UDIV_TIME > 2 * UMUL_TIME + /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */ + big_base_inverted = __mp_bases[base].big_base_inverted; +#endif +#endif + + dig_per_u = __mp_bases[base].chars_per_limb; + out_len = ((size_t) msize * BITS_PER_MP_LIMB + * __mp_bases[base].chars_per_bit_exactly) + 1; + s += out_len; + + while (msize != 0) + { + int i; + mp_limb_t n0, n1; + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted BIG_BASE above, shift the dividend too, to get + the right quotient. We need to do this every loop, + since the intermediate quotients are OK, but the quotient from + one turn in the loop is going to be the dividend in the + next turn, and the dividend needs to be up-shifted. */ + if (normalization_steps != 0) + { + n0 = mpn_lshift (mptr, mptr, msize, normalization_steps); + + /* If the shifting gave a carry out limb, store it and + increase the length. */ + if (n0 != 0) + { + mptr[msize] = n0; + msize++; + } + } +#endif + + /* Divide the number at TP with BIG_BASE to get a quotient and a + remainder. The remainder is our new digit in base BIG_BASE. 
*/ + i = msize - 1; + n1 = mptr[i]; + + if (n1 >= big_base) + n1 = 0; + else + { + msize--; + i--; + } + + for (; i >= 0; i--) + { + n0 = mptr[i]; +#if UDIV_TIME > 2 * UMUL_TIME + udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted); +#else + udiv_qrnnd (mptr[i], n1, n1, n0, big_base); +#endif + } + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests) + the remainder will be up-shifted here. Compensate. */ + n1 >>= normalization_steps; +#endif + + /* Convert N1 from BIG_BASE to a string of digits in BASE + using single precision operations. */ + for (i = dig_per_u - 1; i >= 0; i--) + { + *--s = n1 % base; + n1 /= base; + if (n1 == 0 && msize == 0) + break; + } + } + + while (s != str) + *--s = 0; + return out_len; + } +} diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h new file mode 100644 index 0000000000..14bcaece83 --- /dev/null +++ b/rts/gmp/mpn/generic/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c new file mode 100644 index 0000000000..35c10e8450 --- /dev/null +++ b/rts/gmp/mpn/generic/hamdist.c @@ -0,0 +1,94 @@ +/* mpn_hamdist -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. 
*/ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of a mp_limb_t. + You have to figure out how this works, I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go into some trouble to define these constants. + (For mp_limb_t being `long long'.) */ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size) +#else +mpn_hamdist (up, vp, size) + register mp_srcptr up; + register mp_srcptr vp; + register mp_size_t size; +#endif +{ + unsigned long int hamdist; + mp_size_t i; + + hamdist = 0; + for (i = 0; i < size; i++) + hamdist += popc_limb (up[i] ^ vp[i]); + + return hamdist; +} diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c new file mode 100644 index 0000000000..9487e58cf2 --- /dev/null +++ b/rts/gmp/mpn/generic/inlines.c @@ -0,0 +1,24 @@ +/* +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#define _FORCE_INLINES +#define _EXTERN_INLINE /* empty */ +#include "gmp.h" diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c new file mode 100644 index 0000000000..dd437f1ac1 --- /dev/null +++ b/rts/gmp/mpn/generic/jacbase.c @@ -0,0 +1,136 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +#if COUNT_TRAILING_ZEROS_TIME <= 7 +/* If count_trailing_zeros is fast, use it. + K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42 + are not. The default 15 in longlong.h is meant to mean not good here. */ + +#define PROCESS_TWOS_ANY \ + { \ + mp_limb_t twos; \ + count_trailing_zeros (twos, a); \ + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \ + a >>= twos; \ + } + +#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY + +#else +/* Use a loop instead. With "a" uniformly distributed there will usually be + only a few trailing zeros. + + Unfortunately the branch for the while loop here will be on a 50/50 + chance of a 1 or 0, which is bad for branch prediction. */ + +#define PROCESS_TWOS_EVEN \ + { \ + int two; \ + two = JACOBI_TWO_U_BIT1 (b); \ + do \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + while ((a & 1) == 0); \ + } + +#define PROCESS_TWOS_ANY \ + if ((a & 1) == 0) \ + PROCESS_TWOS_EVEN; + +#endif + + +/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but + with a restricted range of inputs accepted, namely b>1, b odd, and a<=b. + + The initial result_bit1 is taken as a parameter for the convenience of + mpz_kronecker_zi_ui() et al. The sign changes both here and in those + routines accumulate nicely in bit 1, see the JACOBI macros. + + The return value here is the normal +1, 0, or -1. Note that +1 and -1 + have bit 1 in the "BIT1" sense, which could be useful if the caller is + accumulating it into some extended calculation. + + Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be + possible, but a couple of tests suggest it's not a significant speedup, + and may even be a slowdown, so what's here is good enough for now. + + Future: The code doesn't demand a<=b actually, so maybe this could be + relaxed. All the places this is used currently call with a<=b though. */ + +int +#if __STDC__ +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1) +#else +mpn_jacobi_base (a, b, result_bit1) + mp_limb_t a; + mp_limb_t b; + int result_bit1; +#endif +{ + ASSERT (b & 1); /* b odd */ + ASSERT (b != 1); + ASSERT (a <= b); + + if (a == 0) + return 0; + + PROCESS_TWOS_ANY; + if (a == 1) + goto done; + + for (;;) + { + result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b); + MP_LIMB_T_SWAP (a, b); + + do + { + /* working on (a/b), a,b odd, a>=b */ + ASSERT (a & 1); + ASSERT (b & 1); + ASSERT (a >= b); + + if ((a -= b) == 0) + return 0; + + PROCESS_TWOS_EVEN; + if (a == 1) + goto done; + } + while (a >= b); + } + + done: + return JACOBI_BIT1_TO_PN (result_bit1); +} diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c new file mode 100644 index 0000000000..0b58389658 --- /dev/null +++ b/rts/gmp/mpn/generic/lshift.c @@ -0,0 +1,87 @@ +/* mpn_lshift -- Shift left low level. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left + and store the USIZE least significant digits of the result at WP. + Return the bits shifted out from the most significant digit. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be >= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_lshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_lshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from high end to low end, to allow specified input/output + overlapping. */ + for (i = usize - 1; i >= 0; i--) + wp[i] = up[i]; + } + return 0; + } +#endif + + wp += 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + i = usize - 1; + low_limb = up[i]; + retval = low_limb >> sh_2; + high_limb = low_limb; + while (--i >= 0) + { + low_limb = up[i]; + wp[i] = (high_limb << sh_1) | (low_limb >> sh_2); + high_limb = low_limb; + } + wp[i] = high_limb << sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c new file mode 100644 index 0000000000..168ec9df49 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1.c @@ -0,0 +1,175 @@ +/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + +Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* Botch: Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (dummy, r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (dummy, r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. 
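+
+      As an illustration (a sketch with made-up numbers): with
+      B = 2^BITS_PER_MP_LIMB the loop below computes
+
+        r = ((...(x[size-1]*B + x[size-2])*B + ...)*B + x[0]) mod d
+
+      one limb at a time, replacing r by (r*B + n0) mod d with a single
+      udiv_qrnnd per limb n0.  For example with B = 10 (decimal "limbs"),
+      {x,3} = {7,3,2} (value 237) and d = 5: r starts at 2, then
+      r = (2*10+3) mod 5 = 3, then r = (3*10+7) mod 5 = 2, which is
+      indeed 237 mod 5.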
*/ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, n0, divisor_limb); + } + return r; + } +} diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c new file mode 100644 index 0000000000..62aaa94b92 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1_rs.c @@ -0,0 +1,111 @@ +/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift. + + THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE + INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. + IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be + changed to 1 temporarily to test the code under that case too. */ +#if 0 +#undef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 1 +#endif + + +/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size + is unchanged, the shift is only for its effect on the remainder. + The shift doesn't even need to be considered until the last limb. + + This function has the normal size!=0 restriction, unlike the basic + mpn_mod_1. 
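+
+   For illustration (hypothetical values): with BITS_PER_MP_LIMB = 32,
+   size = 2, shift = 4 and ptr = {0x12345678, 0x9}, the operand is
+   0x912345678 >> 4 = 0x91234567, so the call returns 0x91234567 % divisor
+   while {ptr,size} itself is left untouched.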
*/ + +mp_limb_t +#if __STDC__ +mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift, + mp_limb_t divisor) +#else +mpn_mod_1_rshift (ptr, size, shift, divisor) + mp_srcptr ptr; + mp_size_t size; + unsigned shift; + mp_limb_t divisor; +#endif +{ + mp_limb_t quot, rem; + + ASSERT (shift >= 1); + ASSERT (shift < BITS_PER_MP_LIMB); + ASSERT (size >= 1); + + if (size == 1) + return (ptr[0] >> shift) % divisor; + +#if UDIV_NEEDS_NORMALIZATION + { + int norm; + int delta; + + count_leading_zeros (norm, divisor); + divisor <<= norm; + + delta = shift - norm; + if (delta == 0) + return mpn_mod_1 (ptr, size, divisor) >> norm; + + if (delta > 0) + { + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> delta, + (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta), + divisor); + return rem >> norm; + } + else + { + rem = mpn_mod_1 (ptr, size, divisor); + udiv_qrnnd (quot, rem, + rem >> (BITS_PER_MP_LIMB+delta), + rem << -delta, + divisor); + return rem >> norm; + } + } + +#else /* !UDIV_NEEDS_NORMALIZATION */ + + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> shift, + (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift), + divisor); + return rem; + +#endif +} diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c new file mode 100644 index 0000000000..cecfa19ca1 --- /dev/null +++ b/rts/gmp/mpn/generic/mul.c @@ -0,0 +1,190 @@ +/* mpn_mul -- Multiply two natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. + + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. + + Argument constraints: + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. 
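+
+   A minimal calling sketch (the buffers and sizes are hypothetical; only
+   the mpn_mul interface itself is as defined below):
+
+     mp_limb_t u[4], v[2], prod[6];               (operands filled in elsewhere)
+     mp_limb_t msl = mpn_mul (prod, u, 4, v, 2);  (un = 4 >= vn = 2)
+
+   after which prod[0..5] holds the 6-limb product and msl == prod[5].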
*/ + +void +#if __STDC__ +mpn_sqr_n (mp_ptr prodp, + mp_srcptr up, mp_size_t un) +#else +mpn_sqr_n (prodp, up, un) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; +#endif +{ + if (un < KARATSUBA_SQR_THRESHOLD) + { /* plain schoolbook multiplication */ + if (un == 0) + return; + mpn_sqr_basecase (prodp, up, un); + } + else if (un < TOOM3_SQR_THRESHOLD) + { /* karatsuba multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_kara_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (un < FFT_SQR_THRESHOLD) +#else + else +#endif + { /* toom3 multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_toom3_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + /* schoenhage multiplication */ + mpn_mul_fft_full (prodp, up, un, up, un); + } +#endif +} + +mp_limb_t +#if __STDC__ +mpn_mul (mp_ptr prodp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +#else +mpn_mul (prodp, up, un, vp, vn) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; + mp_srcptr vp; + mp_size_t vn; +#endif +{ + mp_size_t l; + mp_limb_t c; + + if (up == vp && un == vn) + { + mpn_sqr_n (prodp, up, un); + return prodp[2 * un - 1]; + } + + if (vn < KARATSUBA_MUL_THRESHOLD) + { /* long multiplication */ + mpn_mul_basecase (prodp, up, un, vp, vn); + return prodp[un + vn - 1]; + } + + mpn_mul_n (prodp, up, vp, vn); + if (un != vn) + { mp_limb_t t; + mp_ptr ws; + TMP_DECL (marker); + TMP_MARK (marker); + + prodp += vn; + l = vn; + up += vn; + un -= vn; + + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + + ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn) + * BYTES_PER_MP_LIMB); + + t = 0; + while (vn >= KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_n (ws, up, vp, vn); + if (l <= 2*vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != 2*vn) + { + t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t); + l = 2*vn; + } + } + else + { + c = mpn_add_n (prodp, prodp, ws, 2*vn); + t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c); + } + prodp += vn; + l -= vn; + up += vn; + un -= vn; + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + } + + if (vn) + { + mpn_mul_basecase (ws, up, un, vp, vn); + if (l <= un + vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != un + vn) + t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t); + } + else + { + c = mpn_add_n (prodp, prodp, ws, un + vn); + t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c); + } + } + + TMP_FREE (marker); + } + return prodp[un + vn - 1]; +} diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c new file mode 100644 index 0000000000..1c36b5fb1f --- /dev/null +++ b/rts/gmp/mpn/generic/mul_1.c @@ -0,0 +1,59 @@ +/* mpn_mul_1 -- Multiply a limb vector with a single limb and + store the product in a second limb vector. + +Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + + /* The loop counter and index J goes from -S1_SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + res_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c new file mode 100644 index 0000000000..00c06aa5c4 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_basecase.c @@ -0,0 +1,87 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Handle simple cases with traditional multiplication. + + This is the most critical code of multiplication. All multiplies rely on + this, both small and huge. Small ones arrive here immediately, huge ones + arrive here as this is the base case for Karatsuba's recursive algorithm. */ + +void +#if __STDC__ +mpn_mul_basecase (mp_ptr prodp, + mp_srcptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize) +#else +mpn_mul_basecase (prodp, up, usize, vp, vsize) + mp_ptr prodp; + mp_srcptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; +#endif +{ + /* We first multiply by the low order one or two limbs, as the result can + be stored, not added, to PROD. We also avoid a loop for zeroing this + way. 
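+
+     Schoolbook illustration (a sketch of what the generic path below
+     does): with B = 2^BITS_PER_MP_LIMB, usize = 2, vsize = 2,
+     u = u1*B + u0 and v = v1*B + v0, the code first stores u*v0 into
+     prod[0..2] with mpn_mul_1 (so no prior zeroing of PROD is needed)
+     and then adds u*v1, shifted up one limb, with mpn_addmul_1, giving
+     u*v = u*v0 + u*v1*B in prod[0..3].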
*/ +#if HAVE_NATIVE_mpn_mul_2 + if (vsize >= 2) + { + prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + else + { + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + return; + } +#else + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vsize >= 2) + { + prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + if (vsize != 0) + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); +#else + /* For each iteration in the loop, multiply U with one limb from V, and + add the result to PROD. */ + while (vsize != 0) + { + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; + } +#endif +} diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c new file mode 100644 index 0000000000..00fd6d72de --- /dev/null +++ b/rts/gmp/mpn/generic/mul_fft.c @@ -0,0 +1,772 @@ +/* An implementation in GMP of Scho"nhage's fast multiplication algorithm + modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE + MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED + INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN + A FUTURE GNU MP RELEASE. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +/* References: + + Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker + Strassen, Computing 7, p. 281-292, 1971. + + Asymptotically fast algorithms for the numerical multiplication + and division of polynomials with complex coefficients, by Arnold Scho"nhage, + Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. + + Tapes versus Pointers, a study in implementing fast algorithms, + by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986. + + See also http://www.loria.fr/~zimmerma/bignum + + + Future: + + K==2 isn't needed in the current uses of this code and the bits specific + for that could be dropped. + + It might be possible to avoid a small number of MPN_COPYs by using a + rotating temporary or two. + + Multiplications of unequal sized operands can be done with this code, but + it needs a tighter test for identifying squaring (same sizes as well as + same pointers). */ + + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + + +/* Change this to "#define TRACE(x) x" for some traces. 
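+   With tracing enabled, the TRACE calls further down print the chosen
+   parameters, e.g. the "mpn_mul_fft pl=... nl=... ml=... k=..." line at
+   the top of mpn_mul_fft and the per-level recursion sizes.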
*/ +#define TRACE(x) + + + +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = { + FFT_MUL_TABLE, + FFT_SQR_TABLE +}; + + +static void mpn_mul_fft_internal +_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l, + mp_limb_t *T, int rec)); + + +/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT. + sqr==0 if for a multiply, sqr==1 for a square */ +int +#if __STDC__ +mpn_fft_best_k (mp_size_t n, int sqr) +#else +mpn_fft_best_k (n, sqr) + mp_size_t n; + int sqr; +#endif +{ + mp_size_t t; + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4*mpn_fft_table[sqr][i-1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k. + FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */ + +mp_size_t +#if __STDC__ +mpn_fft_next_size (mp_size_t pl, int k) +#else +mpn_fft_next_size (pl, k) + mp_size_t pl; + int k; +#endif +{ + mp_size_t N, M; + int K; + + /* if (k==0) k = mpn_fft_best_k (pl, sqr); */ + N = pl*BITS_PER_MP_LIMB; + K = 1<<k; + if (N%K) N=(N/K+1)*K; + M = N/K; + if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K; + return (N/BITS_PER_MP_LIMB); +} + + +static void +#if __STDC__ +mpn_fft_initl(int **l, int k) +#else +mpn_fft_initl(l, k) + int **l; + int k; +#endif +{ + int i,j,K; + + l[0][0] = 0; + for (i=1,K=2;i<=k;i++,K*=2) { + for (j=0;j<K/2;j++) { + l[i][j] = 2*l[i-1][j]; + l[i][K/2+j] = 1+l[i][j]; + } + } +} + + +/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n) +#else +mpn_fft_neg_modF(ap, n) + mp_limb_t *ap; + mp_size_t n; +#endif +{ + mp_limb_t c; + + c = ap[n]+2; + mpn_com_n (ap, ap, n); + ap[n]=0; mpn_incr_u(ap, c); +} + + +/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_mul_2exp_modF(ap, e, n, tp) + mp_limb_t *ap; + int e; + mp_size_t n; + mp_limb_t *tp; +#endif +{ + int d, sh, i; mp_limb_t cc; + + d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */ + sh = d % BITS_PER_MP_LIMB; + if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */ + else MPN_COPY(tp, ap, n+1); + d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */ + if (d) { + /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */ + /* mpn_xor would be more efficient here */ + for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i]; + cc = 1-mpn_add_1(ap, ap, d, 1); + if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1); + else MPN_COPY(ap+d, tp, n-d); + if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n])) + ap[n]=mpn_add_1(ap, ap, n, cc); + else ap[n]=0; + } + else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) { + ap[n]=mpn_add_1(ap, ap, n, 1); + } + if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n); +} + + +/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n) +#else +mpn_fft_add_modF (ap, bp, n) + mp_limb_t *ap,*bp; + int n; +#endif +{ + mp_limb_t c; + + c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n); + if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1); + ap[n]=c; +} + + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp) +mp_limb_t **Ap,*tp; +mp_size_t K,omega,n,inc; +int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk = *ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp) + mp_limb_t **Ap,**Bp,*tp; + mp_size_t K,omega,n,inc; + int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); +#ifdef ADDSUB + if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1) +#else + MPN_COPY(tp, Bp[0], n+1); + mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1); + if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1)) +#endif + Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk=*ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + MPN_COPY(tp, Bp[inc], n+1); + mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Bp[inc], Bp[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Bp[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */ +static void +#if __STDC__ +mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K) +#else +mpn_fft_mul_modF_K(ap, bp, n, K) + mp_limb_t **ap, **bp; + mp_size_t n; + int K; +#endif +{ + int i; + int sqr = (ap == 
bp); + TMP_DECL(marker); + + TMP_MARK(marker); + + if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2; + int **_fft_l; + mp_limb_t **Ap,**Bp,*A,*B,*T; + + k = mpn_fft_best_k (n, sqr); + K2 = 1<<k; + maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB; + M2 = n*BITS_PER_MP_LIMB/K2; + l = n/K2; + Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2/BITS_PER_MP_LIMB; + Mp2 = Nprime2/K2; + + Ap = TMP_ALLOC_MP_PTRS (K2); + Bp = TMP_ALLOC_MP_PTRS (K2); + A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1)); + T = TMP_ALLOC_LIMBS (nprime2+1); + B = A + K2*(nprime2+1); + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + + TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + + for (i=0;i<K;i++,ap++,bp++) + mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2, + l, Mp2, _fft_l, T, 1); + } + else { + mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n; + tp = TMP_ALLOC_LIMBS (n2); + tpn = tp+n; + TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n)); + for (i=0;i<K;i++) { + a = *ap++; b=*bp++; + if (sqr) + mpn_sqr_n(tp, a, n); + else + mpn_mul_n(tp, b, a, n); + if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0; + if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n]; + if (cc) { + cc = mpn_add_1(tp, tp, n2, cc); + ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc)); + } + a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1); + } + } + TMP_FREE(marker); +} + + +/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] + output: K*A[0] K*A[K-1] ... K*A[1] */ + +static void +#if __STDC__ +mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n, + mp_limb_t *tp) +#else +mpn_fft_fftinv(Ap,K,omega,n,tp) + mp_limb_t **Ap, *tp; + int K; + mp_size_t omega, n; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[1], n+1); + if (mpn_sub_n(Ap[1], tp, Ap[1], n+1)) +#endif + Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1); + } + else { + int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fftinv(Ap, K2, 2*omega, n, tp); + mpn_fft_fftinv(Bp, K2, 2*omega, n, tp); + /* A[j] <- A[j] + omega^j A[j+K/2] + A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ + for (j=0;j<K2;j++,Ap++,Bp++) { + MPN_COPY(tp, Bp[0], n+1); + mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp); + mpn_fft_add_modF(Bp[0], Ap[0], n); + mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_div_2exp_modF(ap,k,n,tp) + mp_limb_t *ap,*tp; + int k; + mp_size_t n; +#endif +{ + int i; + + i = 2*n*BITS_PER_MP_LIMB; + i = (i-k) % i; + mpn_fft_mul_2exp_modF(ap,i,n,tp); + /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */ + /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */ + if (ap[n]==1) { + for (i=0;i<n && ap[i]==0;i++); + if (i<n) { + ap[n]=0; + mpn_sub_1(ap, ap, n, 1); + } + } +} + + +/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */ +static void +#if __STDC__ +mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an) +#else +mpn_fft_norm_modF(rp, ap, n, an) + mp_limb_t *rp; + mp_limb_t *ap; + mp_size_t n; 
+ mp_size_t an; +#endif +{ + mp_size_t l; + + if (an>2*n) { + l = n; + rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an, + mpn_add_n(rp,ap,ap+2*n,an-2*n)); + } + else { + l = an-n; + MPN_COPY(rp, ap, n); + rp[n]=0; + } + if (mpn_sub_n(rp,rp,ap+n,l)) { + if (mpn_sub_1(rp+l,rp+l,n+1-l,1)) + rp[n]=mpn_add_1(rp,rp,n,1); + } +} + + +static void +#if __STDC__ +mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **_fft_l, + mp_limb_t *T, int rec) +#else +mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec) + mp_limb_t *op; + mp_srcptr n, m; + mp_limb_t **Ap,**Bp,*A,*B,*T; + mp_size_t pl,nprime; + int **_fft_l; + int k,K,l,Mp,rec; +#endif +{ + int i, sqr, pla, lo, sh, j; + mp_limb_t *p; + + sqr = (n==m); + + TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n", + pl,k,K,nprime,l,Mp,rec,sqr)); + + /* decomposition of inputs into arrays Ap[i] and Bp[i] */ + if (rec) for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l); + /* set most significant bits of n and m (important in recursive calls) */ + if (i==K-1) Ap[i][l]=n[0]; + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + if (!sqr) { + MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l); + if (i==K-1) Bp[i][l]=m[0]; + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + } + + /* direct fft's */ + if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T); + else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T); + + /* term to term multiplications */ + mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K); + + /* inverse fft's */ + mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T); + + /* division of terms after inverse fft */ + for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T); + + /* addition of terms in result p */ + MPN_ZERO(T,nprime+1); + pla = l*(K-1)+nprime+1; /* number of required limbs for p */ + p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */ + MPN_ZERO(p, pla); + sqr=0; /* will accumulate the (signed) carry at p[pla] */ + for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) { + mp_ptr n = p+sh; + j = (K-i)%K; + if (mpn_add_n(n,n,Ap[j],nprime+1)) + sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1); + T[2*l]=i+1; /* T = (i+1)*2^(2*M) */ + if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */ + sqr -= mpn_sub_1(n,n,pla-sh,1); + sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1); + } + } + if (sqr==-1) { + if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1); + mpn_sub_1(p+pla-1,p+pla-1,1,1); + } + } + else if (sqr==1) { + if (pla>=2*pl) + while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr))); + else { + sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr); + ASSERT (sqr == 0); + } + } + else + ASSERT (sqr == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + mpn_fft_norm_modF(op,p,pl,pla); +} + + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB + n and m have respectively nl and ml limbs + op must have space for pl+1 limbs + One must have pl = mpn_fft_next_size(pl, k). 
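+
+   A rough usage sketch (hypothetical caller; the helpers are the ones
+   defined above in this file):
+
+     int k = mpn_fft_best_k (nl + ml, 0);            (0 = multiply, 1 = square)
+     mp_size_t pl = mpn_fft_next_size (nl + ml, k);
+     ... allocate op with pl+1 limbs ...
+     mpn_mul_fft (op, pl, n, nl, m, ml, k);
+
+   which is essentially what mpn_mul_fft_full below does before copying
+   out the low nl+ml limbs of the padded result.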
+*/ + +void +#if __STDC__ +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +#else +mpn_mul_fft (op, pl, n, nl, m, ml, k) + mp_ptr op; + mp_size_t pl; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; + int k; +#endif +{ + int K,maxLK,i,j; + mp_size_t N,Nprime,nprime,M,Mp,l; + mp_limb_t **Ap,**Bp,*A,*T,*B; + int **_fft_l; + int sqr = (n==m && nl==ml); + TMP_DECL(marker); + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", + pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl); + + TMP_MARK(marker); + N = pl*BITS_PER_MP_LIMB; + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + K = 1<<k; + M = N/K; /* N = 2^k M */ + l = M/BITS_PER_MP_LIMB; + maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB; + + Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime/BITS_PER_MP_LIMB; + TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n", + N, K, M, l, maxLK, Nprime, nprime)); + if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB; + if (Nprime % maxLK) { + Nprime=((Nprime/maxLK)+1)*maxLK; + nprime = Nprime/BITS_PER_MP_LIMB; + } + TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime)); + } + + T = TMP_ALLOC_LIMBS (nprime+1); + Mp = Nprime/K; + + TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n", + pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K); + printf(" temp space %ld\n", 2*K*(nprime+1))); + + A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1)); + B = A+K*(nprime+1); + Ap = TMP_ALLOC_MP_PTRS (K); + Bp = TMP_ALLOC_MP_PTRS (K); + /* special decomposition for main call */ + for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + if (nl>0) { + j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */ + MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + } + else MPN_ZERO(Ap[i], nprime+1); + nl -= l; + if (n!=m) { + if (ml>0) { + j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */ + MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + else MPN_ZERO(Bp[i], nprime+1); + } + ml -= l; + } + mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0); + TMP_FREE(marker); + _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1)); +} + + +#if WANT_ASSERT +static int +#if __STDC__ +mpn_zero_p (mp_ptr p, mp_size_t n) +#else + mpn_zero_p (p, n) + mp_ptr p; + mp_size_t n; +#endif +{ + mp_size_t i; + + for (i = 0; i < n; i++) + { + if (p[i] != 0) + return 0; + } + + return 1; +} +#endif + + +/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}. + + FIXME: Duplicating the result like this is wasteful, do something better + perhaps at the norm_modF stage above. 
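+
+   Since pl = mpn_fft_next_size (nl+ml, k) is at least nl+ml, the true
+   product fits below 2^(pl*BITS_PER_MP_LIMB) and the reduction mod
+   2^(pl*BITS_PER_MP_LIMB)+1 cannot wrap around; this is why the limbs of
+   pad_op above position nl+ml are expected to be zero, as the ASSERT in
+   the function checks.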
*/ + +void +#if __STDC__ +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +#else +mpn_mul_fft_full (op, n, nl, m, ml) + mp_ptr op; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; +#endif +{ + mp_ptr pad_op; + mp_size_t pl; + int k; + int sqr = (n==m && nl==ml); + + k = mpn_fft_best_k (nl+ml, sqr); + pl = mpn_fft_next_size (nl+ml, k); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n", + nl, ml, pl, k)); + + pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1); + mpn_mul_fft (pad_op, pl, n, nl, m, ml, k); + + ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml))); + MPN_COPY (op, pad_op, nl+ml); + + _MP_FREE_FUNC_LIMBS (pad_op, pl+1); +} diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c new file mode 100644 index 0000000000..b7563be2d3 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_n.c @@ -0,0 +1,1343 @@ +/* mpn_mul_n and helper function -- Multiply/square natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + +#if !defined (__alpha) && !defined (__mips) +/* For all other machines, we want to call mpn functions for the compund + operations instead of open-coding them. */ +#define USE_MORE_MPN +#endif + +/*== Function declarations =================================================*/ + +static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); +static void interpolate3 _PROTO ((mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); +static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); + + +/*-- mpn_kara_mul_n ---------------------------------------------------------------*/ + +/* Multiplies using 3 half-sized mults and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. 
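+ *
+ * Sketch of the underlying identity (even-length case): split
+ * a = a1*W + a0 and b = b1*W + b0 with W = 2^(n2*BITS_PER_MP_LIMB).  Then
+ *
+ *   a*b = a1*b1*W^2 + (a1*b1 + a0*b0 - (a1-a0)*(b1-b0))*W + a0*b0
+ *
+ * so only three half-sized products are needed: a1*b1, a0*b0 and
+ * (a1-a0)*(b1-b0).  The "sign" flag in the code records whether the two
+ * differences had opposite signs, i.e. whether that third product must be
+ * added rather than subtracted, since only absolute differences are formed.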
+ */ + +void +#if __STDC__ +mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_kara_mul_n(p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. */ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = b[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, b, b + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n3; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + if (n3 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n3, p + n3, n3); + mpn_mul_basecase (p, a, n3, b, n3); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + } + mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n2; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n2, p + n2, n2); + mpn_mul_basecase (p, a, n2, b, n2); + mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); + mpn_kara_mul_n (p, a, b, n2, ws + n); + mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +void +#if __STDC__ +mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_kara_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. 
*/ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + if (n3 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + } + mpn_sqr_basecase (p + n1, a + n3, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n2; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (p + n, a + n2, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +/*-- add2Times -------------------------------------------------------------*/ + +/* z[] = x[] + 2 * y[] + Note that z and x might point to the same vectors. 
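+
+   For illustration (made-up values): with BITS_PER_MP_LIMB = 32, n = 1,
+   x = {5} and y = {7} the result is z = {19} and the return value is 0.
+   In general the return value is the carry out of the top limb; it can be
+   0, 1 or 2 since both the doubling of y and the addition can carry.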
*/ +#ifdef USE_MORE_MPN +static inline mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_ptr t; + mp_limb_t c; + TMP_DECL (marker); + TMP_MARK (marker); + t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + c = mpn_lshift (t, y, n, 1); + c += mpn_add_n (z, x, t, n); + TMP_FREE (marker); + return c; +} +#else + +static mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_limb_t c, v, w; + + ASSERT (n > 0); + v = *x; w = *y; + c = w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + while (--n) + { + v = *x; + w = *y; + v += c; + c = v < c; + c += w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + } + + return c; +} +#endif + +/*-- evaluate3 -------------------------------------------------------------*/ + +/* Evaluates: + * ph := 4*A+2*B+C + * p1 := A+B+C + * p2 := A+2*B+4*C + * where: + * ph[], p1[], p2[], A[] and B[] all have length len, + * C[] has length len2 with len-len2 = 0, 1 or 2. + * Returns top words (overflow) at pth, pt1 and pt2 respectively. + */ +#ifdef USE_MORE_MPN +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, len, len2) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_limb_t c, d, e; + + ASSERT (len - len2 <= 2); + + e = mpn_lshift (p1, B, len, 1); + + c = mpn_lshift (ph, A, len, 2); + c += e + mpn_add_n (ph, ph, p1, len); + d = mpn_add_n (ph, ph, C, len2); + if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + ASSERT (c < 7); + *pth = c; + + c = mpn_lshift (p2, C, len2, 2); +#if 1 + if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + c += e + mpn_add_n (p2, p2, p1, len); +#else + d = mpn_add_n (p2, p2, p1, len2); + c += d; + if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + c += e; +#endif + c += mpn_add_n (p2, p2, A, len); + ASSERT (c < 7); + *pt2 = c; + + c = mpn_add_n (p1, A, B, len); + d = mpn_add_n (p1, p1, C, len2); + if (len2 == len) c += d; + else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + ASSERT (c < 3); + *pt1 = c; + +} + +#else + +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, l, ls) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; + + ASSERT (l - ls <= 2); + + th = t1 = t2 = 0; + for (i = 0; i < l; ++i) + { + a = *A; + b = *B; + c = i < ls ? *C : 0; + + /* TO DO: choose one of the following alternatives. 
*/ +#if 0 + t = a << 2; + vh = th + t; + th = vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + vh += c; + th += vh < c; +#else + vh = th + c; + th = vh < c; + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + t = a << 2; + vh += t; + th += vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); +#endif + + v1 = t1 + a; + t1 = v1 < a; + v1 += b; + t1 += v1 < b; + v1 += c; + t1 += v1 < c; + + v2 = t2 + a; + t2 = v2 < a; + t = b << 1; + v2 += t; + t2 += v2 < t; + t2 += b >> (BITS_PER_MP_LIMB - 1); + t = c << 2; + v2 += t; + t2 += v2 < t; + t2 += c >> (BITS_PER_MP_LIMB - 2); + + *ph = vh; + *p1 = v1; + *p2 = v2; + + ++A; ++B; ++C; + ++ph; ++p1; ++p2; + } + + ASSERT (th < 7); + ASSERT (t1 < 3); + ASSERT (t2 < 7); + + *pth = th; + *pt1 = t1; + *pt2 = t2; +} +#endif + + +/*-- interpolate3 ----------------------------------------------------------*/ + +/* Interpolates B, C, D (in-place) from: + * 16*A+8*B+4*C+2*D+E + * A+B+C+D+E + * A+2*B+4*C+8*D+16*E + * where: + * A[], B[], C[] and D[] all have length l, + * E[] has length ls with l-ls = 0, 2 or 4. + * + * Reads top words (from earlier overflow) from ptb, ptc and ptd, + * and returns new top words there. + */ + +#ifdef USE_MORE_MPN +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, len, len2) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_ptr ws; + mp_limb_t t, tb,tc,td; + TMP_DECL (marker); + TMP_MARK (marker); + + ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); + + tb = *ptb; tc = *ptc; td = *ptd; + + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + + t = mpn_lshift (ws, A, len, 4); + tb -= t + mpn_sub_n (B, B, ws, len); + t = mpn_sub_n (B, B, E, len2); + if (len2 == len) tb -= t; + else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + + tc -= mpn_sub_n (C, C, A, len); + t = mpn_sub_n (C, C, E, len2); + if (len2 == len) tc -= t; + else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + + t = mpn_lshift (ws, E, len2, 4); + t += mpn_add_n (ws, ws, A, len2); +#if 1 + if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + td -= t + mpn_sub_n (D, D, ws, len); +#else + t += mpn_sub_n (D, D, ws, len2); + if (len2 != len) { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } /* end if/else */ + td -= t; +#endif + + + /* b, d := b + d, b - d */ + +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... */ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* b := b-8*c */ + t = 8 * tc + mpn_lshift (ws, C, len, 3); + tb -= t + mpn_sub_n (B, B, ws, len); + + /* c := 2*c - b */ + tc = 2 * tc + mpn_lshift (C, C, len, 1); + tc -= tb + mpn_sub_n (C, C, B, len); + + /* d := d/3 */ + td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + + /* b, d := b + d, b - d */ +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... 
*/ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + ASSERT(!(*B & 3)); + mpn_rshift (B, B, len, 2); + B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); + ASSERT((long)tb >= 0); + tb >>= 2; + + ASSERT(!(*C & 1)); + mpn_rshift (C, C, len, 1); + C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); + ASSERT((long)tc >= 0); + tc >>= 1; + + ASSERT(!(*D & 3)); + mpn_rshift (D, D, len, 2); + D[len-1] |= td<<(BITS_PER_MP_LIMB-2); + ASSERT((long)td >= 0); + td >>= 2; + +#if WANT_ASSERT + ASSERT (tb < 2); + if (len == len2) + { + ASSERT (tc < 3); + ASSERT (td < 2); + } + else + { + ASSERT (tc < 2); + ASSERT (!td); + } +#endif + + *ptb = tb; + *ptc = tc; + *ptd = td; + + TMP_FREE (marker); +} + +#else + +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, l, ls) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; + const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); + +#if WANT_ASSERT + t = l - ls; + ASSERT (t == 0 || t == 2 || t == 4); +#endif + + sb = sc = sd = 0; + for (i = 0; i < l; ++i) + { + mp_limb_t tb, tc, td, tt; + + a = *A; + b = *B; + c = *C; + d = *D; + e = i < ls ? *E : 0; + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + t = a << 4; + tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); + b -= t; + tb -= b < e; + b -= e; + tc = -(c < a); + c -= a; + tc -= c < e; + c -= e; + td = -(d < a); + d -= a; + t = e << 4; + td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); + d -= t; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* b := b-8*c */ + t = c << 3; + tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); + b -= t; + + /* c := 2*c - b */ + t = c << 1; + tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); + c = t - b; + + /* d := d/3 */ + d *= INVERSE_3; + td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); + td *= INVERSE_3; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + /* sb has period 2. */ + b += sb; + tb += b < sb; + sb &= maskOffHalf; + sb |= sb >> (BITS_PER_MP_LIMB >> 1); + sb += tb; + + /* sc has period 1. */ + c += sc; + tc += c < sc; + /* TO DO: choose one of the following alternatives. */ +#if 1 + sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc += tc; +#else + sc = tc - ((long)sc < 0L); +#endif + + /* sd has period 2. */ + d += sd; + td += d < sd; + sd &= maskOffHalf; + sd |= sd >> (BITS_PER_MP_LIMB >> 1); + sd += td; + + if (i != 0) + { + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + } + ob = b >> 2; + oc = c >> 1; + od = d >> 2; + + ++A; ++B; ++C; ++D; ++E; + } + + /* Handle top words. 
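+
+     These are the overflow words accumulated in ptb, ptc and ptd; they go
+     through the same b,d := b+d,b-d / b := b-8*c / c := 2*c-b / d := d/3
+     steps as the in-vector limbs above, before the pending carries sb, sc
+     and sd are folded in.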
*/ + b = *ptb; + c = *ptc; + d = *ptd; + + t = b + d; + d = b - d; + b = t; + b -= c << 3; + c = (c << 1) - b; + d *= INVERSE_3; + t = b + d; + d = b - d; + b = t; + + b += sb; + c += sc; + d += sd; + + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + + b >>= 2; + c >>= 1; + d >>= 2; + +#if WANT_ASSERT + ASSERT (b < 2); + if (l == ls) + { + ASSERT (c < 3); + ASSERT (d < 2); + } + else + { + ASSERT (c < 2); + ASSERT (!d); + } +#endif + + *ptb = b; + *ptc = c; + *ptd = d; +} +#endif + + +/*-- mpn_toom3_mul_n --------------------------------------------------------------*/ + +/* Multiplies using 5 mults of one third size and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. + */ + +/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. + */ + +#define TOOM3_MUL_REC(p, a, b, n, ws) \ + do { \ + if (n < KARATSUBA_MUL_THRESHOLD) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (n < TOOM3_MUL_THRESHOLD) \ + mpn_kara_mul_n (p, a, b, n, ws); \ + else \ + mpn_toom3_mul_n (p, a, b, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_mul_n (p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_MUL_REC(D, C, C + l, l, W); + tD = cD*dD; + if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); + if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); + ASSERT (tD < 49); + TOOM3_MUL_REC(C, B, B + l, l, W); + tC = cC*dC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); + if (dC) tC += mpn_addmul_1 (C + l, B, l, dC); +#else + if (cC) + { + if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); + else tC += add2Times (C + l, C + l, B + l, l); + } + if (dC) + { + if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); + else tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_MUL_REC(B, A, A + l, l, W); + tB = cB*dB; + if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); + if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); + ASSERT (tB < 49); + TOOM3_MUL_REC(A, a, b, l, W); + TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. 
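+
+      Writing W = 2^(l*BITS_PER_MP_LIMB), the product is
+      A + B*W + C*W^2 + D*W^3 + E*W^4.  A, C and E already sit in p at limb
+      offsets 0, 2l and 4l, so only B and D need to be added in, at offsets
+      l and 3l, with the overflow words tB, tC and tD propagated as carries
+      at offsets 3l, 4l and 5l.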
**/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ + +/* Like previous function but for squaring */ + +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (n < KARATSUBA_SQR_THRESHOLD) \ + mpn_sqr_basecase (p, a, n); \ + else if (n < TOOM3_SQR_THRESHOLD) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_SQR_REC(D, C, l, W); + tD = cD*cD; + if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); + ASSERT (tD < 49); + TOOM3_SQR_REC(C, B, l, W); + tC = cC*cC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC); +#else + if (cC >= 1) + { + tC += add2Times (C + l, C + l, B, l); + if (cC == 2) + tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_SQR_REC(B, A, l, W); + tB = cB*cB; + if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); + ASSERT (tB < 49); + TOOM3_SQR_REC(A, a, l, W); + TOOM3_SQR_REC(E, a + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. **/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +void +#if __STDC__ +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) +#else +mpn_mul_n (p, a, b, n) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; +#endif +{ + if (n < KARATSUBA_MUL_THRESHOLD) + mpn_mul_basecase (p, a, n, b, n); + else if (n < TOOM3_MUL_THRESHOLD) + { + /* Allocate workspace of fixed size on stack: fast! */ +#if TUNE_PROGRAM_BUILD + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; +#else + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; +#endif + mpn_kara_mul_n (p, a, b, n, ws); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (n < FFT_MUL_THRESHOLD) +#else + else +#endif + { + /* Use workspace of unknown size in heap, as stack space may + * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * multiplication will take much longer than malloc()/free(). 
*/ + mp_limb_t wsLen, *ws; + wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; + ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + mpn_toom3_mul_n (p, a, b, n, ws); + (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + mpn_mul_fft_full (p, a, n, b, n); + } +#endif +} diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c new file mode 100644 index 0000000000..42ee3405d7 --- /dev/null +++ b/rts/gmp/mpn/generic/perfsqr.c @@ -0,0 +1,123 @@ +/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, + zero otherwise. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue + modulo 0x100. */ +static unsigned char const sq_res_0x100[0x100] = +{ + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, +}; + +int +#if __STDC__ +mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) +#else +mpn_perfect_square_p (up, usize) + mp_srcptr up; + mp_size_t usize; +#endif +{ + mp_limb_t rem; + mp_ptr root_ptr; + int res; + TMP_DECL (marker); + + /* The first test excludes 55/64 (85.9%) of the perfect square candidates + in O(1) time. */ + if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0) + return 0; + +#if defined (PP) + /* The second test excludes 30652543/30808063 (99.5%) of the remaining + perfect square candidates in O(n) time. */ + + /* Firstly, compute REM = A mod PP. */ + if (UDIV_TIME > (2 * UMUL_TIME + 6)) + rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED); + else + rem = mpn_mod_1 (up, usize, (mp_limb_t) PP); + + /* Now decide if REM is a quadratic residue modulo the factors in PP. */ + + /* If A is just a few limbs, computing the square root does not take long + time, so things might run faster if we limit this loop according to the + size of A. 
*/ + +#if BITS_PER_MP_LIMB == 64 + if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0) + return 0; + if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0) + return 0; + if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0) + return 0; + if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0) + return 0; + if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0) + return 0; + if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0) + return 0; +#endif + if (((0x13D122F3L >> rem % 29) & 1) == 0) + return 0; + if (((0x5335FL >> rem % 23) & 1) == 0) + return 0; + if (((0x30AF3L >> rem % 19) & 1) == 0) + return 0; + if (((0x1A317L >> rem % 17) & 1) == 0) + return 0; + if (((0x161BL >> rem % 13) & 1) == 0) + return 0; + if (((0x23BL >> rem % 11) & 1) == 0) + return 0; + if (((0x017L >> rem % 7) & 1) == 0) + return 0; + if (((0x13L >> rem % 5) & 1) == 0) + return 0; + if (((0x3L >> rem % 3) & 1) == 0) + return 0; +#endif + + TMP_MARK (marker); + + /* For the third and last test, we finally compute the square root, + to make sure we've really got a perfect square. */ + root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB); + + /* Iff mpn_sqrtrem returns zero, the square is perfect. */ + res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); + TMP_FREE (marker); + return res; +} diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c new file mode 100644 index 0000000000..387be9536d --- /dev/null +++ b/rts/gmp/mpn/generic/popcount.c @@ -0,0 +1,93 @@ +/* popcount.c + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. */ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of a mp_limb_t. + You have to figure out how this works, I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go into some trouble to define these constants. + (For mp_limb_t being `long long'.) 
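+
+     (Reviewer's note, not in the original source: this is the usual
+     divide-and-conquer bit count.  The first step, x -= (x & 0xaaaa..) >> 1,
+     leaves every 2-bit field holding the number of bits that were set in
+     it; the 0x3333.. step sums adjacent fields into 4-bit counts, the
+     0x0f0f.. step into byte counts, and the remaining shifts add the byte
+     counts together, the final mask keeping only the total.  For an 8-bit
+     value 0b11011010 the stages give 10 01 01 01, then 0011 0010, then 5.)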
*/ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_popcount (register mp_srcptr p, register mp_size_t size) +#else +mpn_popcount (p, size) + register mp_srcptr p; + register mp_size_t size; +#endif +{ + unsigned long int popcnt; + mp_size_t i; + + popcnt = 0; + for (i = 0; i < size; i++) + popcnt += popc_limb (p[i]); + + return popcnt; +} diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c new file mode 100644 index 0000000000..27179683b3 --- /dev/null +++ b/rts/gmp/mpn/generic/pre_mod_1.c @@ -0,0 +1,69 @@ +/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, + divisor_limb_inverted) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB. + DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB + + - 2^BITS_PER_MP_LIMB. + Return the single-limb remainder. + +Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted) +#else +mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; + mp_limb_t divisor_limb_inverted; +#endif +{ + mp_size_t i; + mp_limb_t n0, r; + int dummy; + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted); + } + return r; +} diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c new file mode 100644 index 0000000000..dea4e20e56 --- /dev/null +++ b/rts/gmp/mpn/generic/random.c @@ -0,0 +1,43 @@ +/* mpn_random -- Generate random numbers. + +Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "urandom.h" + +void +#if __STDC__ +mpn_random (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + + for (i = 0; i < size; i++) + res_ptr[i] = urandom (); + + /* Make sure the most significant limb is non-zero. */ + while (res_ptr[size - 1] == 0) + res_ptr[size - 1] = urandom (); +} diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c new file mode 100644 index 0000000000..86682f81fa --- /dev/null +++ b/rts/gmp/mpn/generic/random2.c @@ -0,0 +1,105 @@ +/* mpn_random2 -- Generate random numbers with relatively long strings + of ones and zeroes. Suitable for border testing. + +Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4) +/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */ +long mrand48 (); +static inline long +random () +{ + return mrand48 (); +} +#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__)) +/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */ +static inline unsigned long +random () +{ + return rand () ^ (rand () << 16) ^ (rand() << 32); +} + +#else +long random (); +#endif + +/* It's a bit tricky to get this right, so please test the code well + if you hack with it. Some early versions of the function produced + random numbers with the leading limb == 0, and some versions never + made the most significant bit set. */ + +void +#if __STDC__ +mpn_random2 (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random2 (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + int n_bits; + int bit_pos; + mp_size_t limb_pos; + unsigned int ran; + mp_limb_t limb; + + limb = 0; + + /* Start off in a random bit position in the most significant limb. 
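+
+   (Reviewer's note, not in the original source: the loop below emits
+   alternating runs of one-bits and zero-bits, each run between 1 and
+   BITS_PER_MP_LIMB bits long; bit 0 of ran selects ones versus zeroes
+   and its remaining bits select the run length, which is what produces
+   the "long strings of ones and zeroes" promised in the file header.)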
*/ + bit_pos = random () & (BITS_PER_MP_LIMB - 1); + + /* Least significant bit of RAN chooses string of ones/string of zeroes. + Make most significant limb be non-zero by setting bit 0 of RAN. */ + ran = random () | 1; + + for (limb_pos = size - 1; limb_pos >= 0; ) + { + n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1; + if ((ran & 1) != 0) + { + /* Generate a string of ones. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1); + bit_pos += BITS_PER_MP_LIMB; + limb = (~(mp_limb_t) 0) << (bit_pos - n_bits); + } + else + { + limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1); + } + } + else + { + /* Generate a string of zeroes. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb; + limb = 0; + bit_pos += BITS_PER_MP_LIMB; + } + } + bit_pos -= n_bits; + ran = random (); + } +} diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c new file mode 100644 index 0000000000..59caf73529 --- /dev/null +++ b/rts/gmp/mpn/generic/rshift.c @@ -0,0 +1,88 @@ +/* mpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right + and store the USIZE least significant limbs of the result at WP. + The bits shifted out to the right are returned. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be <= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_rshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_rshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; + +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from low end to high end, to allow specified input/output + overlapping. 
*/ + for (i = 0; i < usize; i++) + wp[i] = up[i]; + } + return usize; + } +#endif + + wp -= 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + high_limb = up[0]; + retval = high_limb << sh_2; + low_limb = high_limb; + + for (i = 1; i < usize; i++) + { + high_limb = up[i]; + wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); + low_limb = high_limb; + } + wp[i] = low_limb >> sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c new file mode 100644 index 0000000000..a269e34f5f --- /dev/null +++ b/rts/gmp/mpn/generic/sb_divrem_mn.c @@ -0,0 +1,201 @@ +/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and + quotient. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + the NSIZE-DSIZE least significant quotient limbs at QP + and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= DSIZE. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + DSIZE >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. + 4. DSIZE >= 2. */ + + +#define PREINVERT_VIABLE \ + (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */) + +mp_limb_t +#if __STDC__ +mpn_sb_divrem_mn (mp_ptr qp, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp, mp_size_t dsize) +#else +mpn_sb_divrem_mn (qp, np, nsize, dp, dsize) + mp_ptr qp; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; + mp_size_t dsize; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t dx, d1, n0; + mp_limb_t dxinv; + int have_preinv; + + ASSERT_ALWAYS (dsize > 2); + + np += nsize - dsize; + dx = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if (n0 >= dx) + { + if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0) + { + mpn_sub_n (np, np, dp, dsize); + most_significant_q_limb = 1; + } + } + + /* If multiplication is much faster than division, preinvert the + most significant divisor limb before entering the loop. 
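+
+     (Reviewer's note, not in the original source: the inverse computed by
+     invert_limb is the same quantity described for mpn_preinv_mod_1
+     earlier in this patch, roughly 2^(2*BITS_PER_MP_LIMB)/dx -
+     2^BITS_PER_MP_LIMB for the normalized dx, and udiv_qrnnd_preinv then
+     replaces each per-limb division by a few multiplications.  The test
+     below only bothers when the saving over the nsize-dsize quotient
+     limbs exceeds the cost of computing the inverse, which is itself
+     about one division.)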
*/ + if (PREINVERT_VIABLE) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME) + { + invert_limb (dxinv, dx); + have_preinv = 1; + } + } + + for (i = nsize - dsize - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t nx; + mp_limb_t cy_limb; + + nx = np[dsize - 1]; + np--; + + if (nx == dx) + { + /* This might over-estimate q, but it's probably not worth + the extra code here to find out. */ + q = ~(mp_limb_t) 0; + +#if 1 + cy_limb = mpn_submul_1 (np, dp, dsize, q); +#else + /* This should be faster on many machines */ + cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize); + cy = mpn_add_n (np, np, dp, dsize); + np[dsize] += cy; +#endif + + if (nx != cy_limb) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + + qp[i] = q; + } + else + { + mp_limb_t rx, r1, r0, p1, p0; + + /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register + usage when np[dsize-1] is used in an asm statement like + umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due + to registers being clobbered. gcc 2.95 i386 doesn't have the + problem. */ + { + mp_limb_t workaround = np[dsize - 1]; + if (PREINVERT_VIABLE && have_preinv) + udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv); + else + udiv_qrnnd (q, r1, nx, workaround, dx); + } + umul_ppmm (p1, p0, d1, q); + + r0 = np[dsize - 2]; + rx = 0; + if (r1 < p1 || (r1 == p1 && r0 < p0)) + { + p1 -= p0 < d1; + p0 -= d1; + q--; + r1 += dx; + rx = r1 < dx; + } + + p1 += r0 < p0; /* cannot carry! */ + rx -= r1 < p1; /* may become 11..1 if q is still too large */ + r1 -= p1; + r0 -= p0; + + cy_limb = mpn_submul_1 (np, dp, dsize - 2, q); + + { + mp_limb_t cy1, cy2; + cy1 = r0 < cy_limb; + r0 -= cy_limb; + cy2 = r1 < cy1; + r1 -= cy1; + np[dsize - 1] = r1; + np[dsize - 2] = r0; + if (cy2 != rx) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + } + qp[i] = q; + } + } + + /* ______ ______ ______ + |__rx__|__r1__|__r0__| partial remainder + ______ ______ + - |__p1__|__p0__| partial product to subtract + ______ ______ + - |______|cylimb| + + rx is -1, 0 or 1. If rx=1, then q is correct (it should match + carry out). If rx=-1 then q is too large. If rx=0, then q might + be too large, but it is most likely correct. + */ + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c new file mode 100644 index 0000000000..96f05ce854 --- /dev/null +++ b/rts/gmp/mpn/generic/scan0.c @@ -0,0 +1,62 @@ +/* mpn_scan0 -- Scan from a given bit position for the next clear bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. 
Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner ot later have a limb with a clear bit. + */ + +unsigned long int +#if __STDC__ +mpn_scan0 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan0 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = ~*p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = ~*p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c new file mode 100644 index 0000000000..98e2e0dcc0 --- /dev/null +++ b/rts/gmp/mpn/generic/scan1.c @@ -0,0 +1,62 @@ +/* mpn_scan1 -- Scan from a given bit position for the next set bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner ot later have a limb != 0. + */ + +unsigned long int +#if __STDC__ +mpn_scan1 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan1 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = *p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = *p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c new file mode 100644 index 0000000000..e6ccc92154 --- /dev/null +++ b/rts/gmp/mpn/generic/set_str.c @@ -0,0 +1,159 @@ +/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) + -- Convert a STR_LEN long base BASE byte string pointed to by STR to a + limb vector pointed to by RES_PTR. Return the number of limbs in + RES_PTR. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_size_t +#if __STDC__ +mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base) +#else +mpn_set_str (xp, str, str_len, base) + mp_ptr xp; + const unsigned char *str; + size_t str_len; + int base; +#endif +{ + mp_size_t size; + mp_limb_t big_base; + int indigits_per_limb; + mp_limb_t res_digit; + + big_base = __mp_bases[base].big_base; + indigits_per_limb = __mp_bases[base].chars_per_limb; + +/* size = str_len / indigits_per_limb + 1; */ + + size = 0; + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Read the input string from + least to most significant character/digit. */ + + const unsigned char *s; + int next_bitpos; + int bits_per_indigit = big_base; + + res_digit = 0; + next_bitpos = 0; + + for (s = str + str_len - 1; s >= str; s--) + { + int inp_digit = *s; + + res_digit |= (mp_limb_t) inp_digit << next_bitpos; + next_bitpos += bits_per_indigit; + if (next_bitpos >= BITS_PER_MP_LIMB) + { + xp[size++] = res_digit; + next_bitpos -= BITS_PER_MP_LIMB; + res_digit = inp_digit >> (bits_per_indigit - next_bitpos); + } + } + + if (res_digit != 0) + xp[size++] = res_digit; + } + else + { + /* General case. The base is not a power of 2. */ + + size_t i; + int j; + mp_limb_t cy_limb; + + for (i = indigits_per_limb; i < str_len; i += indigits_per_limb) + { + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * 10 + *str++; + } + else + { + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * base + *str++; + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + big_base = base; + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. 
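+
+     (Reviewer's note, worked example not in the original source: in base
+     10 with 32-bit limbs, chars_per_limb is 9, since 10^9 is the largest
+     power of 10 fitting in a limb.  A 13-digit string is thus handled as
+     one full group of 9 digits by the loop above, and this tail code
+     collects the remaining 4 digits into res_digit while big_base grows
+     to 10^4; the accumulated result is then updated below as
+     x := x * 10^4 + res_digit.)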
*/ + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * 10 + *str++; + big_base *= 10; + } + } + else + { + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * base + *str++; + big_base *= base; + } + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + return size; +} diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000000..760258a3e0 --- /dev/null +++ b/rts/gmp/mpn/generic/sqr_basecase.c @@ -0,0 +1,83 @@ +/* mpn_sqr_basecase -- Internal routine to square two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void +#if __STDC__ +mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n) +#else +mpn_sqr_basecase (prodp, up, n) + mp_ptr prodp; + mp_srcptr up; + mp_size_t n; +#endif +{ + mp_size_t i; + + { + /* N.B.! We need the superfluous indirection through argh to work around + a reloader bug in GCC 2.7.*. */ + mp_limb_t x; + mp_limb_t argh; + x = up[0]; + umul_ppmm (argh, prodp[0], x, x); + prodp[1] = argh; + } + if (n > 1) + { + mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= KARATSUBA_SQR_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + for (i = 1; i < n; i++) + { + mp_limb_t x; + x = up[i]; + umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x); + } + { + mp_limb_t cy; + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2); + prodp[2 * n - 1] += cy; + } + } +} diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c new file mode 100644 index 0000000000..ee3b5144dd --- /dev/null +++ b/rts/gmp/mpn/generic/sqrtrem.c @@ -0,0 +1,509 @@ +/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + + Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR. + Write the remainder at REM_PTR, if REM_PTR != NULL. + Return the size of the remainder. + (The size of the root is always half of the size of the operand.) + + OP_PTR and ROOT_PTR may not point to the same object. 
+ OP_PTR and REM_PTR may point to the same object. + + If REM_PTR is NULL, only the root is computed and the return value of + the function is 0 if OP is a perfect square, and *any* non-zero number + otherwise. + +Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* This code is just correct if "unsigned char" has at least 8 bits. It + doesn't help to use CHAR_BIT from limits.h, as the real problem is + the static arrays. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Square root algorithm: + + 1. Shift OP (the input) to the left an even number of bits s.t. there + are an even number of words and either (or both) of the most + significant bits are set. This way, sqrt(OP) has exactly half as + many words as OP, and has its most significant bit set. + + 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables. + This approximation is used for the first single-precision + iterations of Newton's method, yielding a full-word approximation + to sqrt(OP). + + 3. Perform multiple-precision Newton iteration until we have the + exact result. Only about half of the input operand is used in + this calculation, as the square root is perfectly determinable + from just the higher half of a number. */ + +/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */ +#if defined __GNUC__ && ! defined __SOFT_FLOAT + +#if defined (__sparc__) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__HAVE_68881__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__hppa) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if 0 +#if defined (__i386__) || defined (__i486__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \ + __sqrt_res; \ + }) +#endif +#endif + +#endif + +#ifndef SQRT + +/* Tables for initial approximation of the square root. These are + indexed with bits 1-8 of the operand for which the square root is + calculated, where bit 0 is the most significant non-zero bit. I.e. + the most significant one-bit is not used, since that per definition + is one. Likewise, the tables don't return the highest bit of the + result. 
That bit must be inserted by or:ing the returned value with + 0x100. This way, we get a 9-bit approximation from 8-bit tables! */ + +/* Table to be used for operands with an even total number of bits. + (Exactly as in the decimal system there are similarities between the + square root of numbers with the same initial digits and an even + difference in the total number of digits. Consider the square root + of 1, 10, 100, 1000, ...) */ +static const unsigned char even_approx_tab[256] = +{ + 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e, + 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74, + 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79, + 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f, + 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89, + 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f, + 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94, + 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99, + 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e, + 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3, + 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7, + 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac, + 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1, + 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6, + 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba, + 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, + 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3, + 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8, + 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc, + 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1, + 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5, + 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda, + 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde, + 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2, + 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6, + 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb, + 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef, + 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3, + 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7, + 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb, + 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff, +}; + +/* Table to be used for operands with an odd total number of bits. + (Further comments before previous table.) 
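+
+   (Reviewer's note, not in the original source: concretely, once T has
+   been normalized so that the top bit of its high limb is set, the index
+   is the 8 bits immediately below that top bit, i.e.
+   (t_high0 >> (BITS_PER_MP_LIMB - 9)) & 0xff, looked up in
+   even_approx_tab; the 9-bit value 0x100 | table_entry, shifted back to
+   the top of a limb, is the starting approximation that the Newton steps
+   then refine.  The odd table plays the same role when only the
+   second-highest bit of the high limb is set.)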
*/ +static const unsigned char odd_approx_tab[256] = +{ + 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, + 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, + 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, + 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a, + 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d, + 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21, + 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, + 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28, + 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b, + 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f, + 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32, + 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35, + 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39, + 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c, + 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f, + 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42, + 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45, + 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49, + 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c, + 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f, + 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52, + 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55, + 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58, + 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b, + 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e, + 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61, + 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63, + 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66, + 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69, +}; +#endif + + +mp_size_t +#if __STDC__ +mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size) +#else +mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + mp_ptr root_ptr; + mp_ptr rem_ptr; + mp_srcptr op_ptr; + mp_size_t op_size; +#endif +{ + /* R (root result) */ + mp_ptr rp; /* Pointer to least significant word */ + mp_size_t rsize; /* The size in words */ + + /* T (OP shifted to the left a.k.a. normalized) */ + mp_ptr tp; /* Pointer to least significant word */ + mp_size_t tsize; /* The size in words */ + mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */ + mp_limb_t t_high0, t_high1; /* The two most significant words */ + + /* TT (temporary for numerator/remainder) */ + mp_ptr ttp; /* Pointer to least significant word */ + + /* X (temporary for quotient in main loop) */ + mp_ptr xp; /* Pointer to least significant word */ + mp_size_t xsize; /* The size in words */ + + unsigned cnt; + mp_limb_t initial_approx; /* Initially made approximation */ + mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */ + mp_size_t tmp; + mp_size_t i; + + mp_limb_t cy_limb; + TMP_DECL (marker); + + /* If OP is zero, both results are zero. */ + if (op_size == 0) + return 0; + + count_leading_zeros (cnt, op_ptr[op_size - 1]); + tsize = op_size; + if ((tsize & 1) != 0) + { + cnt += BITS_PER_MP_LIMB; + tsize++; + } + + rsize = tsize / 2; + rp = root_ptr; + + TMP_MARK (marker); + + /* Shift OP an even number of bits into T, such that either the most or + the second most significant bit is set, and such that the number of + words in T becomes even. This way, the number of words in R=sqrt(OP) + is exactly half as many as in OP, and the most significant bit of R + is set. + + Also, the initial approximation is simplified by this up-shifted OP. 
+ + Finally, the Newtonian iteration which is the main part of this + program performs division by R. The fast division routine expects + the divisor to be "normalized" in exactly the sense of having the + most significant bit set. */ + + tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + + if ((cnt & ~1) % BITS_PER_MP_LIMB != 0) + t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size, + (cnt & ~1) % BITS_PER_MP_LIMB); + else + MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size); + + if (cnt >= BITS_PER_MP_LIMB) + tp[0] = 0; + + t_high0 = tp[tsize - 1]; + t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */ + +/* Is there a fast sqrt instruction defined for this machine? */ +#ifdef SQRT + { + initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1); + /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have + become incorrect due to overflow in the conversion from double to + mp_limb_t above. It will typically be zero in that case, but might be + a small number on some machines. The most significant bit of + INITIAL_APPROX should be set, so that bit is a good overflow + indication. */ + if ((mp_limb_signed_t) initial_approx >= 0) + initial_approx = ~(mp_limb_t)0; + } +#else + /* Get a 9 bit approximation from the tables. The tables expect to + be indexed with the 8 high bits right below the highest bit. + Also, the highest result bit is not returned by the tables, and + must be or:ed into the result. The scheme gives 9 bits of start + approximation with just 256-entry 8 bit tables. */ + + if ((cnt & 1) == 0) + { + /* The most significant bit of t_high0 is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1); + initial_approx &= 0xff; + initial_approx = even_approx_tab[initial_approx]; + } + else + { + /* The most significant bit of t_high0 is unset, + the second most significant is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2); + initial_approx &= 0xff; + initial_approx = odd_approx_tab[initial_approx]; + } + initial_approx |= 0x100; + initial_approx <<= BITS_PER_MP_LIMB - 8 - 1; + + /* Perform small precision Newtonian iterations to get a full word + approximation. For small operands, these iterations will do the + entire job. */ + if (t_high0 == ~(mp_limb_t)0) + initial_approx = t_high0; + else + { + mp_limb_t quot; + + if (t_high0 >= initial_approx) + initial_approx = t_high0 + 1; + + /* First get about 18 bits with pure C arithmetics. */ + quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2; + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + + /* Now get a full word by one (or for > 36 bit machines) several + iterations. */ + for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1) + { + mp_limb_t ignored_remainder; + + udiv_qrnnd (quot, ignored_remainder, + t_high0, t_high1, initial_approx); + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } +#endif + + rp[0] = initial_approx; + rsize = 1; + +#ifdef SQRT_DEBUG + printf ("\n\nT = "); + mpn_dump (tp, tsize); +#endif + + if (tsize > 2) + { + /* Determine the successive precisions to use in the iteration. We + minimize the precisions, beginning with the highest (i.e. last + iteration) to the lowest (i.e. first iteration). 
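+
+   (Reviewer's note, worked example not in the original source: for a
+   normalized operand of tsize = 8 limbs the loop below starts from
+   tmp = 4 and records tsizes[0] = 6, tsizes[1] = 3 before breaking, so
+   the multiple-precision Newton iterations run first on a 3-limb slice
+   of T and then on a 6-limb slice, roughly doubling the precision at
+   each step.)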
*/ + + xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + + t_end_ptr = tp + tsize; + + tmp = tsize / 2; + for (i = 0;; i++) + { + tsize = (tmp + 1) / 2; + if (tmp == tsize) + break; + tsizes[i] = tsize + tmp; + tmp = tsize; + } + + /* Main Newton iteration loop. For big arguments, most of the + time is spent here. */ + + /* It is possible to do a great optimization here. The successive + divisors in the mpn_divmod call below have more and more leading + words equal to its predecessor. Therefore the beginning of + each division will repeat the same work as did the last + division. If we could guarantee that the leading words of two + consecutive divisors are the same (i.e. in this case, a later + divisor has just more digits at the end) it would be a simple + matter of just using the old remainder of the last division in + a subsequent division, to take care of this optimization. This + idea would surely make a difference even for small arguments. */ + + /* Loop invariants: + + R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1. + X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X. + R <= shiftdown_to_same_size(X). */ + + while (--i >= 0) + { + mp_limb_t cy; +#ifdef SQRT_DEBUG + mp_limb_t old_least_sign_r = rp[0]; + mp_size_t old_rsize = rsize; + + printf ("R = "); + mpn_dump (rp, rsize); +#endif + tsize = tsizes[i]; + + /* Need to copy the numerator into temporary space, as + mpn_divmod overwrites its numerator argument with the + remainder (which we currently ignore). */ + MPN_COPY (ttp, t_end_ptr - tsize, tsize); + cy = mpn_divmod (xp, ttp, tsize, rp, rsize); + xsize = tsize - rsize; + +#ifdef SQRT_DEBUG + printf ("X =%d ", cy); + mpn_dump (xp, xsize); +#endif + + /* Add X and R with the most significant limbs aligned, + temporarily ignoring at least one limb at the low end of X. */ + tmp = xsize - rsize; + cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize); + + /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get + intermediate roots that'd need an extra bit. We don't want to + handle that since it would make the subsequent divisor + non-normalized, so round such roots down to be only ones in the + current precision. */ + if (cy == 2) + { + mp_size_t j; + for (j = xsize; j >= 0; j--) + xp[j] = ~(mp_limb_t)0; + } + + /* Divide X by 2 and put the result in R. This is the new + approximation. Shift in the carry from the addition. */ + mpn_rshift (rp, xp, xsize, 1); + rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)); + rsize = xsize; +#ifdef SQRT_DEBUG + if (old_least_sign_r != rp[rsize - old_rsize]) + printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n", + i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r, + 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]); +#endif + } + } + +#ifdef SQRT_DEBUG + printf ("(final) R = "); + mpn_dump (rp, rsize); +#endif + + /* We computed the square root of OP * 2**(2*floor(cnt/2)). + This has resulted in R being 2**floor(cnt/2) to large. + Shift it down here to fix that. */ + if (cnt / 2 != 0) + { + mpn_rshift (rp, rp, rsize, cnt/2); + rsize -= rp[rsize - 1] == 0; + } + + /* Calculate the remainder. */ + mpn_mul_n (tp, rp, rp, rsize); + tsize = rsize + rsize; + tsize -= tp[tsize - 1] == 0; + if (op_size < tsize + || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0)) + { + /* R is too large. Decrement it. */ + + /* These operations can't overflow. 
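+
+     (Reviewer's note, not in the original source: at this point R is at
+     most one too large, so the square actually wanted is (R-1)^2 =
+     R^2 - 2*R + 1.  The code below rewrites the candidate T = R^2 into
+     exactly that: it subtracts R twice, propagating the borrow into the
+     high limbs with mpn_decr_u, adds back 1, and then decrements R
+     itself.)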
*/ + cy_limb = mpn_sub_n (tp, tp, rp, rsize); + cy_limb += mpn_sub_n (tp, tp, rp, rsize); + mpn_decr_u (tp + rsize, cy_limb); + mpn_incr_u (tp, (mp_limb_t) 1); + + mpn_decr_u (rp, (mp_limb_t) 1); + +#ifdef SQRT_DEBUG + printf ("(adjusted) R = "); + mpn_dump (rp, rsize); +#endif + } + + if (rem_ptr != NULL) + { + cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize); + MPN_NORMALIZE (rem_ptr, op_size); + TMP_FREE (marker); + return op_size; + } + else + { + int res; + res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size); + TMP_FREE (marker); + return res; + } +} diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c new file mode 100644 index 0000000000..4f2f06099c --- /dev/null +++ b/rts/gmp/mpn/generic/sub_n.c @@ -0,0 +1,62 @@ +/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to subtrahend */ + cy = (y < cy); /* get out carry from that addition */ + y = x - y; /* main subtract */ + cy = (y > x) + cy; /* get out carry from the subtract, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c new file mode 100644 index 0000000000..c7c08ee4af --- /dev/null +++ b/rts/gmp/mpn/generic/submul_1.c @@ -0,0 +1,65 @@ +/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, subtract the S1_SIZE least significant limbs of the product + from the limb vector pointed to by RES_PTR. Return the most significant + limb of the product, adjusted for carry-out from the subtraction. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x - prod_low; + cy_limb += (prod_low > x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c new file mode 100644 index 0000000000..b748b5d810 --- /dev/null +++ b/rts/gmp/mpn/generic/tdiv_qr.c @@ -0,0 +1,401 @@ +/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and + write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If + qxn is non-zero, generate that many fraction limbs and append them after the + other quotient limbs, and update the remainder accordningly. The input + operands are unaffected. + + Preconditions: + 1. The most significant limb of of the divisor must be non-zero. + 2. No argument overlap is permitted. (??? relax this ???) + 3. nn >= dn, even if qxn is non-zero. (??? relax this ???) + + The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time + complexity of multiplication. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +/* Extract the middle limb from ((h,,l) << cnt) */ +#define SHL(h,l,cnt) \ + ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1)))) + +void +#if __STDC__ +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +#else +mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_ptr rp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + /* FIXME: + 1. qxn + 2. pass allocated storage in additional parameter? + */ + if (qxn != 0) + abort (); + + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + + case 1: + { + rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]); + return; + } + + case 2: + { + int cnt; + mp_ptr n2p, d2p; + mp_limb_t qhl, cy; + TMP_DECL (marker); + TMP_MARK (marker); + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + default: + { + int adjust; + TMP_DECL (marker); + TMP_MARK (marker); + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + count_leading_zeros (cnt, dp[dn - 1]); + + qp[nn - dn] = 0; /* zero high quotient limb */ + if (cnt != 0) /* normalize divisor if needed */ + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else if (dn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + else + { + /* Perform 2*dn / dn limb divisions as long as the limbs + in np last. */ + mp_ptr q2p = qp + nn - 2 * dn; + n2p += nn - 2 * dn; + mpn_bz_divrem_n (q2p, n2p, d2p, dn); + nn -= dn; + while (nn >= 2 * dn) + { + mp_limb_t c; + q2p -= dn; n2p -= dn; + c = mpn_bz_divrem_n (q2p, n2p, d2p, dn); + ASSERT_ALWAYS (c == 0); + nn -= dn; + } + + if (nn != dn) + { + n2p -= nn - dn; + /* In theory, we could fall out to the cute code below + since we now have exactly the situation that code + is designed to handle. We botch this badly and call + the basic mpn_sb_divrem_mn! */ + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + } + } + + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. 
*/ + + { + /* Problem: + + Divide a numerator N with nn limbs by a denominator D with dn + limbs forming a quotient of nn-dn+1 limbs. When qn is small + compared to dn, conventional division algorithms perform poorly. + We want an algorithm that has an expected running time that is + dependent only on qn. It is assumed that the most significant + limb of the numerator is smaller than the most significant limb + of the denominator. + + Algorithm (very informally stated): + + 1) Divide the 2 x qn most significant limbs from the numerator + by the qn most significant limbs from the denominator. Call + the result qest. This is either the correct quotient, but + might be 1 or 2 too large. Compute the remainder from the + division. (This step is implemented by a mpn_divrem call.) + + 2) Is the most significant limb from the remainder < p, where p + is the product of the most significant limb from the quotient + and the next(d). (Next(d) denotes the next ignored limb from + the denominator.) If it is, decrement qest, and adjust the + remainder accordingly. + + 3) Is the remainder >= qest? If it is, qest is the desired + quotient. The algorithm terminates. + + 4) Subtract qest x next(d) from the remainder. If there is + borrow out, decrement qest, and adjust the remainder + accordingly. + + 5) Skip one word from the denominator (i.e., let next(d) denote + the next less significant limb. */ + + mp_size_t qn; + mp_ptr n2p, d2p; + mp_ptr tp; + mp_limb_t cy; + mp_size_t in, rn; + mp_limb_t quotient_too_large; + int cnt; + + qn = nn - dn; + qp[qn] = 0; /* zero high quotient limb */ + qn += adjust; /* qn cannot become bigger */ + + if (qn == 0) + { + MPN_COPY (rp, np, dn); + TMP_FREE (marker); + return; + } + + in = dn - qn; /* (at least partially) ignored # of limbs in ops */ + /* Normalize denominator by shifting it to the left such that its + most significant bit is set. Then shift the numerator the same + amount, to mathematically preserve quotient. */ + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB); + + mpn_lshift (d2p, dp + in, qn, cnt); + d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt); + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt); + if (adjust) + { + n2p[2 * qn] = cy; + n2p++; + } + else + { + n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt); + } + } + else + { + d2p = (mp_ptr) dp + in; + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn); + if (adjust) + { + n2p[2 * qn] = 0; + n2p++; + } + } + + /* Get an approximate quotient using the extracted operands. */ + if (qn == 1) + { + mp_limb_t q0, r0; + mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0; + /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some + temps here. This doesn't hurt code quality on any machines + so we do it unconditionally. */ + gcc272bug_n1 = n2p[1]; + gcc272bug_n0 = n2p[0]; + gcc272bug_d0 = d2p[0]; + udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0); + n2p[0] = r0; + qp[0] = q0; + } + else if (qn == 2) + mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); + else if (qn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn); + else + mpn_bz_divrem_n (qp, n2p, d2p, qn); + + rn = qn; + /* Multiply the first ignored divisor limb by the most significant + quotient limb. If that product is > the partial remainder's + most significant limb, we know the quotient is too large. 
This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. */ + { + mp_limb_t dl, x; + mp_limb_t h, l; + + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + + x = SHL (dp[in - 1], dl, cnt); + umul_ppmm (h, l, x, qp[qn - 1]); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt); + n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt)); + if (qn != rn) + { + if (n2p[qn] < cy2) + abort (); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up. */ + + tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + + if (in < qn) + { + if (in == 0) + { + MPN_COPY (rp, n2p, rn); + if (rn != dn) + abort (); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE (marker); + return; + } + } +} diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c new file mode 100644 index 0000000000..061cce86e1 --- /dev/null +++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c @@ -0,0 +1,131 @@ +/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + division. + + Contributed by Peter L. Montgomery. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_udiv_w_sdiv (rp, a1, a0, d) + mp_limb_t *rp, a1, a0, d; +{ + mp_limb_t q, r; + mp_limb_t c0, c1, b1; + + if ((mp_limb_signed_t) d >= 0) + { + if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1))) + { + /* dividend, divisor, and quotient are nonnegative */ + sdiv_qrnnd (q, r, a1, a0, d); + } + else + { + /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */ + sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1)); + /* Divide (c1*2^32 + c0) by d */ + sdiv_qrnnd (q, r, c1, c0, d); + /* Add 2^31 to quotient */ + q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } + else + { + b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */ + c1 = a1 >> 1; /* A/2 */ + c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1); + + if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */ + { + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */ + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */ + { + c1 = (b1 - 1) - c1; + c0 = ~c0; /* logical NOT */ + + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + q = ~q; /* (A/2)/b1 */ + r = (b1 - 1) - r; + + r = 2*r + (a0 & 1); /* A/(2*b1) */ + + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else /* Implies c1 = b1 */ + { /* Hence a1 = d - 1 = 2*b1 - 1 */ + if (a0 >= -d) + { + q = -1; + r = a0 + d; + } + else + { + q = -2; + r = a0 + 2*d; + } + } + } + + *rp = r; + return q; +} diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README new file mode 100644 index 0000000000..97e7abe011 --- /dev/null +++ b/rts/gmp/mpn/hppa/README @@ -0,0 +1,91 @@ +This directory contains mpn functions for various HP PA-RISC chips. Code +that runs faster on the PA7100 and later implementations, is in the pa7100 +directory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On the PA7000 no memory instructions can issue the two cycles after a store. +For the PA7100, this is reduced to one cycle. + +The PA7100 has a lookup-free cache, so it helps to schedule loads and the +dependent instruction really far from each other. + +STATUS + +1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the + instructions below (but some sw pipelining is needed to avoid the + xmpyu-fstds delay): + + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + + addc + stws res_ptr + addc + stws res_ptr + + addib Loop + +2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb + (asymptotically) on the PA7100, using the instructions below. With proper + sw pipelining and the unrolling level below, the speed becomes 8 + cycles/limb. + + fldds s1_ptr + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + addc + addc + addc + addc + addc %r0,%r0,cy-limb + + ldws res_ptr + ldws res_ptr + ldws res_ptr + ldws res_ptr + add + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + + addib + +3. For the PA8000 we have to stick to using 32-bit limbs before compiler + support emerges. 
But we want to use 64-bit operations whenever possible, + in particular for loads and stores. It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), masking+bit field + inserting when (they are not). The speed should double compared to the + code used today. diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s new file mode 100644 index 0000000000..c53b2f71b3 --- /dev/null +++ b/rts/gmp/mpn/hppa/add_n.s @@ -0,0 +1,58 @@ +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h new file mode 100644 index 0000000000..98b6d9ce3c --- /dev/null +++ b/rts/gmp/mpn/hppa/gmp-mparam.h @@ -0,0 +1,63 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 172 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 185 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 96 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 122 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s new file mode 100644 index 0000000000..c7d218f922 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s @@ -0,0 +1,102 @@ +; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 11 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 10 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + + .code + .export __gmpn_addmul_1 +__gmpn_addmul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s new file mode 100644 index 0000000000..4512fddec9 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s @@ -0,0 +1,98 @@ +; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +; not become faster due to data cache contention after a store. On the +; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since +; only the xmpyu does not need the integer pipeline, so the only dual-issue +; we will get are addc+xmpyu. Unrolling would not help either CPU. + +; We could use fldds to read two limbs at a time from the S1 array, and that +; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +; PA7100, respectively. We don't do that since it does not seem worth the +; (alignment) troubles... + +; At least the PA7100 is rumored to be able to deal with cache-misses +; without stalling instruction issue. If this is true, and the cache is +; actually also lockup-free, we should use a deeper software pipeline, and +; load from S1 very early! (The loads and stores to -12(sp) will surely be +; in the cache.) + + .code + .export __gmpn_mul_1 +__gmpn_mul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s new file mode 100644 index 0000000000..4f4be08b37 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s @@ -0,0 +1,75 @@ +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + addc %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S new file mode 100644 index 0000000000..04db06822e --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S @@ -0,0 +1,189 @@ +; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __gmpn_addmul_1 +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma 
s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s new file mode 100644 index 0000000000..31669b1a55 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s @@ -0,0 +1,83 @@ +; HP-PA __gmpn_lshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_lshift +__gmpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + vshd %r22,%r29,%r20 + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r22,%r29,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r22,%r29,%r20 + +L$end1 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$end2 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s new file mode 100644 index 0000000000..d32b10b4b1 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s @@ -0,0 +1,80 @@ +; HP-PA __gmpn_rshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_rshift +__gmpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + vshd %r29,%r22,%r20 + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r29,%r22,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r29,%r22,%r20 + +L$end1 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$end2 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s new file mode 100644 index 0000000000..0eec41c4b3 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s @@ -0,0 +1,76 @@ +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
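+; Structure of the code below: the main L$loop handles four limbs per
+; iteration, mirroring pa7100/add_n.s above, and L$rest/L$eloop finish the
+; remaining limbs one at a time.  The closing addc %r0,%r0,%r28 /
+; subi 1,%r28,%r28 pair converts PA-RISC's "carry set means no borrow"
+; flag into the 0-or-1 borrow that mpn_sub_n is defined to return.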
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + subb %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S new file mode 100644 index 0000000000..0fba21dcef --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S @@ -0,0 +1,195 @@ +; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
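+; Note on borrow handling below: the low halves of the products are
+; subtracted from the destination limbs with sub/subb, and on PA-RISC the
+; carry bit after a subtract means "no borrow".  The subb %r0,%r0,lo0 /
+; add lo0,lo0,%r0 pair in the loop (commented "just invert cy") therefore
+; recreates the true borrow so that the following addc chain can fold it
+; into the accumulation of the next group of product limbs.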
+ +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __gmpn_submul_1 +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... 
invert cy + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s new file mode 100644 index 0000000000..20a5b5ce0a --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s @@ -0,0 +1,111 @@ +; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 12 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 11 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + +; It seems possible to make this run as fast as __gmpn_addmul_1, if we use +; sub,>>= %r29,%r19,%r22 +; addi 1,%r28,%r28 +; but that requires reworking the hairy software pipeline... + + .code + .export __gmpn_submul_1 +__gmpn_submul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S new file mode 100644 index 0000000000..b83d6f4dd2 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S @@ -0,0 +1,80 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
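+; For reference, the operation implemented here (and by the other
+; udiv_qrnnd variants in this tree) can be sketched in C roughly as
+; follows, assuming 32-bit limbs and an available 64-bit integer type;
+; the name ref_udiv_qrnnd is illustrative only:
+;
+;   mp_limb_t
+;   ref_udiv_qrnnd (mp_limb_t *rem_ptr, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
+;   {
+;     /* assumes n1 < d, so that the quotient fits in one limb */
+;     unsigned long long n = ((unsigned long long) n1 << 32) | n0;
+;     *rem_ptr = (mp_limb_t) (n % d);
+;     return (mp_limb_t) (n / d);
+;   }
+;
+; The code below obtains the same result with double-precision floating-point
+; division (fcnvxf/fdiv/fcnvfx) followed by an integer adjustment of the
+; quotient and remainder.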
+ + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 ; 2^64 + .word 0x0 + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo +#ifdef PIC + addil LT%L$0000,%r19 + ldo RT%L$0000(%r1),%r19 +#else + ldil L%L$0000,%r19 + ldo R%L$0000(%r19),%r19 +#endif + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s new file mode 100644 index 0000000000..1f1300ac9b --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s @@ -0,0 +1,42 @@ +; Copyright (C) 1999 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .code + .export __umul_ppmm + .align 4 +__umul_ppmm + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + stw %r25,-16(0,%r30) + fldws -16(0,%r30),%fr22R + stw %r24,-16(0,%r30) + fldws -16(0,%r30),%fr22L + xmpyu %fr22R,%fr22L,%fr22 + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + stw %r29,0(0,%r26) + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s new file mode 100644 index 0000000000..6e97278a39 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0 +; and store sum in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. 
+ +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + bv (%r2) + .exit + addc %r0,%r0,%r28 + .procend diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s new file mode 100644 index 0000000000..7d9b50fc27 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same +; length > 0 and store difference in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
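+; As in __gmpn_add_n above, arbitrary sizes are handled by branching into
+; the middle of an eight-way unrolled loop: the code computes (-size) mod 8,
+; backs the three pointers up by that many limbs, and uses blr to enter the
+; loop at the matching label, so the first pass through the body processes
+; size mod 8 limbs (a full eight when size is a multiple of eight) and every
+; later pass processes eight.  This is the assembly analogue of the C
+; "Duff's device" switch-into-a-loop idiom.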
+ + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + addc %r0,%r0,%r28 + bv (%r2) + .exit + subi 1,%r28,%r28 + .procend diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s new file mode 100644 index 0000000000..f5a2daad60 --- /dev/null +++ b/rts/gmp/mpn/hppa/lshift.s @@ -0,0 +1,66 @@ +; HP-PA __gmpn_lshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_lshift +__gmpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r22,%r29,%r20 + +L$0002 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$0003 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s new file mode 100644 index 0000000000..e05e2f10b5 --- /dev/null +++ b/rts/gmp/mpn/hppa/rshift.s @@ -0,0 +1,63 @@ +; HP-PA __gmpn_rshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_rshift +__gmpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r29,%r22,%r20 + +L$0002 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$0003 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s new file mode 100644 index 0000000000..8f770ad1ad --- /dev/null +++ b/rts/gmp/mpn/hppa/sub_n.s @@ -0,0 +1,59 @@ +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. 
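
; Editor's note, not part of the original source: on PA-RISC a subtract
; sets the carry bit when *no* borrow occurs, so the exit sequence below
; (addc %r0,%r0,%r28 followed by subi 1,%r28,%r28) captures that bit and
; returns 1 - carry, i.e. the conventional mpn borrow result: 1 if the
; subtraction underflowed, 0 otherwise.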
+ + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s new file mode 100644 index 0000000000..9aa3b8a830 --- /dev/null +++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s @@ -0,0 +1,286 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on pre-PA7000 CPUs. + +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + +; The code size is a bit excessive. We could merge the last two ds;addc +; sequences by simply moving the "bb,< Odd" instruction down. The only +; trouble is the FFFFFFFF code that would need some hacking. 
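
; Editor's note (illustrative sketch only, not part of the original GMP
; source): the routine below is assumed to follow the usual udiv_qrnnd
; contract from longlong.h, dividing the two-limb value n1*2^32 + n0 by
; d, with n1 < d assumed so the quotient fits in one limb:
;
;     q        = (n1*2^32 + n0) / d      returned in gr28
;     *rem_ptr = (n1*2^32 + n0) % d      stored through gr26
;
; The long runs of "ds" (divide step) below produce one quotient bit per
; instruction, since integer division here is built from PA-RISC
; divide-step primitives rather than a single divide instruction.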
+ + .code + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd + .proc + .callinfo frame=0,no_calls + .entry + + comb,< %r23,0,L$largedivisor + sub %r0,%r23,%r1 ; clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + +L$largedivisor + extru %r24,31,1,%r19 ; r19 = n0 & 1 + bb,< %r23,31,L$odd + extru %r23,30,31,%r22 ; r22 = d >> 1 + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + +L$odd addib,sv,n 1,%r22,L$FF.. 
; r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +; This is just a special case of the code above. +; We come here when d == 0xFFFFFFFF +L$FF.. add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/i960/README b/rts/gmp/mpn/i960/README new file mode 100644 index 0000000000..d68a0a83eb --- /dev/null +++ b/rts/gmp/mpn/i960/README @@ -0,0 +1,9 @@ +This directory contains mpn functions for Intel i960 processors. + +RELEVANT OPTIMIZATION ISSUES + +The code in this directory is not well optimized. + +STATUS + +The code in this directory has not been tested. diff --git a/rts/gmp/mpn/i960/add_n.s b/rts/gmp/mpn/i960/add_n.s new file mode 100644 index 0000000000..387317a397 --- /dev/null +++ b/rts/gmp/mpn/i960/add_n.s @@ -0,0 +1,43 @@ +# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_add_n +___gmpn_add_n: + mov 0,g6 # clear carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + addc g4,g5,g4 # main add + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, clears C bit + + mov g6,g0 + ret diff --git a/rts/gmp/mpn/i960/addmul_1.s b/rts/gmp/mpn/i960/addmul_1.s new file mode 100644 index 0000000000..7df1418356 --- /dev/null +++ b/rts/gmp/mpn/i960/addmul_1.s @@ -0,0 +1,48 @@ +# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_mul_1 +___gmpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + ld (g13)[g2*4],g5 + + addc g0,g6,g6 # relies on that C bit is clear + addc 0,g7,g7 + addc g5,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/rts/gmp/mpn/i960/mul_1.s b/rts/gmp/mpn/i960/mul_1.s new file mode 100644 index 0000000000..5c0c985aa5 --- /dev/null +++ b/rts/gmp/mpn/i960/mul_1.s @@ -0,0 +1,45 @@ +# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
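
# Editor's note (illustrative sketch only, not part of the original GMP
# source): in C terms the routine below is assumed to compute
#
#     mp_limb_t carry = 0;
#     for (i = 0; i < size; i++)
#       {
#         /* 32x32 -> 64 bit product of one source limb and s2_limb */
#         unsigned long long p = (unsigned long long) s1_ptr[i] * s2_limb
#                                + carry;
#         res_ptr[i] = (mp_limb_t) p;           /* low half stored   */
#         carry      = (mp_limb_t) (p >> 32);   /* high half carried */
#       }
#     return carry;
#
# The emul instruction supplies the 64-bit product directly, low half in
# g6 and high half in g7.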
+ +.text + .align 4 + .globl ___gmpn_mul_1 +___gmpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + + addc g0,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/rts/gmp/mpn/i960/sub_n.s b/rts/gmp/mpn/i960/sub_n.s new file mode 100644 index 0000000000..2db2d46aad --- /dev/null +++ b/rts/gmp/mpn/i960/sub_n.s @@ -0,0 +1,43 @@ +# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_sub_n +___gmpn_sub_n: + mov 1,g6 # set carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + subc g4,g5,g4 # main subtract + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, cy will be 0 + + mov g6,g0 + ret diff --git a/rts/gmp/mpn/lisp/gmpasm-mode.el b/rts/gmp/mpn/lisp/gmpasm-mode.el new file mode 100644 index 0000000000..5d9da7fa1f --- /dev/null +++ b/rts/gmp/mpn/lisp/gmpasm-mode.el @@ -0,0 +1,351 @@ +;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode. + + +;; Copyright (C) 1999, 2000 Free Software Foundation, Inc. +;; +;; This file is part of the GNU MP Library. +;; +;; The GNU MP Library is free software; you can redistribute it and/or modify +;; it under the terms of the GNU Lesser General Public License as published by +;; the Free Software Foundation; either version 2.1 of the License, or (at your +;; option) any later version. +;; +;; The GNU MP Library is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +;; License for more details. +;; +;; You should have received a copy of the GNU Lesser General Public License +;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +;; MA 02111-1307, USA. + + +;;; Commentary: +;; +;; gmpasm-mode is an editing mode for m4 processed assembler code and m4 +;; macro files in GMP. It's similar to m4-mode, but has a number of +;; settings better suited to GMP. 
+;; +;; +;; Install +;; ------- +;; +;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the +;; load-path and the following in .emacs +;; +;; (autoload 'gmpasm-mode "gmpasm-mode" nil t) +;; +;; To use gmpasm-mode automatically on all .asm and .m4 files, put the +;; following in .emacs +;; +;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode)) +;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode)) +;; +;; To have gmpasm-mode only on gmp files, try instead something like the +;; following, which uses it only in a directory starting with "gmp", or a +;; sub-directory of such. +;; +;; (add-to-list 'auto-mode-alist +;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode)) +;; +;; Byte compiling will slightly speed up loading. If you want a docstring +;; in the autoload you can use M-x update-file-autoloads if you set it up +;; right. +;; +;; +;; Emacsen +;; ------- +;; +;; FSF Emacs 20.x - gmpasm-mode is designed for this. +;; XEmacs 20.x - seems to work. +;; +;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are +;; available. comment-region with "C" won't really do the right thing +;; though. + + +;;; Code: + +(defgroup gmpasm nil + "GNU MP m4 and asm editing." + :prefix "gmpasm-" + :group 'languages) + +(defcustom gmpasm-mode-hook nil + "*Hook called by `gmpasm-mode'." + :type 'hook + :group 'gmpasm) + +(defcustom gmpasm-comment-start-regexp "[#;!@C]" + "*Regexp matching possible comment styles. +See `gmpasm-mode' docstring for how this is used." + :type 'regexp + :group 'gmpasm) + + +(defun gmpasm-add-to-list-second (list-var element) + "(gmpasm-add-to-list-second LIST-VAR ELEMENT) + +Add ELEMENT to LIST-VAR as the second element in the list, if it isn't +already in the list. If LIST-VAR is nil, then ELEMENT is just added as the +sole element in the list. + +This is like `add-to-list', but it puts the new value second in the list. + +The first cons cell is copied rather than changed in-place, so references to +the list elsewhere won't be affected." + + (if (member element (symbol-value list-var)) + (symbol-value list-var) + (set list-var + (if (symbol-value list-var) + (cons (car (symbol-value list-var)) + (cons element + (cdr (symbol-value list-var)))) + (list element))))) + + +(defun gmpasm-delete-from-list (list-var element) + "(gmpasm-delete-from-list LIST-VAR ELEMENT) + +Delete ELEMENT from LIST-VAR, using `delete'. +This is like `add-to-list', but the element is deleted from the list. +The list is copied rather than changed in-place, so references to it elsewhere +won't be affected." + + (set list-var (delete element (copy-sequence (symbol-value list-var))))) + + +(defvar gmpasm-mode-map + (let ((map (make-sparse-keymap))) + + ;; assembler and dnl commenting + (define-key map "\C-c\C-c" 'comment-region) + (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl) + + ;; kill an M-x compile, since it's not hard to put m4 into an infinite + ;; loop + (define-key map "\C-c\C-k" 'kill-compilation) + + map) + "Keymap for `gmpasm-mode'.") + + +(defvar gmpasm-mode-syntax-table + (let ((table (make-syntax-table))) + ;; underscore left as a symbol char, like C mode + + ;; m4 quotes + (modify-syntax-entry ?` "('" table) + (modify-syntax-entry ?' ")`" table) + + table) + "Syntax table used in `gmpasm-mode'. + +m4 ignores quote marks in # comments at the top level, but inside quotes # +isn't special and all quotes are active. There seems no easy way to express +this in the syntax table, so nothing is done for comments. 
Usually this is +best, since it picks up invalid apostrophes in comments inside quotes.") + + +(defvar gmpasm-font-lock-keywords + (eval-when-compile + (list + (cons + (concat + "\\b" + (regexp-opt + '("deflit" "defreg" "defframe" "defframe_pushl" + "define_not_for_expansion" + "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE" + "forloop" + "TEXT" "DATA" "ALIGN" "W32" + "builtin" "changecom" "changequote" "changeword" "debugfile" + "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef" + "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef" + "ifelse" "include" "incr" "index" "indir" "len" "__line__" + "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef" + "regexp" "shift" "sinclude" "substr" "syscmd" "sysval" + "traceoff" "traceon" "translit" "undefine" "undivert" "unix") + t) + "\\b") 'font-lock-keyword-face))) + + "`font-lock-keywords' for `gmpasm-mode'. + +The keywords are m4 builtins and some of the GMP macros used in asm files. +L and LF don't look good fontified, so they're omitted. + +The right assembler comment regexp is added dynamically buffer-local (with +dnl too).") + + +;; Initialized if gmpasm-mode finds filladapt loaded. +(defvar gmpasm-filladapt-token-table nil + "Filladapt token table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-match-table nil + "Filladapt token match table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-conversion-table nil + "Filladapt token conversion table used in `gmpasm-mode'.") + + +;;;###autoload +(defun gmpasm-mode () + "A major mode for editing GNU MP asm and m4 files. + +\\{gmpasm-mode-map} +`comment-start' and `comment-end' are set buffer-local to assembler +commenting appropriate for the CPU by looking for something matching +`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if +there's no match (if \"#\" isn't what you want, type in a desired comment +and do \\[gmpasm-mode] to reinitialize). + +`adaptive-fill-regexp' is set buffer-local to the standard regexp with +`comment-start' and dnl added. If filladapt.el has been loaded it similarly +gets `comment-start' and dnl added as buffer-local fill prefixes. + +Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting, +and assembler commenting (based on the `comment-start' determined). + +Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so +the `C' in it is only matched as a whole word, not on something that happens +to start with `C'. Also it's only the particular `comment-start' determined +that's added for filling etc, not the whole `gmpasm-comment-start-regexp'. + +`gmpasm-mode-hook' is run after initializations are complete. +" + + (interactive) + (kill-all-local-variables) + (setq major-mode 'gmpasm-mode + mode-name "gmpasm") + (use-local-map gmpasm-mode-map) + (set-syntax-table gmpasm-mode-syntax-table) + (setq fill-column 76) + + ;; Short instructions might fit with 32, but anything with labels or + ;; expressions soon needs the comments pushed out to column 40. + (setq comment-column 40) + + ;; Don't want to find out the hard way which dumb assemblers don't like a + ;; missing final newline. + (set (make-local-variable 'require-final-newline) t) + + ;; The first match of gmpasm-comment-start-regexp at the start of a line + ;; determines comment-start, or "#" if no match. 
+ (set (make-local-variable 'comment-start) + (save-excursion + (goto-char (point-min)) + (if (re-search-forward + (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)") + nil t) + (match-string 1) + "#"))) + (set (make-local-variable 'comment-end) "") + + ;; If comment-start ends in an alphanumeric then \b is used to match it + ;; only as a separate word. The test is for an alphanumeric rather than + ;; \w since we might try # or ! as \w characters but without wanting \b. + (let ((comment-regexp + (concat (regexp-quote comment-start) + (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b")))) + + ;; Whitespace is required before a comment-start so m4 $# doesn't match + ;; when comment-start is "#". + ;; Only spaces or tabs match after, so newline isn't included in the + ;; font lock below. + (set (make-local-variable 'comment-start-skip) + (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*")) + + ;; Comment fontification based on comment-start, matching through to the + ;; end of the line. + (add-to-list (make-local-variable 'gmpasm-font-lock-keywords) + (cons (concat + "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$") + 'font-lock-comment-face)) + + (set (make-local-variable 'font-lock-defaults) + '(gmpasm-font-lock-keywords + t ; no syntactic fontification (of strings etc) + nil ; no case-fold + ((?_ . "w")) ; _ part of a word while fontifying + )) + + ;; Paragraphs are separated by blank lines, or lines with only dnl or + ;; comment-start. + (set (make-local-variable 'paragraph-separate) + (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$")) + (set (make-local-variable 'paragraph-start) + (concat "\f\\|" paragraph-separate)) + + ;; Adaptive fill gets dnl and comment-start as comment style prefixes on + ;; top of the standard regexp (which has # and ; already actually). + (set (make-local-variable 'adaptive-fill-regexp) + (concat "[ \t]*\\(\\(" + comment-regexp + "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*")) + (set (make-local-variable 'adaptive-fill-first-line-regexp) + "\\`\\([ \t]*dnl\\)?[ \t]*\\'") + + (when (fboundp 'filladapt-mode) + (when (not gmpasm-filladapt-token-table) + (setq gmpasm-filladapt-token-table + filladapt-token-table) + (setq gmpasm-filladapt-token-match-table + filladapt-token-match-table) + (setq gmpasm-filladapt-token-conversion-table + filladapt-token-conversion-table) + + ;; Numbered bullet points like "2.1" get matched at the start of a + ;; line when it's really something like "2.1 cycles/limb", so delete + ;; this from the list. The regexp for "1.", "2." etc is left + ;; though. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("[0-9]+\\(\\.[0-9]+\\)+[ \t]" + bullet)) + + ;; "%" as a comment prefix interferes with x86 register names + ;; like %eax, so delete this. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("%+" postscript-comment)) + + (add-to-list 'gmpasm-filladapt-token-match-table + '(gmpasm-comment gmpasm-comment)) + (add-to-list 'gmpasm-filladapt-token-conversion-table + '(gmpasm-comment . exact)) + ) + + (set (make-local-variable 'filladapt-token-table) + gmpasm-filladapt-token-table) + (set (make-local-variable 'filladapt-token-match-table) + gmpasm-filladapt-token-match-table) + (set (make-local-variable 'filladapt-token-conversion-table) + gmpasm-filladapt-token-conversion-table) + + ;; Add dnl and comment-start as fill prefixes. + ;; Comments in filladapt.el say filladapt-token-table must begin + ;; with ("^" beginning-of-line), so put our addition second. 
+ (gmpasm-add-to-list-second 'filladapt-token-table + (list (concat "dnl[ \t]\\|" comment-regexp) + 'gmpasm-comment)) + )) + + (run-hooks 'gmpasm-mode-hook)) + + +(defun gmpasm-comment-region-dnl (beg end &optional arg) + "(gmpasm-comment-region BEG END &option ARG) + +Comment or uncomment each line in the region using `dnl'. +With \\[universal-argument] prefix arg, uncomment each line in region. +This is `comment-region', but using \"dnl\"." + + (interactive "r\nP") + (let ((comment-start "dnl") + (comment-end "")) + (comment-region beg end arg))) + + +(provide 'gmpasm-mode) + +;;; gmpasm-mode.el ends here diff --git a/rts/gmp/mpn/m68k/add_n.S b/rts/gmp/mpn/m68k/add_n.S new file mode 100644 index 0000000000..9e1d89d64f --- /dev/null +++ b/rts/gmp/mpn/m68k/add_n.S @@ -0,0 +1,79 @@ +/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 16) + size (sp + 12) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_add_n) + +C_SYMBOL_NAME(__gmpn_add_n:) +PROLOG(__gmpn_add_n) +/* Save used registers on the stack. */ + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) + +/* Restore used registers from stack frame. */ + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) + + rts +EPILOG(__gmpn_add_n) diff --git a/rts/gmp/mpn/m68k/lshift.S b/rts/gmp/mpn/m68k/lshift.S new file mode 100644 index 0000000000..a539d5d42e --- /dev/null +++ b/rts/gmp/mpn/m68k/lshift.S @@ -0,0 +1,150 @@ +/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer. + +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_lshift) + +C_SYMBOL_NAME(__gmpn_lshift:) +PROLOG(__gmpn_lshift) + +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(s_ptr),R(res_ptr) + bls L(Lspecial) /* jump if s_ptr >= res_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(s_ptr,d0,l),R(a2) +#endif + cmpl R(res_ptr),R(a2) + bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d0) + lsrl R(d5),R(d0) /* compute carry limb */ + + lsll R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d2) +L(L1:) + movel MEM_PREDEC(s_ptr),R(d1) + movel R(d1),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
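
   [Editor's note, not part of the original source: this special-case
   path is only reached when the shift count is exactly 1 and the
   overlap test near the entry allows walking the limbs from the low
   end; each addxl below adds a limb to itself with the extend (carry)
   bit, which is precisely a one-bit left shift with the carry
   propagated between limbs.]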
*/ + +L(Lspecial:) + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) +L(LL1:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + + dbf R(s_size),L(LLoop) + addxl R(d0),R(d0) /* save cy in lsb */ + subl #0x10000,R(s_size) + bcs L(LLend) + lsrl #1,R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__gmpn_lshift) diff --git a/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/rts/gmp/mpn/m68k/mc68020/addmul_1.S new file mode 100644 index 0000000000..6638115d71 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/addmul_1.S @@ -0,0 +1,83 @@ +/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_addmul_1) + +C_SYMBOL_NAME(__gmpn_addmul_1:) +PROLOG(__gmpn_addmul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d5),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + addl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + addl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d5) + + rts +EPILOG(__gmpn_addmul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/mul_1.S b/rts/gmp/mpn/m68k/mc68020/mul_1.S new file mode 100644 index 0000000000..fdd4c39d70 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/mul_1.S @@ -0,0 +1,90 @@ +/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_mul_1) + +C_SYMBOL_NAME(__gmpn_mul_1:) +PROLOG(__gmpn_mul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d4),MEM_PREDEC(sp) +#if 0 + movel R(d2),MEM_PREDEC(sp) + movel R(d3),MEM_PREDEC(sp) + movel R(d4),MEM_PREDEC(sp) +#endif + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,16),R(res_ptr) + movel MEM_DISP(sp,20),R(s1_ptr) + movel MEM_DISP(sp,24),R(s1_size) + movel MEM_DISP(sp,28),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + clrl R(d3) + addxl R(d3),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d4) +#if 0 + movel MEM_POSTINC(sp),R(d4) + movel MEM_POSTINC(sp),R(d3) + movel MEM_POSTINC(sp),R(d2) +#endif + rts +EPILOG(__gmpn_mul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/submul_1.S b/rts/gmp/mpn/m68k/mc68020/submul_1.S new file mode 100644 index 0000000000..3c36b70166 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/submul_1.S @@ -0,0 +1,83 @@ +/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_submul_1) + +C_SYMBOL_NAME(__gmpn_submul_1:) +PROLOG(__gmpn_submul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d5),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + subl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + subl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d5) + + rts +EPILOG(__gmpn_submul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/udiv.S b/rts/gmp/mpn/m68k/mc68020/udiv.S new file mode 100644 index 0000000000..d00cf13558 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/udiv.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +.text + .even +.globl ___udiv_qrnnd +___udiv_qrnnd: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + divul sp@(16),d1:d0 + movel d1,a0@ + rts diff --git a/rts/gmp/mpn/m68k/mc68020/umul.S b/rts/gmp/mpn/m68k/mc68020/umul.S new file mode 100644 index 0000000000..a34ae6c543 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/umul.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +.text + .even +.globl ___umul_ppmm +___umul_ppmm: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + mulul d0,d0:d1 + movel d1,a0@ + rts diff --git a/rts/gmp/mpn/m68k/rshift.S b/rts/gmp/mpn/m68k/rshift.S new file mode 100644 index 0000000000..b47a48e52a --- /dev/null +++ b/rts/gmp/mpn/m68k/rshift.S @@ -0,0 +1,149 @@ +/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_rshift) + +C_SYMBOL_NAME(__gmpn_rshift:) +PROLOG(__gmpn_rshift) +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(res_ptr),R(s_ptr) + bls L(Lspecial) /* jump if res_ptr >= s_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(res_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(res_ptr,d0,l),R(a2) +#endif + cmpl R(s_ptr),R(a2) + bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d0) + lsll R(d5),R(d0) /* compute carry limb */ + + lsrl R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d2) +L(L1:) + movel MEM_POSTINC(s_ptr),R(d1) + movel R(d1),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM(res_ptr) /* store most significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from most significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
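
   [Editor's note, not part of the original source: as in lshift.S this
   path is only reached when the shift count is exactly 1; the limbs are
   walked from the most significant end and each roxrl #1 rotates a limb
   right through the extend bit, so the bit shifted out of the
   previously processed (more significant) limb enters at the top of the
   next one; the net effect is a one-bit right shift of the whole
   vector.]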
*/ + +L(Lspecial:) +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) +L(LL1:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + + dbf R(s_size),L(LLoop) + roxrl #1,R(d0) /* save cy in msb */ + subl #0x10000,R(s_size) + bcs L(LLend) + addl R(d0),R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__gmpn_rshift) diff --git a/rts/gmp/mpn/m68k/sub_n.S b/rts/gmp/mpn/m68k/sub_n.S new file mode 100644 index 0000000000..ce45b24db5 --- /dev/null +++ b/rts/gmp/mpn/m68k/sub_n.S @@ -0,0 +1,79 @@ +/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 16) + size (sp + 12) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_sub_n) + +C_SYMBOL_NAME(__gmpn_sub_n:) +PROLOG(__gmpn_sub_n) +/* Save used registers on the stack. */ + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) + +/* Restore used registers from stack frame. */ + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) + + rts +EPILOG(__gmpn_sub_n) diff --git a/rts/gmp/mpn/m68k/syntax.h b/rts/gmp/mpn/m68k/syntax.h new file mode 100644 index 0000000000..9eec279c06 --- /dev/null +++ b/rts/gmp/mpn/m68k/syntax.h @@ -0,0 +1,177 @@ +/* asm.h -- Definitions for 68k syntax variations. 
+ +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#undef ALIGN + +#ifdef MIT_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)base@ +#define MEM_DISP(base,displacement)base@(displacement) +#define MEM_INDX(base,idx,size_suffix)base@(idx:size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)base@(idx:size_suffix:scale) +#define MEM_PREDEC(memory_base)memory_base@- +#define MEM_POSTINC(memory_base)memory_base@+ +#define L(label) label +#define TEXT .text +#define ALIGN .even +#define GLOBL .globl +#define moveql moveq +/* Use variable sized opcodes. */ +#define bcc jcc +#define bcs jcs +#define bls jls +#define beq jeq +#define bne jne +#define bra jra +#endif + +#ifdef SONY_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)(base) +#define MEM_DISP(base,displacement)(displacement,base) +#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale) +#define MEM_PREDEC(memory_base)-(memory_base) +#define MEM_POSTINC(memory_base)(memory_base)+ +#define L(label) label +#define TEXT .text +#define ALIGN .even +#define GLOBL .globl +#endif + +#ifdef MOTOROLA_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)(base) +#define MEM_DISP(base,displacement)(displacement,base) +#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale) +#define MEM_PREDEC(memory_base)-(memory_base) +#define MEM_POSTINC(memory_base)(memory_base)+ +#define L(label) label +#define TEXT +#define ALIGN +#define GLOBL XDEF +#define lea LEA +#define movel MOVE.L +#define moveml MOVEM.L +#define moveql MOVEQ.L +#define cmpl CMP.L +#define orl OR.L +#define clrl CLR.L +#define eorw EOR.W +#define lsrl LSR.L +#define lsll LSL.L +#define roxrl ROXR.L +#define roxll ROXL.L +#define addl ADD.L +#define addxl ADDX.L +#define addql ADDQ.L +#define subl SUB.L +#define subxl SUBX.L +#define subql SUBQ.L +#define negl NEG.L +#define mulul MULU.L +#define bcc BCC +#define bcs BCS +#define bls BLS +#define beq BEQ +#define bne BNE +#define bra BRA +#define dbf DBF +#define rts RTS +#define d0 D0 +#define d1 D1 +#define d2 D2 +#define d3 D3 +#define d4 D4 +#define d5 D5 +#define d6 D6 +#define d7 D7 +#define a0 A0 +#define a1 A1 +#define a2 A2 +#define a3 A3 +#define a4 A4 +#define a5 A5 +#define a6 A6 +#define a7 A7 +#define sp SP +#endif + +#ifdef ELF_SYNTAX +#define PROLOG(name) .type name,@function +#define EPILOG(name) .size name,.-name +#define MEM(base)(R(base)) +#define MEM_DISP(base,displacement)(displacement,R(base)) +#define MEM_PREDEC(memory_base)-(R(memory_base)) 
+#define MEM_POSTINC(memory_base)(R(memory_base))+ +#ifdef __STDC__ +#define R_(r)%##r +#define R(r)R_(r) +#define MEM_INDX_(base,idx,size_suffix)(R(base),R(idx##.##size_suffix)) +#define MEM_INDX(base,idx,size_suffix)MEM_INDX_(base,idx,size_suffix) +#define MEM_INDX1_(base,idx,size_suffix,scale)(R(base),R(idx##.##size_suffix*scale)) +#define MEM_INDX1(base,idx,size_suffix,scale)MEM_INDX1_(base,idx,size_suffix,scale) +#define L(label) .##label +#else +#define R(r)%/**/r +#define MEM_INDX(base,idx,size_suffix)(R(base),R(idx).size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(R(base),R(idx).size_suffix*scale) +#define L(label) ./**/label +#endif +#define TEXT .text +#define ALIGN .align 2 +#define GLOBL .globl +#define bcc jbcc +#define bcs jbcs +#define bls jbls +#define beq jbeq +#define bne jbne +#define bra jbra +#endif + +#if defined (SONY_SYNTAX) || defined (ELF_SYNTAX) +#define movel move.l +#define moveml movem.l +#define moveql moveq.l +#define cmpl cmp.l +#define orl or.l +#define clrl clr.l +#define eorw eor.w +#define lsrl lsr.l +#define lsll lsl.l +#define roxrl roxr.l +#define roxll roxl.l +#define addl add.l +#define addxl addx.l +#define addql addq.l +#define subl sub.l +#define subxl subx.l +#define subql subq.l +#define negl neg.l +#define mulul mulu.l +#endif diff --git a/rts/gmp/mpn/m88k/add_n.s b/rts/gmp/mpn/m88k/add_n.s new file mode 100644 index 0000000000..0b776c618a --- /dev/null +++ b/rts/gmp/mpn/m88k/add_n.s @@ -0,0 +1,104 @@ +; mc88100 __gmpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. 
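
; Editor's note (illustrative sketch only, not part of the original GMP
; source): in C terms the loop below is assumed to compute
;
;     mp_limb_t cy = 0;
;     for (i = 0; i < size; i++)
;       {
;         mp_limb_t sum = s1_ptr[i] + s2_ptr[i] + cy;
;         cy = (sum < s1_ptr[i]) || (cy && sum == s1_ptr[i]);
;         res_ptr[i] = sum;
;       }
;     return cy;
;
; The addu.cio instructions chain the carry between limbs in hardware,
; and the computed jump through r12 enters the unrolled loop at the
; point matching the residual size mod 8 limbs.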
+ + text + align 16 + global ___gmpn_add_n +___gmpn_add_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + addu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; add 7 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; add 6 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; add 5 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; add 4 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; add 3 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; add 2 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; add 1 + 8r limbs + addu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/rts/gmp/mpn/m88k/mc88110/add_n.S b/rts/gmp/mpn/m88k/mc88110/add_n.S new file mode 100644 index 0000000000..843a50dded --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/add_n.S @@ -0,0 +1,200 @@ +; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_add_n) +C_SYMBOL_NAME(__gmpn_add_n): + addu.co r0,r0,r0 ; clear cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned? 
+/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + addu.cio r6,r10,r8 + ld r10,s1_ptr,16 + addu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,24 + addu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + addu.cio r6,r10,r8 + ld r10,s1_ptr,32 + addu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: addu.cio r6,r10,r8 + addu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + addu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + or r12,r0,s2_ptr + or s2_ptr,r0,s1_ptr + or s1_ptr,r0,r12 + br L0 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + addu.cio r8,r8,r6 + st r8,res_ptr,8 + addu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + addu.cio r8,r8,r6 + st r8,res_ptr,16 + addu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + addu.cio r8,r8,r6 + st r8,res_ptr,24 + addu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + addu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/rts/gmp/mpn/m88k/mc88110/addmul_1.s new file mode 100644 index 0000000000..7d97c87c79 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/addmul_1.s @@ -0,0 +1,61 @@ +; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
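One structural note on the mc88110 __gmpn_add_n just above (its sub_n twin further down is organized the same way): ld.d and st.d move two limbs per access but need doubleword-aligned addresses, so the entry code xors the pointers and tests bit 2 to choose between the V1a, V1b and V2 paths, peeling off a single limb where that makes the remaining pointers line up. The hypothetical C helper below only makes that three-way split explicit; the name and the idea of returning a tag are illustration, not GMP code.

    /* Classify the alignment cases the assembly labels V1a, V1b and V2.
       Casting a pointer to unsigned long is assumed to be value-preserving
       on these 32-bit targets. */
    static char
    classify_alignment (const void *res_ptr, const void *s1_ptr,
                        const void *s2_ptr)
    {
      if ((((unsigned long) s2_ptr ^ (unsigned long) res_ptr) & 4) == 0)
        return 'a';   /* V1a: res_ptr and s2_ptr congruent modulo 8 */
      if ((((unsigned long) s1_ptr ^ (unsigned long) res_ptr) & 4) == 0)
        return 'b';   /* V1b: swap s1_ptr and s2_ptr, then run the V1a code */
      return '2';     /* V2: s1_ptr and s2_ptr congruent with each other */
    }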
+ + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_addmul_1 +___gmpn_addmul_1: + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + subu r8,r8,4 + bcnd.n eq0,r4,Lend + mulu.d r10,r6,r5 + +Loop: ld r7,r8[r4] + ld r6,r3[r4] + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd ne0,r4,Loop + +Lend: ld r7,r8,0 + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8,0 + jmp.n r1 + addu.ci r2,r2,r0 diff --git a/rts/gmp/mpn/m88k/mc88110/mul_1.s b/rts/gmp/mpn/m88k/mc88110/mul_1.s new file mode 100644 index 0000000000..b8483afa91 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/mul_1.s @@ -0,0 +1,59 @@ +; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_mul_1 +___gmpn_mul_1: + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n eq0,r4,Lend + subu r8,r8,8 + +Loop: ld r6,r3[r4] + addu.cio r9,r11,r2 + or r2,r10,r0 ; could be avoided if unrolled + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n ne0,r4,Loop + st r9,r8[r4] + +Lend: addu.cio r9,r11,r2 + st r9,r8,4 + jmp.n r1 + addu.ci r2,r10,r0 diff --git a/rts/gmp/mpn/m88k/mc88110/sub_n.S b/rts/gmp/mpn/m88k/mc88110/sub_n.S new file mode 100644 index 0000000000..715a3faf25 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/sub_n.S @@ -0,0 +1,276 @@ +; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. 
+ +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_sub_n) +C_SYMBOL_NAME(__gmpn_sub_n): + subu.co r0,r0,r0 ; set cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r10,r8 + ld r10,s1_ptr,16 + subu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,24 + subu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r10,r8 + ld r10,s1_ptr,32 + subu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: subu.cio r6,r10,r8 + subu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + subu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. 
limb + jmp.n r1 + xor r2,r2,1 + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s1_ptr */ + ld r10,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + ld r8,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + subu size,size,1 + subu.co r6,r8,r10 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1b: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s2_ptr,0 + ld r12,s2_ptr,4 + ld.d r8,s1_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1b +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1b: subu size,size,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r8,r10 + ld r10,s2_ptr,16 + subu.cio r7,r9,r12 + ld r12,s2_ptr,20 + ld.d r8,s1_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,24 + subu.cio r7,r9,r12 + ld r12,s2_ptr,28 + ld.d r8,s1_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r8,r10 + ld r10,s2_ptr,32 + subu.cio r7,r9,r12 + ld r12,s2_ptr,36 + addu s2_ptr,s2_ptr,32 + ld.d r8,s1_ptr,32 + addu s1_ptr,s1_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1b + +Lfin1b: addu size,size,8-2 + bcnd lt0,size,Lend1b +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1b:subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1b +Lend1b: subu.cio r6,r8,r10 + subu.cio r7,r9,r12 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1b +/* Add last limb */ + ld r10,s2_ptr,8 + ld r8,s1_ptr,8 + subu.cio r6,r8,r10 + st r6,res_ptr,8 + +Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + subu.cio r8,r8,r6 + st r8,res_ptr,8 + subu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + subu.cio r8,r8,r6 + st r8,res_ptr,16 + subu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + subu.cio r8,r8,r6 + st r8,res_ptr,24 + subu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + subu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/rts/gmp/mpn/m88k/mul_1.s b/rts/gmp/mpn/m88k/mul_1.s new file mode 100644 index 0000000000..06370837ef --- /dev/null +++ b/rts/gmp/mpn/m88k/mul_1.s @@ -0,0 +1,127 @@ +; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + +; Common overhead is about 11 cycles/invocation. + +; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention.) + +; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.) + +; To enhance speed: +; 1. Unroll main loop 4-8 times. +; 2. Schedule code to avoid WB contention. It might be tempting to move the +; ld instruction in the loops down to save 2 cycles (less WB contention), +; but that looses because the ultimate value will be read from outside +; the allocated space. But if we handle the ultimate multiplication in +; the tail, we can do this. +; 3. 
Make the multiplication with less instructions. I think the code for +; (S2_LIMB >= 0x10000) is not minimal. +; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or +; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11 +; cycles/limb. (Assuming infinite unrolling.) + + text + align 16 + global ___gmpn_mul_1 +___gmpn_mul_1: + + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + ld r9,r3[r4] + mask r7,r5,0xffff ; r7 = lo(S2_LIMB) + extu r8,r5,16 ; r8 = hi(S2_LIMB) + bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) + subu r6,r6,4 + +; General code for any value of S2_LIMB. + + ; Make a stack frame and save r25 and r26 + subu r31,r31,16 + st.d r25,r31,8 + + ; Enter the loop in the middle + br.n L1 + addu r4,r4,1 + +Loop: ld r9,r3[r4] + st r26,r6[r4] +; bcnd ne0,r0,0 ; bubble + addu r4,r4,1 +L1: mul r26,r9,r5 ; low word of product mul_1 WB ld + mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 + mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 + mul r10,r12,r8 ; r10 = prod_1a mul_3 + extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 + mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 + mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 + extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 + addu r10,r10,r11 ; addu_1 WB extu_2 +; bcnd ne0,r0,0 ; bubble WB addu_1 + addu.co r10,r10,r12 ; WB mul_4 + mask.u r10,r10,0xffff ; move the 16 most significant bits... + addu.ci r10,r10,r0 ; ...to the low half of the word... + rot r10,r10,16 ; ...and put carry in pos 16. + addu.co r26,r26,r2 ; add old carry limb + bcnd.n ne0,r4,Loop + addu.ci r2,r25,r10 ; compute new carry limb + + st r26,r6[r4] + ld.d r25,r31,8 + jmp.n r1 + addu r31,r31,16 + +; Fast code for S2_LIMB < 0x10000 +Lsmall: + ; Enter the loop in the middle + br.n SL1 + addu r4,r4,1 + +SLoop: ld r9,r3[r4] ; + st r8,r6[r4] ; + addu r4,r4,1 ; +SL1: mul r8,r9,r5 ; low word of product + mask r12,r9,0xffff ; r12 = lo(s1_limb) + extu r13,r9,16 ; r13 = hi(s1_limb) + mul r11,r12,r7 ; r11 = prod_0 + mul r12,r13,r7 ; r12 = prod_1b + addu.cio r8,r8,r2 ; add old carry limb + extu r10,r11,16 ; r11 = hi(prod_0) + addu r10,r10,r12 ; + bcnd.n ne0,r4,SLoop + extu r2,r10,16 ; r2 = new carry limb + + jmp.n r1 + st r8,r6[r4] diff --git a/rts/gmp/mpn/m88k/sub_n.s b/rts/gmp/mpn/m88k/sub_n.s new file mode 100644 index 0000000000..2fd345a135 --- /dev/null +++ b/rts/gmp/mpn/m88k/sub_n.s @@ -0,0 +1,106 @@ +; mc88100 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
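The mc88100 __gmpn_mul_1 above has no widening multiply, so each 32x32->64 product is assembled from 16-bit halves; the prod_0, prod_1a, prod_1b and prod_2 terms in its comments are exactly the four partial products. The sketch below shows the same decomposition in C, with uint32_t used purely as a convenience for the sketch; the assembly folds the middle terms into the high word with mask.u/rot rather than the shifts used here, but the arithmetic is the same.

    #include <stdint.h>

    /* Sketch: *hi:*lo = u * v using only 32-bit operations. */
    static void
    umul_16x16 (uint32_t u, uint32_t v, uint32_t *hi, uint32_t *lo)
    {
      uint32_t ul = u & 0xffff, uh = u >> 16;
      uint32_t vl = v & 0xffff, vh = v >> 16;

      uint32_t p0  = ul * vl;                 /* prod_0  */
      uint32_t p1a = ul * vh;                 /* prod_1a */
      uint32_t p1b = uh * vl;                 /* prod_1b */
      uint32_t p2  = uh * vh;                 /* prod_2  */

      uint32_t mid = p1a + (p0 >> 16);        /* cannot overflow */
      mid += p1b;                             /* may wrap */
      uint32_t midc = mid < p1b;              /* carry out of the middle sum */

      *lo = (mid << 16) | (p0 & 0xffff);
      *hi = p2 + (mid >> 16) + (midc << 16);
    }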
+ + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___gmpn_sub_n +___gmpn_sub_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu r5,r0,r5 + mak r5,r5,3<4> + bcnd.n eq0,r5,Lzero + subu.co r0,r0,r0 ; initialize carry + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + subu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; subtract 7 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; subtract 6 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; subtract 5 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; subtract 4 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; subtract 3 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; subtract 2 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs + subu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/rts/gmp/mpn/mips2/add_n.s b/rts/gmp/mpn/mips2/add_n.s new file mode 100644 index 0000000000..5c3c7fc8a1 --- /dev/null +++ b/rts/gmp/mpn/mips2/add_n.s @@ -0,0 +1,120 @@ + # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_add_n diff --git a/rts/gmp/mpn/mips2/addmul_1.s b/rts/gmp/mpn/mips2/addmul_1.s new file mode 100644 index 0000000000..1e5037751b --- /dev/null +++ b/rts/gmp/mpn/mips2/addmul_1.s @@ -0,0 +1,97 @@ + # MIPS __gmpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
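For reference, the addmul_1 below computes {res_ptr, size} += {s1_ptr, size} * s2_limb and returns the final carry limb; the warm-up / loop / cool-down split keeps one multu in flight while the previous product is consumed. A portable sketch of the operation follows, using a 64-bit temporary where the assembly reads HI/LO; 32-bit limbs and the availability of a 64-bit integer type are assumptions of the sketch only.

    #include <stdint.h>

    /* Sketch: {rp,n} += {up,n} * v, n > 0; returns the carry limb. */
    static uint32_t
    ref_addmul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
    {
      uint32_t cy = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          uint64_t p = (uint64_t) up[i] * v;   /* what multu leaves in HI:LO */
          uint32_t plo = (uint32_t) p;
          uint32_t phi = (uint32_t) (p >> 32);
          uint32_t t = plo + cy;               /* add the old carry limb */
          phi += t < plo;                      /* ... and its carry */
          uint32_t r = rp[i] + t;              /* add into the destination */
          phi += r < t;                        /* ... and this one */
          rp[i] = r;
          cy = phi;                            /* high limb becomes new carry */
        }
      return cy;
    }

mul_1 and submul_1 further down are the same loop with the destination add removed or turned into a subtract.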
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_addmul_1 diff --git a/rts/gmp/mpn/mips2/lshift.s b/rts/gmp/mpn/mips2/lshift.s new file mode 100644 index 0000000000..2ca3a3c800 --- /dev/null +++ b/rts/gmp/mpn/mips2/lshift.s @@ -0,0 +1,95 @@ + # MIPS2 __gmpn_lshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
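The lshift below forms {res_ptr, size} = {src_ptr, size} << cnt and returns the bits shifted out of the most significant limb; as the pointer adjustments at the top of the routine show, it runs from the high end of the vectors downward. A C sketch, assuming 32-bit limbs and the usual mpn restriction 1 <= cnt < 32:

    #include <stdint.h>

    /* Sketch: {rp,n} = {up,n} << cnt, 1 <= cnt <= 31, n > 0.
       Returns the bits shifted out of the most significant limb. */
    static uint32_t
    ref_lshift (uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      uint32_t retval = up[n - 1] >> tnc;      /* bits pushed out at the top */
      long i;
      for (i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
      rp[0] = up[0] << cnt;
      return retval;
    }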
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: + .set noreorder + .set nomacro + + sll $2,$6,2 + addu $5,$5,$2 # make r5 point at end of src + lw $10,-4($5) # load first limb + subu $13,$0,$7 + addu $4,$4,$2 # make r4 point at end of res + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + srl $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,-8($5) + addiu $4,$4,-4 + addiu $5,$5,-4 + addiu $9,$9,-1 + sll $11,$10,$7 + srl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,-8($5) + addiu $4,$4,-16 + addiu $6,$6,-4 + sll $11,$10,$7 + srl $12,$3,$13 + + lw $10,-12($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,12($4) + srl $9,$10,$13 + + lw $3,-16($5) + sll $11,$10,$7 + or $8,$14,$9 + sw $8,8($4) + srl $12,$3,$13 + + lw $10,-20($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,4($4) + srl $9,$10,$13 + + addiu $5,$5,-16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,0($4) + +.Lend: sll $8,$10,$7 + j $31 + sw $8,-4($4) + .end __gmpn_lshift diff --git a/rts/gmp/mpn/mips2/mul_1.s b/rts/gmp/mpn/mips2/mul_1.s new file mode 100644 index 0000000000..ea8aa26809 --- /dev/null +++ b/rts/gmp/mpn/mips2/mul_1.s @@ -0,0 +1,85 @@ + # MIPS __gmpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + addiu $5,$5,4 + addu $10,$10,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sw $10,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + multu $8,$7 + sw $10,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + sw $10,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_mul_1 diff --git a/rts/gmp/mpn/mips2/rshift.s b/rts/gmp/mpn/mips2/rshift.s new file mode 100644 index 0000000000..37c8f39cb4 --- /dev/null +++ b/rts/gmp/mpn/mips2/rshift.s @@ -0,0 +1,92 @@ + # MIPS2 __gmpn_rshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
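The rshift below is the mirror image of the previous routine: it walks upward from the least significant limb and returns the bits shifted out at the bottom. The corresponding sketch, under the same 32-bit-limb and 1 <= cnt < 32 assumptions:

    #include <stdint.h>

    /* Sketch: {rp,n} = {up,n} >> cnt, 1 <= cnt <= 31, n > 0.
       Returns the bits shifted out of the least significant limb. */
    static uint32_t
    ref_rshift (uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      uint32_t retval = up[0] << tnc;          /* bits pushed out at the bottom */
      long i;
      for (i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
      rp[n - 1] = up[n - 1] >> cnt;
      return retval;
    }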
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: + .set noreorder + .set nomacro + + lw $10,0($5) # load first limb + subu $13,$0,$7 + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + sll $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,4($5) + addiu $4,$4,4 + addiu $5,$5,4 + addiu $9,$9,-1 + srl $11,$10,$7 + sll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,-4($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,4($5) + addiu $4,$4,16 + addiu $6,$6,-4 + srl $11,$10,$7 + sll $12,$3,$13 + + lw $10,8($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-16($4) + sll $9,$10,$13 + + lw $3,12($5) + srl $11,$10,$7 + or $8,$14,$9 + sw $8,-12($4) + sll $12,$3,$13 + + lw $10,16($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-8($4) + sll $9,$10,$13 + + addiu $5,$5,16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,-4($4) + +.Lend: srl $8,$10,$7 + j $31 + sw $8,0($4) + .end __gmpn_rshift diff --git a/rts/gmp/mpn/mips2/sub_n.s b/rts/gmp/mpn/mips2/sub_n.s new file mode 100644 index 0000000000..51d34f3ac3 --- /dev/null +++ b/rts/gmp/mpn/mips2/sub_n.s @@ -0,0 +1,120 @@ + # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
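The sub_n below computes {res_ptr, size} = {s1_ptr, size} - {s2_ptr, size} and returns the borrow out of the most significant limb. Having no borrow flag, the MIPS code rebuilds it with sltu at each step, exactly as in the sketch here; compared with the add_n sketch earlier, only the comparison after the main operation changes. The m88k versions instead return the processor carry flag flipped with xor r2,r2,1, since after their subtract chain that flag is the complement of the borrow.

    #include <stdint.h>

    /* Sketch: {rp,n} = {s1p,n} - {s2p,n}, n > 0; returns the borrow (0 or 1). */
    static uint32_t
    ref_sub_n (uint32_t *rp, const uint32_t *s1p, const uint32_t *s2p, long n)
    {
      uint32_t br = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          uint32_t a = s1p[i];
          uint32_t b = s2p[i] + br;     /* wraps to 0 only when a borrow propagates */
          uint32_t c1 = b < br;         /* wrapped: borrow already certain */
          uint32_t d = a - b;
          uint32_t c2 = a < b;          /* borrow out of the subtraction */
          rp[i] = d;
          br = c1 | c2;
        }
      return br;
    }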
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_sub_n diff --git a/rts/gmp/mpn/mips2/submul_1.s b/rts/gmp/mpn/mips2/submul_1.s new file mode 100644 index 0000000000..495dea3ba2 --- /dev/null +++ b/rts/gmp/mpn/mips2/submul_1.s @@ -0,0 +1,97 @@ + # MIPS __gmpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_submul_1 diff --git a/rts/gmp/mpn/mips2/umul.s b/rts/gmp/mpn/mips2/umul.s new file mode 100644 index 0000000000..40e847614c --- /dev/null +++ b/rts/gmp/mpn/mips2/umul.s @@ -0,0 +1,30 @@ + # Copyright (C) 1999 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + .text + .align 2 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: + multu $5,$6 + mflo $3 + mfhi $2 + sw $3,0($4) + j $31 + .end __umul_ppmm diff --git a/rts/gmp/mpn/mips3/README b/rts/gmp/mpn/mips3/README new file mode 100644 index 0000000000..e94b2c7460 --- /dev/null +++ b/rts/gmp/mpn/mips3/README @@ -0,0 +1,23 @@ +This directory contains mpn functions optimized for MIPS3. Example of +processors that implement MIPS3 are R4000, R4400, R4600, R4700, and R8000. + +RELEVANT OPTIMIZATION ISSUES + +1. On the R4000 and R4400, branches, both the plain and the "likely" ones, + take 3 cycles to execute. (The fastest possible loop will take 4 cycles, + because of the delay insn.) + + On the R4600, branches takes a single cycle + + On the R8000, branches often take no noticable cycles, as they are + executed in a separate function unit.. + +2. The R4000 and R4400 have a load latency of 4 cycles. + +3. On the R4000 and R4400, multiplies take a data-dependent number of + cycles, contrary to the SGI documentation. There seem to be 3 or 4 + possible latencies. 
+ +STATUS + +Good... diff --git a/rts/gmp/mpn/mips3/add_n.s b/rts/gmp/mpn/mips3/add_n.s new file mode 100644 index 0000000000..adad0beaef --- /dev/null +++ b/rts/gmp/mpn/mips3/add_n.s @@ -0,0 +1,120 @@ + # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_add_n diff --git a/rts/gmp/mpn/mips3/addmul_1.s b/rts/gmp/mpn/mips3/addmul_1.s new file mode 100644 index 0000000000..d390e2298e --- /dev/null +++ b/rts/gmp/mpn/mips3/addmul_1.s @@ -0,0 +1,97 @@ + # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. 
+ + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_addmul_1 diff --git a/rts/gmp/mpn/mips3/gmp-mparam.h b/rts/gmp/mpn/mips3/gmp-mparam.h new file mode 100644 index 0000000000..656e90c7b0 --- /dev/null +++ b/rts/gmp/mpn/mips3/gmp-mparam.h @@ -0,0 +1,58 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the R10000 usign the system cc. */ +/* Generated by tuneup.c, 2000-07-25. 
*/ +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 32 +#endif + +/* Supressed the TOOM3 values as they looked absolutely crazy + (698 and 21 respectively) */ + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 54 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 82 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 159 +#endif diff --git a/rts/gmp/mpn/mips3/lshift.s b/rts/gmp/mpn/mips3/lshift.s new file mode 100644 index 0000000000..372606fddf --- /dev/null +++ b/rts/gmp/mpn/mips3/lshift.s @@ -0,0 +1,95 @@ + # MIPS3 __gmpn_lshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: + .set noreorder + .set nomacro + + dsll $2,$6,3 + daddu $5,$5,$2 # make r5 point at end of src + ld $10,-8($5) # load first limb + dsubu $13,$0,$7 + daddu $4,$4,$2 # make r4 point at end of res + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsrl $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,-16($5) + daddiu $4,$4,-8 + daddiu $5,$5,-8 + daddiu $9,$9,-1 + dsll $11,$10,$7 + dsrl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,-16($5) + daddiu $4,$4,-32 + daddiu $6,$6,-4 + dsll $11,$10,$7 + dsrl $12,$3,$13 + + ld $10,-24($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,24($4) + dsrl $9,$10,$13 + + ld $3,-32($5) + dsll $11,$10,$7 + or $8,$14,$9 + sd $8,16($4) + dsrl $12,$3,$13 + + ld $10,-40($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,8($4) + dsrl $9,$10,$13 + + daddiu $5,$5,-32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,0($4) + +.Lend: dsll $8,$10,$7 + j $31 + sd $8,-8($4) + .end __gmpn_lshift diff --git a/rts/gmp/mpn/mips3/mul_1.s b/rts/gmp/mpn/mips3/mul_1.s new file mode 100644 index 0000000000..6659e2b4eb --- /dev/null +++ b/rts/gmp/mpn/mips3/mul_1.s @@ -0,0 +1,85 @@ + # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. 
+ + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + daddiu $5,$5,8 + daddu $10,$10,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sd $10,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_mul_1 diff --git a/rts/gmp/mpn/mips3/rshift.s b/rts/gmp/mpn/mips3/rshift.s new file mode 100644 index 0000000000..59c7fd3492 --- /dev/null +++ b/rts/gmp/mpn/mips3/rshift.s @@ -0,0 +1,92 @@ + # MIPS3 __gmpn_rshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: + .set noreorder + .set nomacro + + ld $10,0($5) # load first limb + dsubu $13,$0,$7 + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsll $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,8($5) + daddiu $4,$4,8 + daddiu $5,$5,8 + daddiu $9,$9,-1 + dsrl $11,$10,$7 + dsll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,-8($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,8($5) + daddiu $4,$4,32 + daddiu $6,$6,-4 + dsrl $11,$10,$7 + dsll $12,$3,$13 + + ld $10,16($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-32($4) + dsll $9,$10,$13 + + ld $3,24($5) + dsrl $11,$10,$7 + or $8,$14,$9 + sd $8,-24($4) + dsll $12,$3,$13 + + ld $10,32($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-16($4) + dsll $9,$10,$13 + + daddiu $5,$5,32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,-8($4) + +.Lend: dsrl $8,$10,$7 + j $31 + sd $8,0($4) + .end __gmpn_rshift diff --git a/rts/gmp/mpn/mips3/sub_n.s b/rts/gmp/mpn/mips3/sub_n.s new file mode 100644 index 0000000000..c57c824b04 --- /dev/null +++ b/rts/gmp/mpn/mips3/sub_n.s @@ -0,0 +1,120 @@ + # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_sub_n diff --git a/rts/gmp/mpn/mips3/submul_1.s b/rts/gmp/mpn/mips3/submul_1.s new file mode 100644 index 0000000000..531f9705a6 --- /dev/null +++ b/rts/gmp/mpn/mips3/submul_1.s @@ -0,0 +1,97 @@ + # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_submul_1 diff --git a/rts/gmp/mpn/mp_bases.c b/rts/gmp/mpn/mp_bases.c new file mode 100644 index 0000000000..011c328c80 --- /dev/null +++ b/rts/gmp/mpn/mp_bases.c @@ -0,0 +1,550 @@ +/* __mp_bases -- Structure for conversion between internal binary + format and strings in base 2..255. The fields are explained in + gmp-impl.h. + + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + + +#if BITS_PER_MP_LIMB == 32 +const struct bases __mp_bases[256] = +{ + /* 0 */ {0, 0.0, 0, 0}, + /* 1 */ {0, 1e38, 0, 0}, + /* 2 */ {32, 1.0000000000000000, 0x1, 0x0}, + /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24}, + /* 4 */ {16, 0.5000000000000000, 0x2, 0x0}, + /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684}, + /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6}, + /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb}, + /* 8 */ {10, 0.3333333333333334, 0x3, 0x0}, + /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24}, + /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82}, + /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04}, + /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5}, + /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f}, + /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e}, + /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2}, + /* 16 */ {8, 0.2500000000000000, 0x4, 0x0}, + /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd}, + /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1}, + /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942}, + /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab}, + /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15}, + /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0}, + /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9}, + /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b}, + /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812}, + /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396}, + /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9}, + /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43}, + /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea}, + /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511}, + /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0}, + /* 32 */ {6, 0.2000000000000000, 0x5, 0x0}, + /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3}, + /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229}, + /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30}, + /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6}, + /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9}, + /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe}, + /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9}, + /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b}, + /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154}, + /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d}, + /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0}, + /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895}, + /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5}, + /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f}, + /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582}, + /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88}, + /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4}, + /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d}, + /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933}, + /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea}, + /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c}, + /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16}, + /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6}, + /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36}, + /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19}, + /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4}, + /* 59 */ {5, 0.1699916162869140, 
0x2a9ce10b, 0x807c3ec3}, + /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf}, + /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe}, + /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273}, + /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805}, + /* 64 */ {5, 0.1666666666666667, 0x6, 0x0}, + /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829}, + /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841}, + /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb}, + /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b}, + /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f}, + /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84}, + /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635}, + /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6}, + /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a}, + /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c}, + /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2}, + /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e}, + /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e}, + /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d}, + /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761}, + /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e}, + /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24}, + /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154}, + /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0}, + /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d}, + /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c}, + /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b}, + /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd}, + /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7}, + /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd}, + /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68}, + /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2}, + /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316}, + /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684}, + /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97}, + /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269}, + /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd}, + /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215}, + /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e}, + /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e}, + /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23}, + /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b}, + /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94}, + /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0}, + /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e}, + /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb}, + /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2}, + /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6}, + /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6}, + /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a}, + /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e}, + /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2}, + /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf}, + /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef}, + /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a}, + /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509}, + /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3}, + /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595}, + /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db}, + /* 119 */ {4, 
0.1450364656948130, 0xbf3e7a1, 0x56aebc07}, + /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33}, + /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3}, + /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5}, + /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0}, + /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f}, + /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812}, + /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5}, + /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237}, + /* 128 */ {4, 0.1428571428571428, 0x7, 0x0}, + /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452}, + /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a}, + /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589}, + /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83}, + /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5}, + /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5}, + /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729}, + /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30}, + /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887}, + /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c}, + /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0}, + /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1}, + /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502}, + /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23}, + /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19}, + /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5}, + /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912}, + /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140}, + /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b}, + /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea}, + /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d}, + /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87}, + /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492}, + /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4}, + /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb}, + /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db}, + /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5}, + /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1}, + /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37}, + /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c}, + /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294}, + /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1}, + /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094}, + /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e}, + /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7}, + /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4}, + /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8}, + /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498}, + /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154}, + /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e}, + /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f}, + /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c}, + /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8}, + /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b}, + /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622}, + /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd}, + /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58}, + /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7}, + /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 
0x180ea5d0}, + /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd}, + /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e}, + /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68}, + /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c}, + /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2}, + /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3}, + /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316}, + /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9}, + /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684}, + /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f}, + /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97}, + /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724}, + /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269}, + /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d}, + /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd}, + /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d}, + /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215}, + /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8}, + /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e}, + /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d}, + /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e}, + /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64}, + /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23}, + /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde}, + /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b}, + /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd}, + /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94}, + /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee}, + /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0}, + /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75}, + /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e}, + /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5}, + /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb}, + /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f}, + /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2}, + /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9}, + /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6}, + /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef}, + /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6}, + /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7}, + /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a}, + /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a}, + /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e}, + /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50}, + /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2}, + /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9}, + /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf}, + /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2}, + /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef}, + /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9}, + /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a}, + /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41}, + /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509}, + /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d}, + /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3}, + /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9}, + /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595}, + /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923}, + /* 236 
*/ {4, 0.1268610025517973, 0xb8e57100, 0x627289db}, + /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b}, + /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07}, + /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b}, + /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33}, + /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c}, + /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3}, + /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24}, + /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5}, + /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3}, + /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0}, + /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45}, + /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f}, + /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c}, + /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812}, + /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33}, + /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5}, + /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749}, + /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237}, + /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423}, +}; +#endif +#if BITS_PER_MP_LIMB == 64 +const struct bases __mp_bases[256] = +{ + /* 0 */ {0, 0.0, 0, 0}, + /* 1 */ {0, 1e38, 0, 0}, + /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)}, + /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)}, + /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)}, + /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)}, + /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)}, + /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)}, + /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)}, + /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)}, + /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)}, + /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)}, + /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)}, + /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)}, + /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)}, + /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)}, + /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)}, + /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), 
CNST_LIMB(0x8c240c4aecb13bb5)}, + /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)}, + /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)}, + /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)}, + /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)}, + /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)}, + /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)}, + /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)}, + /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)}, + /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)}, + /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)}, + /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)}, + /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)}, + /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)}, + /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)}, + /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)}, + /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)}, + /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)}, + /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)}, + /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)}, + /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)}, + /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)}, + /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)}, + /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)}, + /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)}, + /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)}, + /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)}, + /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)}, + /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)}, + /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)}, + /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)}, + /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)}, + /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)}, + /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)}, + /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)}, 
+ /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)}, + /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)}, + /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)}, + /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)}, + /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)}, + /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)}, + /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)}, + /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)}, + /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)}, + /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)}, + /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)}, + /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)}, + /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)}, + /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)}, + /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)}, + /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)}, + /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)}, + /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)}, + /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)}, + /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)}, + /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)}, + /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)}, + /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)}, + /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)}, + /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)}, + /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)}, + /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)}, + /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)}, + /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)}, + /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)}, + /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)}, + /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)}, + /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)}, + /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)}, + /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)}, + /* 98 */ {9, 0.1511782109217764, 
CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)}, + /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)}, + /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)}, + /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)}, + /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)}, + /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)}, + /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)}, + /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)}, + /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)}, + /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)}, + /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)}, + /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)}, + /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)}, + /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)}, + /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)}, + /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)}, + /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)}, + /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)}, + /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)}, + /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)}, + /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)}, + /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)}, + /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)}, + /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)}, + /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)}, + /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)}, + /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)}, + /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)}, + /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)}, + /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)}, + /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)}, + /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)}, + /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)}, + /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)}, + /* 134 */ {9, 0.1415209988221527, 
CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)}, + /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)}, + /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)}, + /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)}, + /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)}, + /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)}, + /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)}, + /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)}, + /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)}, + /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)}, + /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)}, + /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)}, + /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)}, + /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)}, + /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)}, + /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)}, + /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)}, + /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)}, + /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)}, + /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)}, + /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)}, + /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)}, + /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)}, + /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)}, + /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)}, + /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)}, + /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)}, + /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)}, + /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)}, + /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)}, + /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)}, + /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)}, + /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)}, + /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)}, + /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)}, + /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)}, + /* 170 */ {8, 0.1349638598663645, 
CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)}, + /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)}, + /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)}, + /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)}, + /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)}, + /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)}, + /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)}, + /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)}, + /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)}, + /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)}, + /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)}, + /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)}, + /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)}, + /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)}, + /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)}, + /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)}, + /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)}, + /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)}, + /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)}, + /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)}, + /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)}, + /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)}, + /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)}, + /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)}, + /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)}, + /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)}, + /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)}, + /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)}, + /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)}, + /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)}, + /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)}, + /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)}, + /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)}, + /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)}, + /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)}, + /* 206 */ {8, 0.1300982152363760, 
CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)}, + /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)}, + /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)}, + /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)}, + /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)}, + /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)}, + /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)}, + /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)}, + /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)}, + /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)}, + /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)}, + /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)}, + /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)}, + /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)}, + /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)}, + /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)}, + /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)}, + /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)}, + /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)}, + /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)}, + /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)}, + /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)}, + /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)}, + /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)}, + /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)}, + /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)}, + /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)}, + /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)}, + /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)}, + /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)}, + /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)}, + /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)}, + /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)}, + /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)}, + /* 242 */ {8, 
0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)}, + /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)}, + /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)}, + /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)}, + /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)}, + /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)}, + /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)}, + /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)}, + /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)}, + /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)}, + /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)}, + /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)}, + /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)}, +}; +#endif diff --git a/rts/gmp/mpn/ns32k/add_n.s b/rts/gmp/mpn/ns32k/add_n.s new file mode 100644 index 0000000000..bd063d07d9 --- /dev/null +++ b/rts/gmp/mpn/ns32k/add_n.s @@ -0,0 +1,46 @@ +# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_add_n +___gmpn_add_n: + save [r3,r4,r5] + negd 28(sp),r3 + movd r3,r0 + lshd 2,r0 + movd 24(sp),r4 + subd r0,r4 # r4 -> to end of S2 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r2 + subd r0,r2 # r2 -> to end of RES + subd r0,r0 # cy = 0 + +Loop: movd r5[r3:d],r0 + addcd r4[r3:d],r0 + movd r0,r2[r3:d] + acbd 1,r3,Loop + + scsd r0 # r0 = cy. + restore [r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/addmul_1.s b/rts/gmp/mpn/ns32k/addmul_1.s new file mode 100644 index 0000000000..df0dcdd4af --- /dev/null +++ b/rts/gmp/mpn/ns32k/addmul_1.s @@ -0,0 +1,48 @@ +# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + addcd 0,r0 + addd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/mul_1.s b/rts/gmp/mpn/ns32k/mul_1.s new file mode 100644 index 0000000000..0a77efba29 --- /dev/null +++ b/rts/gmp/mpn/ns32k/mul_1.s @@ -0,0 +1,47 @@ +# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + movd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/sub_n.s b/rts/gmp/mpn/ns32k/sub_n.s new file mode 100644 index 0000000000..cd89f4fd3f --- /dev/null +++ b/rts/gmp/mpn/ns32k/sub_n.s @@ -0,0 +1,46 @@ +# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_sub_n +___gmpn_sub_n: + save [r3,r4,r5] + negd 28(sp),r3 + movd r3,r0 + lshd 2,r0 + movd 24(sp),r4 + subd r0,r4 # r4 -> to end of S2 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r2 + subd r0,r2 # r2 -> to end of RES + subd r0,r0 # cy = 0 + +Loop: movd r5[r3:d],r0 + subcd r4[r3:d],r0 + movd r0,r2[r3:d] + acbd 1,r3,Loop + + scsd r0 # r0 = cy. + restore [r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/submul_1.s b/rts/gmp/mpn/ns32k/submul_1.s new file mode 100644 index 0000000000..f811aedcf1 --- /dev/null +++ b/rts/gmp/mpn/ns32k/submul_1.s @@ -0,0 +1,48 @@ +# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_submul_1 +___gmpn_submul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + addcd 0,r0 + subd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README new file mode 100644 index 0000000000..8d2976dabc --- /dev/null +++ b/rts/gmp/mpn/pa64/README @@ -0,0 +1,38 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0. + +RELEVANT OPTIMIZATION ISSUES + +The PA8000 has a multi-issue pipeline with large buffers for instructions +awaiting pending results. Therefore, no latency scheduling is necessary +(and might actually be harmful). + +Two 64-bit loads can be completed per cycle. One 64-bit store can be +completed per cycle. A store cannot complete in the same cycle as a load. 
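The add and subtract routines tuned here (and the mips3 and ns32k versions earlier in this patch) all implement the same simple limb-vector operation. Purely as an illustrative sketch (not code added by this patch), __gmpn_add_n corresponds roughly to the following portable C, using the mp_limb_t and mp_size_t types from gmp.h:

    #include "gmp.h"

    /* Illustrative sketch only; not part of the GMP sources being added.
       Add the n-limb vectors s1 and s2, store the n low limbs of the sum
       at rp, and return the carry out (0 or 1).  The assembly __gmpn_add_n
       routines compute exactly this, but keep the carry in a flag or
       register and unroll the loop to reach the cache bandwidth limits
       described above.  */
    static mp_limb_t
    ref_add_n (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2,
               mp_size_t n)
    {
      mp_limb_t cy = 0;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i];
          mp_limb_t s = a + s2[i];
          mp_limb_t c1 = s < a;        /* carry out of a + s2[i] */
          mp_limb_t r = s + cy;
          mp_limb_t c2 = r < s;        /* carry out of adding the old carry */
          rp[i] = r;
          cy = c1 | c2;                /* at most one of c1, c2 can be set */
        }
      return cy;
    }

__gmpn_sub_n has the same shape with a borrow in place of the carry; the assembly versions differ only in unrolling and instruction scheduling.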
+ +STATUS + +* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at + the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb + for add/subtract. + +* The multiplication functions run at 11 cycles/limb. The cache bandwidth + allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or + better scheduling, to get closer to the cache bandwidth limit. + +* xaddmul_1.S contains a quicker method for forming the 128-bit product. It + uses somewhat fewer operations, and keeps the carry flag live across the loop + boundary. But it seems hard to make it run more than 1/4 cycle faster + than the old code. Perhaps we really ought to unroll this loop by 2x? + 2x should suffice since register latency scheduling is never needed, + but the unrolling would hide the store-load latency. Here is a sketch: + + 1. A multiply and store 64-bit products + 2. B sum 64-bit products into 128-bit product + 3. B load 64-bit products to integer registers + 4. B multiply and store 64-bit products + 5. A sum 64-bit products into 128-bit product + 6. A load 64-bit products to integer registers + 7. goto 1 + + In practice, adjacent groups (1 and 2, 2 and 3, etc.) will be interleaved + for better instruction mix. diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s new file mode 100644 index 0000000000..22ff19c184 --- /dev/null +++ b/rts/gmp/mpn/pa64/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000.
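The add_n code below is unrolled eight-fold, and the blr at the top branches into the middle of the unrolled body at a point chosen from (-size & 7), so any length is handled without a separate fix-up loop. What it computes is ordinary limb-vector addition with carry; a plain C reference of those semantics (a sketch assuming 64-bit limbs, not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_add_n: r[] = s1[] + s2[], n > 0 limbs,
   returning the final carry (0 or 1).  The assembly below does the same
   work eight limbs per iteration.  Sketch only.  */
static mp_limb_t
add_n_ref (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, long n)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t a = s1p[i], b = s2p[i];
      mp_limb_t s = a + b + cy;
      cy = (s < a) || (cy && s == a);    /* carry out of the 64-bit add */
      rp[i] = s;
    }
  return cy;
}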
+ + .level 2.0n + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S new file mode 100644 index 0000000000..b1885b432c --- /dev/null +++ b/rts/gmp/mpn/pa64/addmul_1.S @@ -0,0 +1,167 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git 
a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000000..847735b987 --- /dev/null +++ b/rts/gmp/mpn/pa64/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured in a PA8000 using the system compiler version + A.10.32.30. Presumably the PA8200 and PA8500 have the same timing + characteristic, but GCC might give somewhat different results. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 40 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 72 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 94 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 50 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s new file mode 100644 index 0000000000..994bc1c4d6 --- /dev/null +++ b/rts/gmp/mpn/pa64/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
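The lshift code below walks the operand from the most significant limb downwards, the natural direction for a left shift, which is what keeps an overlapping shift safe when the destination sits at or above the source. A plain C statement of what it computes (a sketch assuming 64-bit limbs and 0 < cnt < 64, not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_lshift: shift the n-limb operand at sp
   left by cnt bits, store the result at rp, and return the bits shifted
   out of the top limb.  Works from the high end down, like the assembly
   below.  Sketch only.  */
static mp_limb_t
lshift_ref (mp_limb_t *rp, const mp_limb_t *sp, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  mp_limb_t high = sp[n - 1];
  mp_limb_t retval = high >> tnc;        /* carry-out limb */

  for (long i = n - 1; i > 0; i--)
    {
      mp_limb_t low = sp[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}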
+ + .level 2.0n + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S new file mode 100644 index 0000000000..ab310c1264 --- /dev/null +++ b/rts/gmp/mpn/pa64/mul_1.S @@ -0,0 +1,158 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t2 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s new file mode 100644 index 0000000000..f0730e2a91 --- /dev/null +++ b/rts/gmp/mpn/pa64/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 
Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s new file mode 100644 index 0000000000..dda1f54b34 --- /dev/null +++ b/rts/gmp/mpn/pa64/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
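A note on the return value of sub_n: on PA-RISC the carry bit after a subtract is set when no borrow occurred, so the sub,db instructions below chain the borrow through the loop in complemented form, and the closing add,dc / subi 1 pair converts that flag into the 0-or-1 borrow the function returns. The intended semantics, as a plain C sketch (assuming 64-bit limbs; not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_sub_n: r[] = s1[] - s2[], n > 0 limbs,
   returning the final borrow (0 or 1).  Sketch only.  */
static mp_limb_t
sub_n_ref (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, long n)
{
  mp_limb_t brw = 0;                     /* borrow in/out, 0 or 1 */
  for (long i = 0; i < n; i++)
    {
      mp_limb_t a = s1p[i], b = s2p[i];
      rp[i] = a - b - brw;
      brw = (a < b) || (brw && a == b);  /* borrow out of the 64-bit sub */
    }
  return brw;
}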
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S new file mode 100644 index 0000000000..27666b99df --- /dev/null +++ b/rts/gmp/mpn/pa64/submul_1.S @@ -0,0 +1,170 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + 
.procend diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c new file mode 100644 index 0000000000..1c9fe084db --- /dev/null +++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c @@ -0,0 +1,111 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800LL; + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater that our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S new file mode 100644 index 0000000000..ceff2d752f --- /dev/null +++ b/rts/gmp/mpn/pa64/umul_ppmm.S @@ -0,0 +1,74 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. 
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0n + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + + ldw -180(%r30),%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + extrd,u p1,31,32,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64w/README b/rts/gmp/mpn/pa64w/README new file mode 100644 index 0000000000..cf590a7b98 --- /dev/null +++ b/rts/gmp/mpn/pa64w/README @@ -0,0 +1,2 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0 +using 64-bit pointers (2.0W). diff --git a/rts/gmp/mpn/pa64w/add_n.s b/rts/gmp/mpn/pa64w/add_n.s new file mode 100644 index 0000000000..1bb9e8fbc7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S new file mode 100644 index 0000000000..4799f90fc5 --- /dev/null +++ b/rts/gmp/mpn/pa64w/addmul_1.S @@ -0,0 +1,168 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git 
a/rts/gmp/mpn/pa64w/gmp-mparam.h b/rts/gmp/mpn/pa64w/gmp-mparam.h new file mode 100644 index 0000000000..ee5a0a3ab7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured on a PA8500 using the system compiler version + A.11.01.02. Presumably the PA8000 and PA8200 have the same timing + characteristic, but GCC might give somewhat different results.. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 83 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 134 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 56 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 26 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/rts/gmp/mpn/pa64w/lshift.s b/rts/gmp/mpn/pa64w/lshift.s new file mode 100644 index 0000000000..84f925a105 --- /dev/null +++ b/rts/gmp/mpn/pa64w/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/mul_1.S b/rts/gmp/mpn/pa64w/mul_1.S new file mode 100644 index 0000000000..48f13fbd1b --- /dev/null +++ b/rts/gmp/mpn/pa64w/mul_1.S @@ -0,0 +1,159 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t2 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64w/rshift.s b/rts/gmp/mpn/pa64w/rshift.s new file mode 100644 index 0000000000..2517cb1f87 --- /dev/null +++ b/rts/gmp/mpn/pa64w/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 Free 
Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/sub_n.s b/rts/gmp/mpn/pa64w/sub_n.s new file mode 100644 index 0000000000..ad01e24aa7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/submul_1.S b/rts/gmp/mpn/pa64w/submul_1.S new file mode 100644 index 0000000000..294f6239b2 --- /dev/null +++ b/rts/gmp/mpn/pa64w/submul_1.S @@ -0,0 +1,171 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff 
--git a/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/rts/gmp/mpn/pa64w/udiv_qrnnd.c new file mode 100644 index 0000000000..1852913000 --- /dev/null +++ b/rts/gmp/mpn/pa64w/udiv_qrnnd.c @@ -0,0 +1,117 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 +#define TWO63 9223372036854775808.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800L; +#ifndef __GNUC__ + /* Work around HP compiler bug. */ + else if (dq > TWO63) + q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L; +#endif + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater that our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/rts/gmp/mpn/pa64w/umul_ppmm.S b/rts/gmp/mpn/pa64w/umul_ppmm.S new file mode 100644 index 0000000000..d9fb92be8c --- /dev/null +++ b/rts/gmp/mpn/pa64w/umul_ppmm.S @@ -0,0 +1,72 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0w + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + std %r26,-64(%r30) + std %r25,-56(%r30) + + copy %r24,%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + copy p1,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/power/add_n.s b/rts/gmp/mpn/power/add_n.s new file mode 100644 index 0000000000..0f9f48f1cc --- /dev/null +++ b/rts/gmp/mpn/power/add_n.s @@ -0,0 +1,79 @@ +# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .globl __gmpn_add_n + .globl .__gmpn_add_n + .csect __gmpn_add_n[DS] +__gmpn_add_n: + .long .__gmpn_add_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_add_n: + andil. 10,6,1 # odd or even number of limbs? 
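# The loop below is unrolled to handle two limbs per iteration (the shifted
# count goes into CTR); the low bit computed here decides whether the sum of
# the least significant limbs must be stored separately before entering it.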
+ l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + a 7,0,8 # add least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) + st 7,4(3) + aze 3,10 # use the fact that r10 is zero... + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. +L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + ae 7,0,8 # add limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + ae 11,9,10 # add previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + ae 7,0,8 # add previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: ae 11,9,10 # add limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + lil 3,0 # load cy into ... + aze 3,3 # ... return value register + br diff --git a/rts/gmp/mpn/power/addmul_1.s b/rts/gmp/mpn/power/addmul_1.s new file mode 100644 index 0000000000..8ecc651579 --- /dev/null +++ b/rts/gmp/mpn/power/addmul_1.s @@ -0,0 +1,122 @@ +# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
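The compensation described above can be written out in portable C. The sketch below is illustrative only and not part of the GMP sources; the name umul32_via_signed is a placeholder, and the signed 64-bit product stands in for what the POWER mul/mfmq pair delivers.

#include <stdint.h>

static uint64_t
umul32_via_signed (uint32_t a, uint32_t b)
{
  /* The signed 32x32->64 product, as produced by mul/mfmq. */
  uint64_t prod = (uint64_t) ((int64_t) (int32_t) a * (int32_t) b);
  uint32_t hi = (uint32_t) (prod >> 32);
  uint32_t lo = (uint32_t) prod;

  /* Compensation: add the other operand to the high word whenever an
     operand has its most significant bit set; the 2^64 cross term
     vanishes modulo 2^64. */
  if ((int32_t) a < 0)
    hi += b;
  if ((int32_t) b < 0)
    hi += a;

  return ((uint64_t) hi << 32) | lo;
}

The assembly applies the same adjustments through the bge/cax pairs, which leave the carry flag set by the limb additions untouched.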
+ + .toc + .globl __gmpn_addmul_1 + .globl .__gmpn_addmul_1 + .csect __gmpn_addmul_1[DS] +__gmpn_addmul_1: + .long .__gmpn_addmul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_addmul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 # add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/lshift.s b/rts/gmp/mpn/power/lshift.s new file mode 100644 index 0000000000..ab71fb7727 --- /dev/null +++ b/rts/gmp/mpn/power/lshift.s @@ -0,0 +1,56 @@ +# IBM POWER __gmpn_lshift -- + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
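The routine computes a plain multi-limb left shift, working from the most significant limb downward. A minimal C sketch of the same semantics follows; it is illustrative only (not from the GMP sources), with limb_t and ref_lshift as placeholder names and 32-bit limbs as on this target.

#include <stdint.h>

typedef uint32_t limb_t;   /* POWER uses 32-bit limbs */

/* {rp,n} = {sp,n} << cnt, with 0 < cnt < 32; returns the bits shifted
   out of the most significant limb.  Limbs are processed top down. */
static limb_t
ref_lshift (limb_t *rp, const limb_t *sp, long n, unsigned cnt)
{
  limb_t ret = sp[n - 1] >> (32 - cnt);
  for (long i = n - 1; i > 0; i--)
    rp[i] = (sp[i] << cnt) | (sp[i - 1] >> (32 - cnt));
  rp[0] = sp[0] << cnt;
  return ret;
}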
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .globl __gmpn_lshift + .globl .__gmpn_lshift + .csect __gmpn_lshift[DS] +__gmpn_lshift: + .long .__gmpn_lshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_lshift: + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + lu 0,-4(4) # read most significant limb + sre 3,0,8 # compute carry out limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,-4(4) # read 2:nd most significant limb + sreq 7,0,8 # compute most significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,-4(4) # load next lower limb + stu 7,-4(9) # store previous result during read latency + sreq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,-4(9) # store 2:nd least significant limb +Lend2: sle 7,0,6 # compute least significant limb + st 7,-4(9) # store it" \ + br diff --git a/rts/gmp/mpn/power/mul_1.s b/rts/gmp/mpn/power/mul_1.s new file mode 100644 index 0000000000..4e08ade583 --- /dev/null +++ b/rts/gmp/mpn/power/mul_1.s @@ -0,0 +1,109 @@ +# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
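Leaving the signed-multiply compensation aside, the function itself is a plain multiply-and-carry loop. A C sketch of the semantics, illustrative only (limb_t and ref_mul_1 are placeholder names, 32-bit limbs assumed):

#include <stdint.h>

typedef uint32_t limb_t;

/* {rp,n} = {sp,n} * v; returns the carry-out (high) limb. */
static limb_t
ref_mul_1 (limb_t *rp, const limb_t *sp, long n, limb_t v)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) sp[i] * v + cy;  /* fits in 64 bits */
      rp[i] = (limb_t) p;
      cy = (limb_t) (p >> 32);
    }
  return cy;
}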
+ + .toc + .globl __gmpn_mul_1 + .globl .__gmpn_mul_1 + .csect __gmpn_mul_1[DS] +__gmpn_mul_1: + .long .__gmpn_mul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_mul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 # reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/rshift.s b/rts/gmp/mpn/power/rshift.s new file mode 100644 index 0000000000..65b3945f8a --- /dev/null +++ b/rts/gmp/mpn/power/rshift.s @@ -0,0 +1,54 @@ +# IBM POWER __gmpn_rshift -- + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .globl __gmpn_rshift + .globl .__gmpn_rshift + .csect __gmpn_rshift[DS] +__gmpn_rshift: + .long .__gmpn_rshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_rshift: + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + l 0,0(4) # read least significant limb + ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s + sle 3,0,8 # compute carry limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,4(4) # read 2:nd least significant limb + sleq 7,0,8 # compute least significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,4(4) # load next higher limb + stu 7,4(9) # store previous result during read latency + sleq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,4(9) # store 2:nd most significant limb +Lend2: sre 7,0,6 # compute most significant limb + st 7,4(9) # store it" \ + br diff --git a/rts/gmp/mpn/power/sdiv.s b/rts/gmp/mpn/power/sdiv.s new file mode 100644 index 0000000000..81da622fbc --- /dev/null +++ b/rts/gmp/mpn/power/sdiv.s @@ -0,0 +1,34 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + .toc + .globl __sdiv_qrnnd + .globl .__sdiv_qrnnd + .csect __sdiv_qrnnd[DS] +__sdiv_qrnnd: + .long .__sdiv_qrnnd, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__sdiv_qrnnd: + mtmq 5 + div 0,4,6 + mfmq 9 + st 9,0(3) + mr 3,0 + br diff --git a/rts/gmp/mpn/power/sub_n.s b/rts/gmp/mpn/power/sub_n.s new file mode 100644 index 0000000000..aa09cf5bc1 --- /dev/null +++ b/rts/gmp/mpn/power/sub_n.s @@ -0,0 +1,80 @@ +# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .globl __gmpn_sub_n + .globl .__gmpn_sub_n + .csect __gmpn_sub_n[DS] +__gmpn_sub_n: + .long .__gmpn_sub_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_sub_n: + andil. 10,6,1 # odd or even number of limbs? + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + sf 7,0,8 # subtract least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) + st 7,4(3) + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. 
+L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + sfe 7,0,8 # subtract limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + sfe 11,10,9 # subtract previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + sfe 7,0,8 # subtract previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: sfe 11,10,9 # subtract limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br diff --git a/rts/gmp/mpn/power/submul_1.s b/rts/gmp/mpn/power/submul_1.s new file mode 100644 index 0000000000..bc01b7c95d --- /dev/null +++ b/rts/gmp/mpn/power/submul_1.s @@ -0,0 +1,127 @@ +# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
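Independent of the POWER-specific details, the routine computes {rp,n} -= {sp,n} * v and returns the borrow as a limb. A C sketch of those semantics, illustrative only (limb_t and ref_submul_1 are placeholder names, 32-bit limbs assumed):

#include <stdint.h>

typedef uint32_t limb_t;

/* {rp,n} -= {sp,n} * v; returns the borrow-out limb. */
static limb_t
ref_submul_1 (limb_t *rp, const limb_t *sp, long n, limb_t v)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) sp[i] * v + cy;  /* fits in 64 bits */
      limb_t plo = (limb_t) p;
      cy = (limb_t) (p >> 32);
      limb_t r = rp[i];
      rp[i] = r - plo;
      cy += r < plo;   /* borrow from this limb; cy cannot wrap, since the
                          high part only reaches 2^32-1 when plo is 0 */
    }
  return cy;
}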
+ + .toc + .globl __gmpn_submul_1 + .globl .__gmpn_submul_1 + .csect __gmpn_submul_1[DS] +__gmpn_submul_1: + .long .__gmpn_submul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_submul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 # invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/umul.s b/rts/gmp/mpn/power/umul.s new file mode 100644 index 0000000000..8c77496380 --- /dev/null +++ b/rts/gmp/mpn/power/umul.s @@ -0,0 +1,38 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + .toc + .globl __umul_ppmm + .globl .__umul_ppmm + .csect __umul_ppmm[DS] +__umul_ppmm: + .long .__umul_ppmm, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__umul_ppmm: + mul 9,4,5 + srai 0,4,31 + and 0,0,5 + srai 5,5,31 + and 5,5,4 + cax 0,0,5 + mfmq 11 + st 11,0(3) + cax 3,9,0 + br diff --git a/rts/gmp/mpn/powerpc32/add_n.asm b/rts/gmp/mpn/powerpc32/add_n.asm new file mode 100644 index 0000000000..81ed04b162 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/add_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 C copy size into CTR + addic r0,r0,0 C clear cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + adde r7,r0,r8 C add limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + adde r7,r10,r9 C add limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: adde r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +.Lexit: adde r7,r10,r9 + stw r7,8(r3) + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/powerpc32/addmul_1.asm b/rts/gmp/mpn/powerpc32/addmul_1.asm new file mode 100644 index 0000000000..3ef75b1532 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/addmul_1.asm @@ -0,0 +1,124 @@ +dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpi cr0,r5,9 C more than 9 limbs? 
+ bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + addc r8,r7,r9 + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + addc r8,r7,r9 + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + addc r8,r8,r7 + stw r8,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stw r8,4(r3) + adde r9,r9,r12 + stw r9,8(r3) + adde r10,r10,r30 + stw r10,12(r3) + adde r11,r11,r31 + stwu r11,16(r3) + bdnz .LloopU + + andi. r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stwu r8,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/powerpc32/aix.m4 b/rts/gmp/mpn/powerpc32/aix.m4 new file mode 100644 index 0000000000..2bd8425817 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/aix.m4 @@ -0,0 +1,39 @@ +divert(-1) +dnl m4 macros for AIX 32-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + +define(`ASM_START', + `.toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],2 +$1: + .long .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/rts/gmp/mpn/powerpc32/gmp-mparam.h b/rts/gmp/mpn/powerpc32/gmp-mparam.h new file mode 100644 index 0000000000..b283185789 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/gmp-mparam.h @@ -0,0 +1,66 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the 604. Presumably, these should be considerably + different for the 603 and 750 that have much slower multiply + instructions. */ + +/* Generated by tuneup.c, 2000-05-26. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */ +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 228 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */ +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 262 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 52 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 86 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 23 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 7 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 53 +#endif diff --git a/rts/gmp/mpn/powerpc32/lshift.asm b/rts/gmp/mpn/powerpc32/lshift.asm new file mode 100644 index 0000000000..73a85430ab --- /dev/null +++ b/rts/gmp/mpn/powerpc32/lshift.asm @@ -0,0 +1,145 @@ +dnl PowerPC-32 mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpi cr0,r5,12 C more than 12 limbs? 
+ slwi r0,r5,2 + add r4,r4,r0 C make r4 point at end of s1 + add r7,r3,r0 C make r7 point at end of res + bgt .LBIG C branch if more than 12 limbs + + mtctr r5 C copy size into CTR + subfic r8,r6,32 + lwzu r11,-4(r4) C load first s1 limb + srw r3,r11,r8 C compute function return value + bdz .Lend1 + +.Loop: lwzu r10,-4(r4) + slw r9,r11,r6 + srw r12,r10,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdz .Lend2 + lwzu r11,-4(r4) + slw r9,r10,r6 + srw r12,r11,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdnz .Loop + +.Lend1: slw r0,r11,r6 + stw r0,-4(r7) + blr +.Lend2: slw r0,r10,r6 + stw r0,-4(r7) + blr + +.LBIG: + stmw r24,-32(r1) C save registers we are supposed to preserve + lwzu r9,-4(r4) + subfic r8,r6,32 + srw r3,r9,r8 C compute function return value + slw r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 C count for spill loop + beq .Le + mtctr r10 + lwzu r28,-4(r4) + bdz .Lxe0 + +.Loop0: slw r12,r28,r6 + srw r24,r28,r8 + lwzu r28,-4(r4) + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + bdnz .Loop0 C taken at most once! + +.Lxe0: slw r12,r28,r6 + srw r24,r28,r8 + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + +.Le: srwi r5,r5,2 C count for unrolled loop + addi r5,r5,-1 + mtctr r5 + lwz r28,-4(r4) + lwz r29,-8(r4) + lwz r30,-12(r4) + lwzu r31,-16(r4) + +.LoopU: slw r9,r28,r6 + srw r24,r28,r8 + lwz r28,-4(r4) + slw r10,r29,r6 + srw r25,r29,r8 + lwz r29,-8(r4) + slw r11,r30,r6 + srw r26,r30,r8 + lwz r30,-12(r4) + slw r12,r31,r6 + srw r27,r31,r8 + lwzu r31,-16(r4) + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + bdnz .LoopU + + slw r9,r28,r6 + srw r24,r28,r8 + slw r10,r29,r6 + srw r25,r29,r8 + slw r11,r30,r6 + srw r26,r30,r8 + slw r12,r31,r6 + srw r27,r31,r8 + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + + stw r0,-4(r7) + lmw r24,-32(r1) C restore registers + blr +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/powerpc32/mul_1.asm b/rts/gmp/mpn/powerpc32/mul_1.asm new file mode 100644 index 0000000000..ec878b54d5 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/mul_1.asm @@ -0,0 +1,86 @@ +dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604 but it runs decently even on PPC601. It +dnl has not been tested on a PPC603 since I don't have access to any such +dnl machines. 
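As a usage note: this entry point provides mpn_mul_1, one of the low-level mpn functions declared in gmp.h. The fragment below is an illustrative caller only (the limb values are arbitrary); it multiplies a number stored as a little-endian limb array by a single word and collects the carry-out.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* With 32-bit limbs (as on PowerPC-32): a = 2^96 + 5, least
     significant limb first. */
  mp_limb_t a[4] = { 5, 0, 0, 1 };
  mp_limb_t r[4];

  mp_limb_t carry = mpn_mul_1 (r, a, 4, 10);   /* {r,4} = {a,4} * 10 */
  printf ("carry limb = %lu\n", (unsigned long) carry);
  return 0;
}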
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr r5 + addi r3,r3,-4 C adjust res_ptr, it's offset before it's used + li r12,0 C clear upper product reg + addic r0,r0,0 C clear cy +C Start software pipeline + lwz r8,0(r4) + bdz .Lend3 + stmw r30,-8(r1) C save registers we are supposed to preserve + lwzu r9,4(r4) + mullw r11,r8,r6 + mulhwu r0,r8,r6 + bdz .Lend1 +C Software pipelined main loop +.Loop: lwz r8,4(r4) + mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + bdz .Lend2 + lwzu r9,8(r4) + mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + bdnz .Loop +C Finish software pipeline +.Lend1: mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + adde r31,r10,r0 + stwu r31,8(r3) + addze r3,r12 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend2: mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + adde r30,r11,r12 + stw r30,4(r3) + addze r3,r0 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend3: mullw r11,r8,r6 + stw r11,4(r3) + mulhwu r3,r8,r6 + blr +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/powerpc32/regmap.m4 b/rts/gmp/mpn/powerpc32/regmap.m4 new file mode 100644 index 0000000000..978f18902a --- /dev/null +++ b/rts/gmp/mpn/powerpc32/regmap.m4 @@ -0,0 +1,34 @@ +divert(-1) + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Map register names r0, r1, etc, to just `0', `1', etc. +dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X +forloop(i,0,31, +`define(`r'i,i)' +) + +dnl Likewise for cr0, cr1, etc. +forloop(i,0,7, +`define(`cr'i,i)' +) + +divert diff --git a/rts/gmp/mpn/powerpc32/rshift.asm b/rts/gmp/mpn/powerpc32/rshift.asm new file mode 100644 index 0000000000..a09ba04938 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/rshift.asm @@ -0,0 +1,60 @@ +dnl PowerPC-32 mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 C copy size into CTR + addi r7,r3,-4 C move adjusted res_ptr to free return reg + subfic r8,r6,32 + lwz r11,0(r4) C load first s1 limb + slw r3,r11,r8 C compute function return value + bdz .Lend1 + +.Loop: lwzu r10,4(r4) + srw r9,r11,r6 + slw r12,r10,r8 + or r9,r9,r12 + stwu r9,4(r7) + bdz .Lend2 + lwzu r11,4(r4) + srw r9,r10,r6 + slw r12,r11,r8 + or r9,r9,r12 + stwu r9,4(r7) + bdnz .Loop + +.Lend1: srw r0,r11,r6 + stw r0,4(r7) + blr + +.Lend2: srw r0,r10,r6 + stw r0,4(r7) + blr +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/powerpc32/sub_n.asm b/rts/gmp/mpn/powerpc32/sub_n.asm new file mode 100644 index 0000000000..b04b4192ef --- /dev/null +++ b/rts/gmp/mpn/powerpc32/sub_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 C copy size into CTR + addic r0,r6,-1 C set cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + subfe r7,r0,r8 C subtract limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + subfe r7,r10,r9 C subtract limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +.Lexit: subfe r7,r10,r9 + stw r7,8(r3) + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/powerpc32/submul_1.asm b/rts/gmp/mpn/powerpc32/submul_1.asm new file mode 100644 index 0000000000..a129e9f9ea --- /dev/null +++ b/rts/gmp/mpn/powerpc32/submul_1.asm @@ -0,0 +1,130 @@ +dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. 
+ +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + cmpi cr0,r5,9 C more than 9 limbs? + bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + subfc r7,r8,r7 + addc r8,r8,r7 + stw r7,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + stw r7,4(r3) + subfe r12,r9,r12 + stw r12,8(r3) + subfe r30,r10,r30 + stw r30,12(r3) + subfe r31,r11,r31 + stwu r31,16(r3) + subfe r11,r11,r11 C invert ... + addic r11,r11,1 C ... carry + bdnz .LloopU + + andi. r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + addc r8,r8,r7 + stwu r7,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/powerpc32/umul.asm b/rts/gmp/mpn/powerpc32/umul.asm new file mode 100644 index 0000000000..eeaa0a4dc8 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/umul.asm @@ -0,0 +1,32 @@ +dnl PowerPC-32 umul_ppmm -- support for longlong.h + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + mullw 0,4,5 + mulhwu 9,4,5 + stw 0,0(3) + mr 3,9 + blr +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/powerpc64/README b/rts/gmp/mpn/powerpc64/README new file mode 100644 index 0000000000..c779276917 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/README @@ -0,0 +1,36 @@ +PPC630 (aka Power3) pipeline information: + +Decoding is 4-way and issue is 8-way with some out-of-order capability. +LS1 - ld/st unit 1 +LS2 - ld/st unit 2 +FXU1 - integer unit 1, handles any simple integer instructions +FXU2 - integer unit 2, handles any simple integer instructions +FXU3 - integer unit 3, handles integer multiply and divide +FPU1 - floating-point unit 1 +FPU2 - floating-point unit 2 + +Memory: Any two memory operations can issue, but memory subsystem + can sustain just one store per cycle. +Simple integer: 2 operations (such as add, rl*) +Integer multiply: 1 operation every 9th cycle worst case; exact timing depends + on 2nd operand most significant bit position (10 bits per + cycle). Multiply unit is not pipelined, only one multiply + operation in progress is allowed. +Integer divide: ? +Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, fmadd) + Latency = 4. +Floating-point divide: + ? +Floating-point square root: + ? + +Best possible times for the main loops: +shift: 1.5 cycles limited by integer unit contention. + With 63 special loops, one for each shift count, we could + reduce the needed integer instructions to 2, which would + reduce the best possible time to 1 cycle. +add/sub: 1.5 cycles, limited by ld/st unit contention. +mul: 18 cycles (average) unless floating-point operations are used, + but that would only help for multiplies of perhaps 10 and more + limbs. +addmul/submul:Same situation as for mul. diff --git a/rts/gmp/mpn/powerpc64/add_n.asm b/rts/gmp/mpn/powerpc64/add_n.asm new file mode 100644 index 0000000000..c3325376dc --- /dev/null +++ b/rts/gmp/mpn/powerpc64/add_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and +# store sum in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
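The operation is the usual carry-propagating vector add. A C sketch of the semantics, illustrative only (limb_t and ref_add_n are placeholder names, 64-bit limbs as on this target):

#include <stdint.h>

typedef uint64_t limb_t;   /* powerpc64 uses 64-bit limbs */

/* {rp,n} = {s1,n} + {s2,n}; returns the final carry (0 or 1). */
static limb_t
ref_add_n (limb_t *rp, const limb_t *s1, const limb_t *s2, long n)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      limb_t a = s1[i], b = s2[i];
      limb_t s = a + b;
      limb_t c1 = s < a;    /* carry from a + b */
      s += cy;
      limb_t c2 = s < cy;   /* carry from adding the incoming carry */
      rp[i] = s;
      cy = c1 | c2;
    }
  return cy;
}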
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 # copy size into CTR + addic r0,r0,0 # clear cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + adde r7,r0,r8 # add limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + adde r7,r10,r9 # add limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: adde r7,r0,r8 + std r7,8(r3) # store ultimate result limb + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +.Lexit: adde r7,r10,r9 + std r7,16(r3) + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/powerpc64/addmul_1.asm b/rts/gmp/mpn/powerpc64/addmul_1.asm new file mode 100644 index 0000000000..81774482fe --- /dev/null +++ b/rts/gmp/mpn/powerpc64/addmul_1.asm @@ -0,0 +1,52 @@ +# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + addc 7,7,10 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/powerpc64/addsub_n.asm b/rts/gmp/mpn/powerpc64/addsub_n.asm new file mode 100644 index 0000000000..4ed40d71ae --- /dev/null +++ b/rts/gmp/mpn/powerpc64/addsub_n.asm @@ -0,0 +1,107 @@ +# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`asm-syntax.m4') + +define(SAVE_BORROW_RESTORE_CARRY, + `sldi $1,$1,63 + adde $1,$1,$1') +define(SAVE_CARRY_RESTORE_BORROW, + `sldi $1,$1,63 + adde $1,$1,$1') + +# 19991117 + +# This is just crafted for testing some ideas, and verifying that we can make +# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good. +# We should play a little with the schedule. No time has been spent on that. + +# To finish this, the loop warm up and cool down code needs to be written, +# and the result need to be tested. Also, the proper calling sequence should +# be used. + +# r1p r2p s1p s2p n +# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12 + +ASM_START() +PROLOGUE(mpn_addsub_n) + std r14,-64(1) + std r15,-56(1) + std r16,-48(1) + std r17,-40(1) + std r18,-32(1) + std r19,-24(1) + + srdi r7,r7,2 + mtctr r7 # copy size into CTR + addic r0,r0,0 # clear cy + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + addi r4,r4,-8 # offset res_ptr, it's updated before it's used + +.Loop: + adde r12,r8,r9 + std r12,8(r3) + adde r12,r10,r11 + std r12,16(r3) + + SAVE_CARRY_RESTORE_BORROW(r0) + + subfe r12,r8,r9 + std r12,8(r4) + ld r8,8(r5) # s1 L 1 + ld r9,8(r6) # s2 L 1 + subfe r12,r10,r11 + std r12,16(r4) + ld r10,16(r5) # s1 L 2 + ld r11,16(r6) # s2 L 2 +# pair ------------------------- + subfe r12,r14,r15 + std r12,24(r4) + subfe r12,r16,r17 + stdu r12,32(r4) + + SAVE_BORROW_RESTORE_CARRY(r0) + + adde r12,r14,r15 + std r12,24(r3) + ld r14,24(r5) # s1 L 3 + ld r15,24(r6) # s2 L 3 + adde r12,r16,r17 + stdu r12,32(r3) + ldu r16,32(r5) # s1 L 4 + ldu r17,32(r6) # s2 L 4 + bdnz .Loop + + ld r14,-64(1) + ld r15,-56(1) + ld r16,-48(1) + ld r17,-40(1) + ld r18,-32(1) + ld r19,-24(1) + blr +EPILOGUE(mpn_addsub_n) diff --git a/rts/gmp/mpn/powerpc64/aix.m4 b/rts/gmp/mpn/powerpc64/aix.m4 new file mode 100644 index 0000000000..aee9f1f97a --- /dev/null +++ b/rts/gmp/mpn/powerpc64/aix.m4 @@ -0,0 +1,40 @@ +divert(-1) +dnl m4 macros for AIX 64-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
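As its comments say, the addsub loop above is an unfinished experiment: the loop warm-up and cool-down and the calling sequence are still open. For orientation, here is a hedged sketch of what a finished routine of this kind would compute, producing the sum and the difference in a single pass; the operand order of the subtraction and the way carry and borrow are reported are choices made for this sketch, not taken from the file, and 32-bit limbs are used for brevity:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    void ref_addsub_n(limb_t *sum, limb_t *dif,
                      const limb_t *s1, const limb_t *s2, size_t n,
                      limb_t *carry_out, limb_t *borrow_out)
    {
        limb_t cy = 0, bw = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;   /* adde chain  */
            sum[i] = (limb_t)s;
            cy = (limb_t)(s >> 32);
            uint64_t d = (uint64_t)s1[i] - s2[i] - bw;   /* subfe chain */
            dif[i] = (limb_t)d;
            bw = (limb_t)((d >> 32) & 1);                /* 1 on wrap-around */
        }
        *carry_out = cy;
        *borrow_out = bw;
    }

Doing both chains in one pass is what lets the assembly reuse each loaded s1/s2 limb twice, at the cost of the sldi/adde trick for parking one of the two carry bits while the other is live.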
+ +define(`ASM_START', + `.machine "ppc64" + .toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],3 +$1: + .llong .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/rts/gmp/mpn/powerpc64/copyd.asm b/rts/gmp/mpn/powerpc64/copyd.asm new file mode 100644 index 0000000000..d06e8c25fd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/copyd.asm @@ -0,0 +1,45 @@ +# PowerPC-64 mpn_copyd -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyd) + cmpdi cr0,r5,0 + mtctr r5 + sldi r5,r5,3 + add r4,r4,r5 + add r3,r3,r5 + beq cr0,.Lend +.Loop: ldu r0,-8(r4) + stdu r0,-8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyd) diff --git a/rts/gmp/mpn/powerpc64/copyi.asm b/rts/gmp/mpn/powerpc64/copyi.asm new file mode 100644 index 0000000000..a1bedc4c5b --- /dev/null +++ b/rts/gmp/mpn/powerpc64/copyi.asm @@ -0,0 +1,44 @@ +# PowerPC-64 mpn_copyi -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyi) + cmpdi cr0,r5,0 + mtctr r5 + addi r4,r4,-8 + addi r3,r3,-8 + beq cr0,.Lend +.Loop: ldu r0,8(r4) + stdu r0,8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyi) diff --git a/rts/gmp/mpn/powerpc64/gmp-mparam.h b/rts/gmp/mpn/powerpc64/gmp-mparam.h new file mode 100644 index 0000000000..6fefb960cd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/gmp-mparam.h @@ -0,0 +1,62 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-07-16. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 10 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 57 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 16 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 28 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 216 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 14 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 6 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 163 +#endif diff --git a/rts/gmp/mpn/powerpc64/lshift.asm b/rts/gmp/mpn/powerpc64/lshift.asm new file mode 100644 index 0000000000..cef3a81fdd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/lshift.asm @@ -0,0 +1,159 @@ +# PowerPC-64 mpn_lshift -- Shift a number left. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpdi cr0,r5,20 # more than 20 limbs? 
+ sldi r0,r5,3 + add r4,r4,r0 # make r4 point at end of s1 + add r7,r3,r0 # make r7 point at end of res + bgt .LBIG # branch if more than 12 limbs + + mtctr r5 # copy size into CTR + subfic r8,r6,64 + ldu r11,-8(r4) # load first s1 limb + srd r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,-8(r4) + sld r9,r11,r6 + srd r12,r10,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdz .Lend2 + ldu r11,-8(r4) + sld r9,r10,r6 + srd r12,r11,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdnz .Loop + +.Lend1: sld r0,r11,r6 + std r0,-8(r7) + blr +.Lend2: sld r0,r10,r6 + std r0,-8(r7) + blr + +.LBIG: + std r24,-64(1) + std r25,-56(1) + std r26,-48(1) + std r27,-40(1) + std r28,-32(1) + std r29,-24(1) + std r30,-16(1) + std r31,-8(1) + ldu r9,-8(r4) + subfic r8,r6,64 + srd r3,r9,r8 # compute function return value + sld r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 # count for spill loop + beq .Le + mtctr r10 + ldu r28,-8(r4) + bdz .Lxe0 + +.Loop0: sld r12,r28,r6 + srd r24,r28,r8 + ldu r28,-8(r4) + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + bdnz .Loop0 # taken at most once! + +.Lxe0: sld r12,r28,r6 + srd r24,r28,r8 + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + +.Le: srdi r5,r5,2 # count for unrolled loop + addi r5,r5,-1 + mtctr r5 + ld r28,-8(r4) + ld r29,-16(r4) + ld r30,-24(r4) + ldu r31,-32(r4) + +.LoopU: sld r9,r28,r6 + srd r24,r28,r8 + ld r28,-8(r4) + sld r10,r29,r6 + srd r25,r29,r8 + ld r29,-16(r4) + sld r11,r30,r6 + srd r26,r30,r8 + ld r30,-24(r4) + sld r12,r31,r6 + srd r27,r31,r8 + ldu r31,-32(r4) + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + bdnz .LoopU + + sld r9,r28,r6 + srd r24,r28,r8 + sld r10,r29,r6 + srd r25,r29,r8 + sld r11,r30,r6 + srd r26,r30,r8 + sld r12,r31,r6 + srd r27,r31,r8 + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + + std r0,-8(r7) + ld r24,-64(1) + ld r25,-56(1) + ld r26,-48(1) + ld r27,-40(1) + ld r28,-32(1) + ld r29,-24(1) + ld r30,-16(1) + ld r31,-8(1) + blr +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/powerpc64/mul_1.asm b/rts/gmp/mpn/powerpc64/mul_1.asm new file mode 100644 index 0000000000..47597283ff --- /dev/null +++ b/rts/gmp/mpn/powerpc64/mul_1.asm @@ -0,0 +1,49 @@ +# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
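For reference, mpn_mul_1 below computes {rp,n} = {up,n} times a single limb and returns the high limb that does not fit. An illustrative C version (hypothetical ref_ name, 32-bit limbs in place of this file's 64-bit ones):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_mul_1(limb_t *rp, const limb_t *up, size_t n, limb_t v)
    {
        limb_t cy = 0;                               /* cy_limb in the code below */
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + cy;   /* mulld/mulhdu pair */
            rp[i] = (limb_t)p;
            cy = (limb_t)(p >> 32);
        }
        return cy;                                   /* high limb of the full product */
    }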
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/powerpc64/rshift.asm b/rts/gmp/mpn/powerpc64/rshift.asm new file mode 100644 index 0000000000..88272c7fa9 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/rshift.asm @@ -0,0 +1,60 @@ +# PowerPC-64 mpn_rshift -- Shift a number right. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 # copy size into CTR + addi r7,r3,-8 # move adjusted res_ptr to free return reg + subfic r8,r6,64 + ld r11,0(r4) # load first s1 limb + sld r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,8(r4) + srd r9,r11,r6 + sld r12,r10,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdz .Lend2 + ldu r11,8(r4) + srd r9,r10,r6 + sld r12,r11,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdnz .Loop + +.Lend1: srd r0,r11,r6 + std r0,8(r7) + blr + +.Lend2: srd r0,r10,r6 + std r0,8(r7) + blr +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/powerpc64/sub_n.asm b/rts/gmp/mpn/powerpc64/sub_n.asm new file mode 100644 index 0000000000..4de3de69c7 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/sub_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +# and store difference in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc.b + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
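mpn_sub_n below mirrors add_n with a borrow instead of a carry; a reference sketch (illustrative ref_ name, 32-bit limbs for brevity):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_sub_n(limb_t *rp, const limb_t *s1, const limb_t *s2, size_t n)
    {
        limb_t bw = 0;                          /* borrow between limbs, 0 or 1 */
        for (size_t i = 0; i < n; i++) {
            limb_t a = s1[i], b = s2[i];
            rp[i] = a - b - bw;
            bw = (a < b) || (a == b && bw);     /* borrow out of this limb */
        }
        return bw;                              /* 1 when {s1,n} < {s2,n} */
    }

PowerPC keeps its carry bit as the complement of a borrow, which is why the routine ends with the subfe/subfic pair that the comments describe as loading !cy into the return register.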
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 # copy size into CTR + addic r0,r6,-1 # set cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + subfe r7,r0,r8 # subtract limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + subfe r7,r10,r9 # subtract limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + std r7,8(r3) # store ultimate result limb + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +.Lexit: subfe r7,r10,r9 + std r7,16(r3) + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/powerpc64/submul_1.asm b/rts/gmp/mpn/powerpc64/submul_1.asm new file mode 100644 index 0000000000..17f6369a38 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/submul_1.asm @@ -0,0 +1,54 @@ +# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + subfc 7,7,10 + stdu 7,8(3) + subfe 11,11,11 # invert ... + addic 11,11,1 # ... carry + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/pyr/add_n.s b/rts/gmp/mpn/pyr/add_n.s new file mode 100644 index 0000000000..e1fc535846 --- /dev/null +++ b/rts/gmp/mpn/pyr/add_n.s @@ -0,0 +1,76 @@ +# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
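The PowerPC-64 mpn_submul_1 above is the subtractive partner of addmul_1: it multiplies {up,n} by a limb, subtracts the product from {rp,n}, and returns the limb that would have to be subtracted from the next higher position. A reference sketch (hypothetical ref_ name, 32-bit limbs in place of the 64-bit limbs used above):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_submul_1(limb_t *rp, const limb_t *up, size_t n, limb_t v)
    {
        limb_t cy = 0;                               /* carry limb, as in the asm */
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + cy;   /* mulld/mulhdu */
            limb_t lo = (limb_t)p;
            cy = (limb_t)(p >> 32);
            limb_t r = rp[i];
            rp[i] = r - lo;
            cy += (r < lo);                          /* fold the borrow into the carry limb */
        }
        return cy;
    }

The subfe/addic pair in the loop above is the flag-level version of this borrow handling.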
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_add_n +___gmpn_add_n: + movw $-1,tr0 # representation for carry clear + + movw pr3,tr2 + andw $3,tr2 + beq Lend0 + subw tr2,pr3 + +Loop0: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + addwc (pr2),tr1 + movw tr1,(pr0) + + subwb tr0,tr0 + addw $4,pr0 + addw $4,pr1 + addw $4,pr2 + addw $-1,tr2 + bne Loop0 + + mtstw pr3,pr3 + beq Lend +Lend0: +Loop: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + addwc (pr2),tr1 + movw tr1,(pr0) + + movw 4(pr1),tr1 + addwc 4(pr2),tr1 + movw tr1,4(pr0) + + movw 8(pr1),tr1 + addwc 8(pr2),tr1 + movw tr1,8(pr0) + + movw 12(pr1),tr1 + addwc 12(pr2),tr1 + movw tr1,12(pr0) + + subwb tr0,tr0 + addw $16,pr0 + addw $16,pr1 + addw $16,pr2 + addw $-4,pr3 + bne Loop +Lend: + mnegw tr0,pr0 + ret diff --git a/rts/gmp/mpn/pyr/addmul_1.s b/rts/gmp/mpn/pyr/addmul_1.s new file mode 100644 index 0000000000..65c3f8f008 --- /dev/null +++ b/rts/gmp/mpn/pyr/addmul_1.s @@ -0,0 +1,45 @@ +# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + mova (pr0)[pr2*4],pr0 + mova (pr1)[pr2*4],pr1 + mnegw pr2,pr2 + movw $0,tr3 + +Loop: movw (pr1)[pr2*4],tr1 + uemul pr3,tr0 + addw tr3,tr1 + movw $0,tr3 + addwc tr0,tr3 + movw (pr0)[pr2*0x4],tr0 + addw tr0,tr1 + addwc $0,tr3 + movw tr1,(pr0)[pr2*4] + addw $1,pr2 + bne Loop + + movw tr3,pr0 + ret diff --git a/rts/gmp/mpn/pyr/mul_1.s b/rts/gmp/mpn/pyr/mul_1.s new file mode 100644 index 0000000000..1272297c42 --- /dev/null +++ b/rts/gmp/mpn/pyr/mul_1.s @@ -0,0 +1,42 @@ +# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
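The Pyramid mpn_add_n above (and the matching sub_n later in this directory) first disposes of n mod 4 limbs one at a time and then runs a loop unrolled by four, parking the carry in tr0 between iterations because the address and counter updates would otherwise clobber it. The same structure in plain C, as an illustrative sketch with 32-bit limbs:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_add_n_unrolled(limb_t *rp, const limb_t *s1, const limb_t *s2, size_t n)
    {
        limb_t cy = 0;                               /* lives in tr0 in the asm */
        size_t r = n % 4;
        for (size_t i = 0; i < r; i++) {             /* Loop0: leftover limbs */
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;
            rp[i] = (limb_t)s;
            cy = (limb_t)(s >> 32);
        }
        for (size_t i = r; i < n; i += 4)            /* Loop: four limbs per pass */
            for (size_t j = 0; j < 4; j++) {         /* written out 4x in the asm */
                uint64_t s = (uint64_t)s1[i + j] + s2[i + j] + cy;
                rp[i + j] = (limb_t)s;
                cy = (limb_t)(s >> 32);
            }
        return cy;
    }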
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + mova (pr0)[pr2*4],pr0 + mova (pr1)[pr2*4],pr1 + mnegw pr2,pr2 + movw $0,tr3 + +Loop: movw (pr1)[pr2*4],tr1 + uemul pr3,tr0 + addw tr3,tr1 + movw $0,tr3 + addwc tr0,tr3 + movw tr1,(pr0)[pr2*4] + addw $1,pr2 + bne Loop + + movw tr3,pr0 + ret diff --git a/rts/gmp/mpn/pyr/sub_n.s b/rts/gmp/mpn/pyr/sub_n.s new file mode 100644 index 0000000000..1fd2eb0f17 --- /dev/null +++ b/rts/gmp/mpn/pyr/sub_n.s @@ -0,0 +1,76 @@ +# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_sub_n +___gmpn_sub_n: + movw $-1,tr0 # representation for carry clear + + movw pr3,tr2 + andw $3,tr2 + beq Lend0 + subw tr2,pr3 + +Loop0: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + subwb (pr2),tr1 + movw tr1,(pr0) + + subwb tr0,tr0 + addw $4,pr0 + addw $4,pr1 + addw $4,pr2 + addw $-1,tr2 + bne Loop0 + + mtstw pr3,pr3 + beq Lend +Lend0: +Loop: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + subwb (pr2),tr1 + movw tr1,(pr0) + + movw 4(pr1),tr1 + subwb 4(pr2),tr1 + movw tr1,4(pr0) + + movw 8(pr1),tr1 + subwb 8(pr2),tr1 + movw tr1,8(pr0) + + movw 12(pr1),tr1 + subwb 12(pr2),tr1 + movw tr1,12(pr0) + + subwb tr0,tr0 + addw $16,pr0 + addw $16,pr1 + addw $16,pr2 + addw $-4,pr3 + bne Loop +Lend: + mnegw tr0,pr0 + ret diff --git a/rts/gmp/mpn/sh/add_n.s b/rts/gmp/mpn/sh/add_n.s new file mode 100644 index 0000000000..df388b31a3 --- /dev/null +++ b/rts/gmp/mpn/sh/add_n.s @@ -0,0 +1,47 @@ +! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +! sum in a third limb vector. + +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! 
The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! s2_ptr r6 +! size r7 + + .text + .align 2 + .global ___gmpn_add_n +___gmpn_add_n: + mov #0,r3 ! clear cy save reg + +Loop: mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 ! restore cy + addc r2,r1 + movt r3 ! save cy + mov.l r1,@r4 + dt r7 + bf.s Loop + add #4,r4 + + rts + mov r3,r0 ! return carry-out from most sign. limb diff --git a/rts/gmp/mpn/sh/sh2/addmul_1.s b/rts/gmp/mpn/sh/sh2/addmul_1.s new file mode 100644 index 0000000000..f34a7f0503 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/addmul_1.s @@ -0,0 +1,53 @@ +! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +! the result to a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_addmul_1 +___gmpn_addmul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 ! lo_prod += old cy_limb + sts mach,r2 ! new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 ! cy_limb += T, T = 0 + addc r3,r1 + addc r0,r2 ! cy_limb += T, T = 0 + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sh2/mul_1.s b/rts/gmp/mpn/sh/sh2/mul_1.s new file mode 100644 index 0000000000..2a117a3175 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/mul_1.s @@ -0,0 +1,50 @@ +! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! 
You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_mul_1 +___gmpn_mul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 + sts mach,r2 + addc r0,r2 ! propagate carry to cy_limb (dt clobbers T) + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sh2/submul_1.s b/rts/gmp/mpn/sh/sh2/submul_1.s new file mode 100644 index 0000000000..eb9a27dde3 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/submul_1.s @@ -0,0 +1,53 @@ +! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +! the result from a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_submul_1 +___gmpn_submul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 ! lo_prod += old cy_limb + sts mach,r2 ! new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 ! cy_limb += T, T = 0 + subc r3,r1 + addc r0,r2 ! cy_limb += T, T = 0 + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sub_n.s b/rts/gmp/mpn/sh/sub_n.s new file mode 100644 index 0000000000..5f818c95a8 --- /dev/null +++ b/rts/gmp/mpn/sh/sub_n.s @@ -0,0 +1,47 @@ +! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +! difference in a third limb vector. + +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! 
MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! s2_ptr r6 +! size r7 + + .text + .align 2 + .global ___gmpn_sub_n +___gmpn_sub_n: + mov #0,r3 ! clear cy save reg + +Loop: mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 ! restore cy + subc r2,r1 + movt r3 ! save cy + mov.l r1,@r4 + dt r7 + bf.s Loop + add #4,r4 + + rts + mov r3,r0 ! return carry-out from most sign. limb diff --git a/rts/gmp/mpn/sparc32/README b/rts/gmp/mpn/sparc32/README new file mode 100644 index 0000000000..7c19df7bc4 --- /dev/null +++ b/rts/gmp/mpn/sparc32/README @@ -0,0 +1,36 @@ +This directory contains mpn functions for various SPARC chips. Code that +runs only on version 8 SPARC implementations, is in the v8 subdirectory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On most early SPARC implementations, the ST instructions takes multiple +cycles, while a STD takes just a single cycle more than an ST. For the CPUs +in SPARCstation I and II, the times are 3 and 4 cycles, respectively. +Therefore, combining two ST instrucitons into a STD when possible is a +significant optimiation. + +Later SPARC implementations have single cycle ST. + +For SuperSPARC, we can perform just one memory instruction per cycle, even +if up to two integer instructions can be executed in its pipeline. For +programs that perform so many memory operations that there are not enough +non-memory operations to issue in parallel with all memory operations, using +LDD and STD when possible helps. + +STATUS + +1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5 + cycles/limb asymptotically. We could optimize speed for special counts + by using ADDXCC. + +2. On a SuperSPARC, mpn_add_n and mpn_sub_n runs at 2.5 cycles/limb, or 2 + cycles/limb asymptotically. + +3. mpn_mul_1 runs at what is believed to be optimal speed. + +4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a + cycle by avoiding one of the add instrucitons. See a29k/addmul_1. + +The speed of the code for other SPARC implementations is uncertain. diff --git a/rts/gmp/mpn/sparc32/add_n.asm b/rts/gmp/mpn/sparc32/add_n.asm new file mode 100644 index 0000000000..5f1d00c0e0 --- /dev/null +++ b/rts/gmp/mpn/sparc32/add_n.asm @@ -0,0 +1,236 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
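The SPARC v7/v8 mpn_add_n that follows picks one of three code paths from the word alignment of its pointers, so that the inner loops can use the ldd/std double-word accesses the README above recommends. A sketch of that dispatch (illustrative helper name; the real decision is the xor/andcc-with-4 at the top of the routine):

    #include <stdint.h>

    static int same_word_alignment(const void *a, const void *b)
    {
        return (((uintptr_t)a ^ (uintptr_t)b) & 4) == 0;
    }

    /* if (same_word_alignment(res_ptr, s2_ptr))       V1: peel one limb if res_ptr is
                                                       odd-word aligned, then pair the
                                                       s2 loads and the stores as ldd/std
       else if (same_word_alignment(res_ptr, s1_ptr))  V1b: swap s1 and s2, then as V1
       else                                            V2: s1 and s2 share alignment, so
                                                       pair the two source streams       */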
+ + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_add_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** +L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + addxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + addxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + addxcc %g4,%g2,%o4 + addxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + mov s2_ptr,%g1 + mov s1_ptr,s2_ptr + b L(0) + mov %g1,s1_ptr + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/sparc32/addmul_1.asm b/rts/gmp/mpn/sparc32/addmul_1.asm new file mode 100644 index 0000000000..80c94e4251 --- /dev/null +++ b/rts/gmp/mpn/sparc32/addmul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
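	C  SPARC before v8 has no integer multiply instruction, so this generic
	C  code builds the product with MULSCC (multiply step), one bit per step,
	C  with one of the operands held in the Y register and consumed a bit at
	C  a time.  MULSCC forms a signed product, so the sra/and pair below
	C  builds a correction term that is added to the high word to recover
	C  the unsigned result.  The first path is taken when s2_limb fits in 12
	C  bits and needs only 12 multiply steps plus a closing step; the general
	C  path runs the full 32 steps.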
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + addcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + addcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + addcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + addcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/lshift.asm b/rts/gmp/mpn/sparc32/lshift.asm new file mode 100644 index 0000000000..529733ac2d --- /dev/null +++ b/rts/gmp/mpn/sparc32/lshift.asm @@ -0,0 +1,97 @@ +dnl SPARC mpn_lshift -- Shift a number left. +dnl + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
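Reference semantics for the mpn_lshift below, as an illustrative C sketch (hypothetical ref_ name) with this file's 32-bit limbs: shift {up,n} left by cnt bits (0 < cnt < 32), store the result at rp, and return the bits shifted out of the most significant limb. Working from the top limb downwards, as the assembly does, makes the operation safe when rp >= up, including shifting in place.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_lshift(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t ret = up[n - 1] >> (32 - cnt);    /* function result, as in the asm */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
        rp[0] = up[0] << cnt;
        return ret;
    }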
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_lshift) + sll %o2,2,%g1 + add %o1,%g1,%o1 C make %o1 point at end of src + ld [%o1-4],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o0,%g1,%o0 C make %o0 point at end of res + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + srl %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1-8],%g3 + add %o0,-4,%o0 + add %o1,-4,%o1 + addcc %g4,-1,%g4 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0+0] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1-8],%g3 + add %o0,-16,%o0 + addcc %o2,-4,%o2 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + + ld [%o1-12],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+12] + srl %g2,%o5,%g1 + + ld [%o1-16],%g3 + sll %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0+8] + srl %g3,%o5,%g1 + + ld [%o1-20],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+4] + srl %g2,%o5,%g1 + + add %o1,-16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0+0] + +L(end): sll %g2,%o3,%g2 + st %g2,[%o0-4] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/sparc32/mul_1.asm b/rts/gmp/mpn/sparc32/mul_1.asm new file mode 100644 index 0000000000..e5fedeabaa --- /dev/null +++ b/rts/gmp/mpn/sparc32/mul_1.asm @@ -0,0 +1,137 @@ +dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop0) + ld [%o1+%o2],%o5 + + retl + st %g1,[%o4+%o2] + + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop) + ld [%o1+%o2],%o5 + + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/rshift.asm b/rts/gmp/mpn/sparc32/rshift.asm new file mode 100644 index 0000000000..9187dbaa6f --- /dev/null +++ b/rts/gmp/mpn/sparc32/rshift.asm @@ -0,0 +1,93 @@ +dnl SPARC mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
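mpn_rshift below is the mirror image of lshift above: shift {up,n} right by cnt bits (0 < cnt < 32), store the result at rp, and return the bits shifted out of the least significant limb, left-justified. Processing from the low limb upwards makes it safe when rp <= up. An illustrative sketch (hypothetical ref_ name, 32-bit limbs):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_rshift(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t ret = up[0] << (32 - cnt);        /* function result, as in the asm */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (32 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;
        return ret;
    }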
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_rshift) + ld [%o1],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + sll %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1+4],%g3 + add %o0,4,%o0 + add %o1,4,%o1 + addcc %g4,-1,%g4 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0-4] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1+4],%g3 + add %o0,16,%o0 + addcc %o2,-4,%o2 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + + ld [%o1+8],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-16] + sll %g2,%o5,%g1 + + ld [%o1+12],%g3 + srl %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0-12] + sll %g3,%o5,%g1 + + ld [%o1+16],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-8] + sll %g2,%o5,%g1 + + add %o1,16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0-4] + +L(end): srl %g2,%o3,%g2 + st %g2,[%o0-0] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/sparc32/sub_n.asm b/rts/gmp/mpn/sparc32/sub_n.asm new file mode 100644 index 0000000000..071909a1b6 --- /dev/null +++ b/rts/gmp/mpn/sparc32/sub_n.asm @@ -0,0 +1,326 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_sub_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + subxcc %g4,%g2,%o4 + subxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1b) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s1_ptr + ld [s2_ptr],%g4 + add s2_ptr,4,s2_ptr + ld [s1_ptr],%g2 + add s1_ptr,4,s1_ptr + add n,-1,n + subcc %g2,%g4,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1b): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s2_ptr+0],%g4 + addcc n,-10,n + ld [s2_ptr+4],%g1 + ldd [s1_ptr+0],%g2 + blt L(fin1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g2,%g4,%o4 + ld [s2_ptr+16],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+20],%g1 + ldd [s1_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g2,%g4,%o4 + ld [s2_ptr+24],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+28],%g1 + ldd [s1_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g2,%g4,%o4 + ld [s2_ptr+32],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+36],%g1 + ldd [s1_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1b) + subcc %g0,%o4,%g0 C restore cy + +L(fin1b): + addcc n,8-2,n + blt L(end1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1b) + subcc %g0,%o4,%g0 C restore cy +L(end1b): + subxcc %g2,%g4,%o4 + subxcc %g3,%g1,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1b) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s2_ptr+8],%g4 + ld [s1_ptr+8],%g2 + subxcc %g2,%g4,%o4 + st %o4,[res_ptr+8] + +L(ret1b): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/sparc32/submul_1.asm b/rts/gmp/mpn/sparc32/submul_1.asm new file mode 100644 index 0000000000..12abd844ce --- /dev/null +++ b/rts/gmp/mpn/sparc32/submul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
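The pointer-at-the-end / negative-index idiom described in the comment above is the same control structure as the C loop below: both pointers are advanced past their blocks and a negative index (byte-scaled in the assembly, element-scaled in C) is counted up towards zero, so the loop branch falls directly out of the index update. As a rough sketch of the whole routine — mpn_submul_1 subtracts {s1_ptr,size} * s2_limb from {res_ptr,size} in place and returns the high limb (borrow) left over — with 32-bit limbs and illustrative names, not the GMP source:

/* Rough C analogue, for orientation only. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_submul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  limb_t cy = 0;
  rp += size;                          /* point one past the ends ...    */
  up += size;
  ptrdiff_t j = -(ptrdiff_t) size;     /* ... and count a negative index */
  for (; j != 0; j++)                  /*     up towards zero            */
    {
      uint64_t p = (uint64_t) up[j] * v + cy;
      limb_t lo = (limb_t) p;
      cy = (limb_t) (p >> 32) + (rp[j] < lo);   /* borrow out */
      rp[j] = rp[j] - lo;
    }
  return cy;
}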
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + subcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + subcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + subcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + subcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc32/udiv_fp.asm b/rts/gmp/mpn/sparc32/udiv_fp.asm new file mode 100644 index 0000000000..e340e147d2 --- /dev/null +++ b/rts/gmp/mpn/sparc32/udiv_fp.asm @@ -0,0 +1,158 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs with a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
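The entry point below, mpn_udiv_qrnnd(rem_ptr, n1, n0, d), divides the two-limb value n1*2^32 + n0 by d under the usual udiv_qrnnd precondition n1 < d (so the quotient fits in one limb), stores the remainder through rem_ptr and returns the quotient; this version forms a quotient estimate in double-precision floating point and then adjusts it by at most one. As a statement of the semantics only (not of the method), using the 64-bit arithmetic this file exists to avoid:

/* Semantics-only sketch of udiv_qrnnd; assumes n1 < d.  Illustrative. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_udiv_qrnnd (limb_t *rem_ptr, limb_t n1, limb_t n0, limb_t d)
{
  uint64_t n = ((uint64_t) n1 << 32) | n0;
  *rem_ptr = (limb_t) (n % d);
  return (limb_t) (n / d);
}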
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + wr %g0,%i4,%y + sra %i3,31,%g2 + and %i4,%g2,%g2 + andcc %g0,0,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,0,%g1 + add %g1,%g2,%i0 + rd %y,%g3 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/udiv_nfp.asm b/rts/gmp/mpn/sparc32/udiv_nfp.asm new file mode 100644 index 0000000000..ae19f4c6e9 --- /dev/null +++ b/rts/gmp/mpn/sparc32/udiv_nfp.asm @@ -0,0 +1,193 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs without a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
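This FPU-less variant computes the same quotient and remainder with a fully unrolled shift-and-subtract (restoring) loop, plus a separate path when the divisor has its top bit set. The core technique, sketched in portable C (illustrative only; it covers the normal-divisor case and assumes n1 < d):

/* Restoring shift-and-subtract division, the loop unrolled below.
   Sketch only: ignores the special large-divisor path, assumes n1 < d. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_udiv_qrnnd_shiftsub (limb_t *rem_ptr, limb_t n1, limb_t n0, limb_t d)
{
  limb_t q = 0;
  for (int i = 0; i < 32; i++)
    {
      limb_t out = n1 >> 31;          /* bit shifted out of n1        */
      n1 = (n1 << 1) | (n0 >> 31);    /* shift the dividend left once */
      n0 <<= 1;
      q <<= 1;
      if (out || n1 >= d)             /* subtract whenever it fits    */
        {
          n1 -= d;
          q |= 1;
        }
    }
  *rem_ptr = n1;
  return q;
}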
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + tst %o3 + bneg L(largedivisor) + mov 8,%g1 + + b L(p1) + addxcc %o2,%o2,%o2 + +L(plop): + bcc L(n1) + addxcc %o2,%o2,%o2 +L(p1): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n2) + addxcc %o2,%o2,%o2 +L(p2): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n3) + addxcc %o2,%o2,%o2 +L(p3): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n4) + addxcc %o2,%o2,%o2 +L(p4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(plop) + subcc %o1,%o3,%o4 + bcc L(n5) + addxcc %o2,%o2,%o2 +L(p5): st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(nlop): + bcc L(p1) + addxcc %o2,%o2,%o2 +L(n1): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p2) + addxcc %o2,%o2,%o2 +L(n2): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p3) + addxcc %o2,%o2,%o2 +L(n3): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p4) + addxcc %o2,%o2,%o2 +L(n4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(nlop) + subcc %o4,%o3,%o1 + bcc L(p5) + addxcc %o2,%o2,%o2 +L(n5): st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(largedivisor): + and %o2,1,%o5 C %o5 = n0 & 1 + + srl %o2,1,%o2 + sll %o1,31,%g2 + or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1) + srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1) + + and %o3,1,%g2 + srl %o3,1,%g3 C %g3 = floor(d / 2) + add %g3,%g2,%g3 C %g3 = ceil(d / 2) + + b L(Lp1) + addxcc %o2,%o2,%o2 + +L(Lplop): + bcc L(Ln1) + addxcc %o2,%o2,%o2 +L(Lp1): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln2) + addxcc %o2,%o2,%o2 +L(Lp2): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln3) + addxcc %o2,%o2,%o2 +L(Lp3): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln4) + addxcc %o2,%o2,%o2 +L(Lp4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(Lplop) + subcc %o1,%g3,%o4 + bcc L(Ln5) + addxcc %o2,%o2,%o2 +L(Lp5): add %o1,%o1,%o1 C << 1 + tst %g2 + bne L(oddp) + add %o5,%o1,%o1 + st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(Lnlop): + bcc L(Lp1) + addxcc %o2,%o2,%o2 +L(Ln1): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp2) + addxcc %o2,%o2,%o2 +L(Ln2): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp3) + addxcc %o2,%o2,%o2 +L(Ln3): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp4) + addxcc %o2,%o2,%o2 +L(Ln4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(Lnlop) + subcc %o4,%g3,%o1 + bcc L(Lp5) + addxcc %o2,%o2,%o2 +L(Ln5): add %o4,%o4,%o4 C << 1 + tst %g2 + bne L(oddn) + add %o5,%o4,%o4 + st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(oddp): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o1 + addcc %o1,%o2,%o1 + bcc L(Lp6) + addx %o2,0,%o2 + sub %o1,%o3,%o1 +L(Lp6): subcc %o1,%o3,%g0 + bcs L(Lp7) + subx %o2,-1,%o2 + sub %o1,%o3,%o1 +L(Lp7): st %o1,[%o0] + retl + mov %o2,%o0 + +L(oddn): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o4 + addcc %o4,%o2,%o4 + bcc L(Ln6) + addx %o2,0,%o2 + sub %o4,%o3,%o4 +L(Ln6): subcc %o4,%o3,%g0 + bcs L(Ln7) + subx %o2,-1,%o2 + sub %o4,%o3,%o4 +L(Ln7): st %o4,[%o0] + retl + mov %o2,%o0 +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/umul.asm b/rts/gmp/mpn/sparc32/umul.asm new file mode 100644 index 0000000000..efa56851d6 --- /dev/null +++ b/rts/gmp/mpn/sparc32/umul.asm @@ -0,0 +1,68 @@ +dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + wr %g0,%o1,%y + sra %o2,31,%g2 C Don't move this insn + and %o1,%g2,%g2 C Don't move this insn + andcc %g0,0,%g1 C Don't move this insn + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,0,%g1 + rd %y,%g3 + st %g3,[%o0] + retl + add %g1,%g2,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/rts/gmp/mpn/sparc32/v8/addmul_1.asm new file mode 100644 index 0000000000..da44644b51 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/addmul_1.asm @@ -0,0 +1,122 @@ +dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + orcc %g0,%g0,%g2 + ld [%o1+0],%o4 C 1 + + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 + nop +L(1): +L(L00): add %o0,-4,%o0 + b L(loop00) C 4, 8, 12, ... 
+ add %o1,-4,%o1 + nop +L(L01): b L(loop01) C 1, 5, 9, ... + nop + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + b L(loop10) + add %o1,4,%o1 + nop +L(L11): add %o0,-8,%o0 C 3, 7, 11, ... + b L(loop11) + add %o1,-8,%o1 + nop + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + rd %y,%g2 C 1 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + ld [%o0+4],%g1 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + rd %y,%g2 C 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + rd %y,%g2 C 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +L(loop01): + addcc %o2,-4,%o2 + bg L(loop) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 4 + addx %g0,%g2,%o0 + + retl + nop +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/v8/mul_1.asm b/rts/gmp/mpn/sparc32/v8/mul_1.asm new file mode 100644 index 0000000000..801247553a --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/mul_1.asm @@ -0,0 +1,103 @@ +dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and +dnl store the product in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 + ld [%o1+0],%o4 C 1 +L(1): +L(L00): add %o0,-4,%o0 + add %o1,-4,%o1 + b L(loop00) C 4, 8, 12, ... + orcc %g0,%g0,%g2 +L(L01): b L(loop01) C 1, 5, 9, ... + orcc %g0,%g0,%g2 + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + add %o1,4,%o1 + b L(loop10) + orcc %g0,%g0,%g2 + nop +L(L11): add %o0,-8,%o0 C 3, 7, 11, ... 
+ add %o1,-8,%o1 + b L(loop11) + orcc %g0,%g0,%g2 + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + st %g3,[%o0+0] C 1 + rd %y,%g2 C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + st %g3,[%o0+4] C 2 + rd %y,%g2 C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + add %o1,16,%o1 + st %g3,[%o0+8] C 3 + rd %y,%g2 C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 +L(loop01): + addcc %o2,-4,%o2 + bg L(loop) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + st %g3,[%o0+0] C 4 + rd %y,%g2 C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/v8/submul_1.asm b/rts/gmp/mpn/sparc32/v8/submul_1.asm new file mode 100644 index 0000000000..9ed132f4c1 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/submul_1.asm @@ -0,0 +1,58 @@ +dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + sub %g0,%o2,%o2 C negate ... + sll %o2,2,%o2 C ... and scale size + sub %o1,%o2,%o1 C o1 is offset s1_ptr + sub %o0,%o2,%g1 C g1 is offset res_ptr + + mov 0,%o0 C clear cy_limb + +L(loop): + ld [%o1+%o2],%o4 + ld [%g1+%o2],%g2 + umul %o4,%o3,%o5 + rd %y,%g3 + addcc %o5,%o0,%o5 + addx %g3,0,%o0 + subcc %g2,%o5,%g2 + addx %o0,0,%o0 + st %g2,[%g1+%o2] + + addcc %o2,4,%o2 + bne L(loop) + nop + + retl + nop +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm new file mode 100644 index 0000000000..0d5e8d415d --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm @@ -0,0 +1,122 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + umul %i3,%i4,%g3 + rd %y,%i0 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/v8/umul.asm b/rts/gmp/mpn/sparc32/v8/umul.asm new file mode 100644 index 0000000000..ae8f692a0a --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/umul.asm @@ -0,0 +1,31 @@ +dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
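mpn_umul_ppmm below is the longlong.h support routine for non-gcc compilers: it multiplies two 32-bit limbs, stores the low half of the product through the first argument and returns the high half, which v8 can do with a single umul plus a read of %y. A portable C sketch of what it computes (names illustrative):

/* Sketch of what the one-umul routine below computes. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_umul_ppmm (limb_t *lowptr, limb_t u, limb_t v)
{
  uint64_t p = (uint64_t) u * v;
  *lowptr = (limb_t) p;            /* low 32 bits through the pointer   */
  return (limb_t) (p >> 32);       /* high 32 bits as the return value  */
}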
+ + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + umul %o1,%o2,%g2 + st %g2,[%o0] + retl + rd %y,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/sparc32/v9/README b/rts/gmp/mpn/sparc32/v9/README new file mode 100644 index 0000000000..9b39713271 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/README @@ -0,0 +1,4 @@ +Code for SPARC processors implementing version 9 of the SPARC architecture. +This code is for systems that don't preserve the full 64-bit contents of +integer registers at context switch. For other systems (such as Solaris 7 or +later), use the code in ../../sparc64. diff --git a/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/rts/gmp/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000000..c1762cc41f --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,288 @@ +dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
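The v9 32-bit code below does its multiplies in the floating-point unit: s2_limb is split into a low 16-bit half (converted into %f6) and a high half (%f8), each source limb is converted with fxtod, and the two exact double-precision partial products p16 and p0 are recombined in integer registers as (p16 << 16) + p0. What the routine computes, and the splitting identity it relies on, as a C sketch (illustrative only, 32-bit limbs assumed):

/* Sketch of mpn_addmul_1 semantics and of the 16-bit split used below:
   u*v = (u*v_hi << 16) + u*v_lo, and both partial products are under
   2^48, so each is exact in a double.  Not the GMP source. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_addmul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  limb_t v_lo = v & 0xffff;            /* the half converted into %f6 */
  limb_t v_hi = v >> 16;               /* the half converted into %f8 */
  uint64_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t p0  = (uint64_t) up[i] * v_lo;   /* fits in 48 bits */
      uint64_t p16 = (uint64_t) up[i] * v_hi;   /* fits in 48 bits */
      uint64_t p = (p16 << 16) + p0 + rp[i] + cy;
      rp[i] = (limb_t) p;
      cy = p >> 32;
    }
  return (limb_t) cy;
}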
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C 
align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + st %g4,[%i0-4] + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000000..f946b900f0 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,69 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* These values are for UltraSPARC I, II, and IIi. It is bogus that + this file lives in v9, but that will do for now. */ + +/* Variations in addmul_1 speed make the multiply and square thresholds + doubtful. TOOM3_SQR_THRESHOLD had to be estimated here. */ + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 200 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 500 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 107 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 146 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 3 +#endif diff --git a/rts/gmp/mpn/sparc32/v9/mul_1.asm b/rts/gmp/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000000..f8f0fdd8c2 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,267 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
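mpn_mul_1 below uses the same 16-bit-halves floating-point scheme as the addmul_1 above; the difference in what it computes is only that the products are stored rather than accumulated: {res_ptr,size} = {s1_ptr,size} * s2_limb, returning the high limb. The plain-C statement of that (sketch only, 32-bit limbs, illustrative names):

/* Sketch of mpn_mul_1 semantics; illustrative only. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_mul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t p = (uint64_t) up[i] * v + cy;
      rp[i] = (limb_t) p;              /* low limb of the running product */
      cy = p >> 32;                    /* high limb carries into the next */
    }
  return (limb_t) cy;
}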
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 
(ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + st %g4,[%i0-4] + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/v9/submul_1.asm b/rts/gmp/mpn/sparc32/v9/submul_1.asm new file mode 100644 index 0000000000..6195ea88ea --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/submul_1.asm @@ -0,0 +1,291 @@ +dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 
C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + srlx %g4,32,%g3 + st %l2,[%i0-4] + + addx %g3,%g0,%g3 + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README new file mode 100644 index 0000000000..6923a133f3 --- /dev/null +++ b/rts/gmp/mpn/sparc64/README @@ -0,0 +1,48 @@ +This directory contains mpn functions for 64-bit V9 SPARC + +RELEVANT OPTIMIZATION ISSUES + +The Ultra I/II pipeline executes up to two simple integer arithmetic operations +per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to +35 cycles, depending on the position of the most significant bit of the 1st +source operand. It cannot overlap with other instructions. For our use of +mulx, it will take from 5 to 20 cycles. + +Integer conditional move instructions cannot dual-issue with other integer +instructions. No conditional move can issue 1-5 cycles after a load. (Or +something such bizzare.) + +Integer branches can issue with two integer arithmetic instructions. Likewise +for integer loads. Four instructions may issue (arith, arith, ld/st, branch) +but only if the branch is last. + +(The V9 architecture manual recommends that the 2nd operand of a multiply +instruction be the smaller one. For UltraSPARC, they got things backwards and +optimize for the wrong operand! Really helpful in the light of that multiply +is incredibly slow on these CPUs!) + +STATUS + +There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but +the pipelines are worked out. Here are the timings: + +* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb. + +* add_n, sub_n: add3.s currently runs at 6 cycles/limb. 
We use a bizarre + scheme of compares and branches (with some nops and fnops to align things) + and carefully stay away from the instructions intended for this application + (i.e., movcs and movcc). + + Using movcc/movcs, even with deep unrolling, seems to get down to 7 + cycles/limb. + + The most promising approach is to split operands in 32-bit pieces using + srlx, then use two addccc, and finally compile the results with sllx+or. + The result could run at 5 cycles/limb, I think. It might be possible to + do without unrolling, or with minimal unrolling. + +* addmul_1/submul_1: Should optimize for when scalar operand < 2^32. +* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II, + Karatsuba's method should save up to 16 cycles (i.e. > 20%). +* mul_1 (and possibly the other multiply functions): Handle carry in the + same tricky way as add_n,sub_n. diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm new file mode 100644 index 0000000000..72b3895a5b --- /dev/null +++ b/rts/gmp/mpn/sparc64/add_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +! sum in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_add_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + addccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + addccc %o0,%o4,%g0 +!- + addccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + addccc %o1,%o5,%g0 +!- + addccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! 
asymmetry + ldx [%i2+16],%l6 +!- + addccc %o2,%g5,%g0 +!- + addccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + addccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + addccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + addccc %o0,%o4,%g0 + addccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + addccc %o1,%o5,%g0 + addccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + addccc %o2,%g5,%g0 + addccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + addccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + addccc %l0,%l4,%g1 + addccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm new file mode 100644 index 0000000000..96cb5f7369 --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul1h.asm @@ -0,0 +1,203 @@ +dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
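The addmul1h helper whose body follows (and the matching mul_1h and submul1h helpers further down) sidesteps the slow mulx noted in the README above by doing the multiplication in the FPU: the 32-bit half of s2_limb is split into 16-bit pieces, both factors are converted to double with fxtod, the partial products are formed with fmuld and brought back with fdtox, and the "p16" product is shifted left 16 bits before being added to "p0".  A rough C sketch of that decomposition, with a hypothetical function name (not GMP code):

  #include <stdint.h>

  /* Sketch only: 32x32->64 multiply via exact double-precision products.
     Each partial product is at most 48 bits, so a double holds it exactly.  */
  static uint64_t
  mul_32x32_via_fpu (uint32_t u, uint32_t v)
  {
    double du   = (double) u;                /* fxtod of a source piece    */
    double dv0  = (double) (v & 0xffff);     /* low 16 bits of the scalar  */
    double dv16 = (double) (v >> 16);        /* high 16 bits of the scalar */
    uint64_t p0  = (uint64_t) (du * dv0);    /* fmuld + fdtox              */
    uint64_t p16 = (uint64_t) (du * dv16);   /* fmuld + fdtox              */
    return p0 + (p16 << 16);                 /* "align p16", then ADD1     */
  }

The real helpers pipeline several of these per iteration and fold the running carry and the destination limb in at the same time (the ADD2 step).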
+ +ifdef(`LOWPART', +`addmull:', +`addmulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +ifdef(`LOWPART', +` add 
%g5,%g1,%g1') C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` st %g4,[%i0-4+DHI] + srlx %g4,32,%g4') + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(addmull)', +`EPILOGUE(addmulu)') diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm new file mode 100644 index 0000000000..c3f04cea6a --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_addmul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/addmul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm new file mode 100644 index 0000000000..d9957e3c90 --- /dev/null +++ b/rts/gmp/mpn/sparc64/copyi.asm @@ -0,0 +1,79 @@ +! 
SPARC v9 __gmpn_copy -- Copy a limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! rptr %o0 +! sptr %o1 +! n %o2 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_copyi) + add %o2,-8,%o2 + brlz,pn %o2,L(skip) + nop + b,a L(loop1) + nop + + ALIGN(16) +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + add %o2,-8,%o2 + brgez,pt %o2,L(loop1) + add %o0,64,%o0 + +L(skip): + add %o2,8,%o2 + brz,pt %o2,L(end) + nop + +L(loop2): + ldx [%o1],%g1 + add %o1,8,%o1 + add %o2,-1,%o2 + stx %g1,[%o0] + add %o0,8,%o0 + brgz,pt %o2,L(loop2) + nop + +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h new file mode 100644 index 0000000000..74f61661c1 --- /dev/null +++ b/rts/gmp/mpn/sparc64/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Tell the toom3 multiply implementation to call low-level mpn + functions instead of open-coding operations in C. */ +#define USE_MORE_MPN 1 + + +/* Run on sun workshop cc. */ +/* Generated by tuneup.c, 2000-07-30. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 12 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 95 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 33 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 125 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 27 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 107 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 12 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 199 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 320 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 1664 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 320 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 1664 +#endif diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm new file mode 100644 index 0000000000..2d2edc50a7 --- /dev/null +++ b/rts/gmp/mpn/sparc64/lshift.asm @@ -0,0 +1,97 @@ +! SPARC v9 __gmpn_lshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_lshift) + sllx %o2,3,%g1 + add %o1,%g1,%o1 ! make %o1 point at end of src + ldx [%o1-8],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o0,%g1,%o0 ! make %o0 point at end of res + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + srlx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1-16],%g3 + add %o0,-8,%o0 + add %o1,-8,%o1 + add %g4,-1,%g4 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0+0] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1-16],%g3 + add %o0,-32,%o0 + add %o2,-4,%o2 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + + ldx [%o1-24],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+24] + srlx %g2,%o5,%g1 + + ldx [%o1-32],%g3 + sllx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0+16] + srlx %g3,%o5,%g1 + + ldx [%o1-40],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+8] + srlx %g2,%o5,%g1 + + add %o1,-32,%o1 + or %g4,%g1,%g4 + brnz,pt %o2,L(loop1) + stx %g4,[%o0+0] + +L(end): sllx %g2,%o3,%g2 + stx %g2,[%o0-8] + retl + mov %g5,%o0 +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm new file mode 100644 index 0000000000..f2f2821d51 --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1.asm @@ -0,0 +1,113 @@ +dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call mull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_mul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`E',`L($1)') +include_mpn(`sparc64/mul_1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm new file mode 100644 index 0000000000..5078c01c3f --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1h.asm @@ -0,0 +1,183 @@ +dnl SPARC 64-bit mull -- Helper for mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +mull: + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx 
[%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + st %g4,[%i0-4+DHI] + srlx %g4,32,%g4 + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +EPILOGUE(mull) diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm new file mode 100644 index 0000000000..baf7920efb --- /dev/null +++ b/rts/gmp/mpn/sparc64/rshift.asm @@ -0,0 +1,94 @@ +! SPARC v9 __gmpn_rshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_rshift) + ldx [%o1],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + sllx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1+8],%g3 + add %o0,8,%o0 + add %o1,8,%o1 + add %g4,-1,%g4 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0-8] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1+8],%g3 + add %o0,32,%o0 + add %o2,-4,%o2 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + + ldx [%o1+16],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-32] + sllx %g2,%o5,%g1 + + ldx [%o1+24],%g3 + srlx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0-24] + sllx %g3,%o5,%g1 + + ldx [%o1+32],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-16] + sllx %g2,%o5,%g1 + + add %o1,32,%o1 + or %g4,%g1,%g4 + brnz %o2,L(loop1) + stx %g4,[%o0-8] + +L(end): srlx %g2,%o3,%g2 + stx %g2,[%o0-0] + retl + mov %g5,%o0 +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm new file mode 100644 index 0000000000..61547138e0 --- /dev/null +++ b/rts/gmp/mpn/sparc64/sub_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_sub_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + subccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + subccc %o0,%o4,%g0 +!- + subccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + subccc %o1,%o5,%g0 +!- + subccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! 
asymmetry + ldx [%i2+16],%l6 +!- + subccc %o2,%g5,%g0 +!- + subccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + subccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + subccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + subccc %o0,%o4,%g0 + subccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + subccc %o1,%o5,%g0 + subccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + subccc %o2,%g5,%g0 + subccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + subccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + subccc %l0,%l4,%g1 + subccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm new file mode 100644 index 0000000000..7f51ba59c6 --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul1h.asm @@ -0,0 +1,204 @@ +dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
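mpn_add_n and mpn_sub_n above produce each 64-bit result limb with a single addccc/subccc, but the carry those instructions chain through comes from the 32-bit condition codes, so every iteration re-derives the 64-bit carry (or borrow) from the 32-bit halves; that is what the extra addccc/subccc whose result goes to %g0 are for.  One addition step of that scheme in rough C (hypothetical helper, not GMP code):

  #include <stdint.h>

  /* Sketch of one limb of the add_n loop above: the sum is formed directly,
     and the carry for the next limb is recovered from the 32-bit halves.  */
  static uint64_t
  add_step (uint64_t a, uint64_t b, unsigned *cy)
  {
    uint64_t sum = a + b + *cy;                          /* addccc -> %g1 */
    uint64_t lo  = (a & 0xffffffff) + (b & 0xffffffff) + *cy;
    uint64_t hi  = (a >> 32) + (b >> 32) + (lo >> 32);   /* addccc -> %g0 */
    *cy = (unsigned) (hi >> 32);                         /* 64-bit carry  */
    return sum;
  }

mpn_sub_n has the same structure with subccc, recovering the borrow instead of the carry.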
+ +ifdef(`LOWPART', +`submull:', +`submulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C 
p += cy +ifdef(`LOWPART', +` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` st %l2,[%i0-4+DHI] + srlx %g4,32,%g4') + + addx %g4,0,%g4 + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(submull)', +`EPILOGUE(submulu)') diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm new file mode 100644 index 0000000000..7c6af0a98b --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_submul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/submul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/submul1h.asm') diff --git a/rts/gmp/mpn/thumb/add_n.s b/rts/gmp/mpn/thumb/add_n.s new file mode 100644 index 0000000000..c1eeb6ca87 --- /dev/null +++ b/rts/gmp/mpn/thumb/add_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_add -- Add two limb vectors of the same length > 0 and store +@ sum in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_add_n +___gmpn_add_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + adc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/rts/gmp/mpn/thumb/sub_n.s b/rts/gmp/mpn/thumb/sub_n.s new file mode 100644 index 0000000000..53c292375f --- /dev/null +++ b/rts/gmp/mpn/thumb/sub_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +@ store difference in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_sub_n +___gmpn_sub_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + sbc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/rts/gmp/mpn/underscore.h b/rts/gmp/mpn/underscore.h new file mode 100644 index 0000000000..240dae0f63 --- /dev/null +++ b/rts/gmp/mpn/underscore.h @@ -0,0 +1,26 @@ +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#if __STDC__ +#define C_SYMBOL_NAME(name) _##name +#else +#define C_SYMBOL_NAME(name) _/**/name +#endif diff --git a/rts/gmp/mpn/vax/add_n.s b/rts/gmp/mpn/vax/add_n.s new file mode 100644 index 0000000000..cf4060f521 --- /dev/null +++ b/rts/gmp/mpn/vax/add_n.s @@ -0,0 +1,61 @@ +# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
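underscore.h above exists so that C-level code can refer to these assembler entry points on systems whose C compilers prefix external symbols with an underscore: the _##name form pastes the underscore with an ANSI preprocessor, and _/**/name achieves the same splice with a K&R preprocessor.  A hypothetical use (illustration only; the declaration below is not from the library):

  #include "underscore.h"

  /* Expands to:  extern int _mpn_example (void);  under either branch.  */
  extern int C_SYMBOL_NAME (mpn_example) (void);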
+ + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___gmpn_add_n +___gmpn_add_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + sobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/rts/gmp/mpn/vax/addmul_1.s b/rts/gmp/mpn/vax/addmul_1.s new file mode 100644 index 0000000000..379061dcb7 --- /dev/null +++ b/rts/gmp/mpn/vax/addmul_1.s @@ -0,0 +1,126 @@ +# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc r1,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/vax/lshift.s b/rts/gmp/mpn/vax/lshift.s new file mode 100644 index 0000000000..fd311a9782 --- /dev/null +++ b/rts/gmp/mpn/vax/lshift.s @@ -0,0 +1,58 @@ +# VAX __gmpn_lshift -- left shift. 
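The VAX mpn_add_n above (and mpn_sub_n further down) rounds its loop count up to a multiple of four and then uses the movaq/jmp pair to enter the four-way unrolled loop part-way through, so sizes that are not a multiple of four need no separate fix-up loop; the carry itself simply stays in the PSL C bit, relying on movl and sobgtr leaving it alone so that adwc chains it from limb to limb.  The same entry trick in rough C is the classic Duff's-device shape (hypothetical names; a wide accumulator stands in for adwc):

  #include <stdint.h>

  typedef uint32_t limb;                   /* VAX limbs are 32 bits wide */

  /* One limb of the addition; cy is the carry passed between limbs.  */
  #define STEP()  (acc = (uint64_t) *s1++ + *s2++ + cy, \
                   *rp++ = (limb) acc, cy = (unsigned) (acc >> 32))

  /* Sketch only; n > 0 is assumed, as the mpn conventions guarantee.  */
  static limb
  add_n_sketch (limb *rp, const limb *s1, const limb *s2, long n)
  {
    uint64_t acc;
    unsigned cy = 0;
    long rounds = (n + 3) / 4;             /* addl2 $3 ; ashl $-2        */
    switch (n & 3)                         /* jmp Loop(r5) entry offset  */
      {
      case 0: do {  STEP ();
      case 3:       STEP ();
      case 2:       STEP ();
      case 1:       STEP ();
              } while (--rounds > 0);
      }
    return cy;                             /* adwc r0,r0 returns the carry */
  }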
+ +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_lshift +___gmpn_lshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + moval (r6)[r1],r6 + moval (r7)[r1],r7 + clrl r3 + movl -(r6),r2 + ashq r8,r2,r4 + movl r5,r0 + movl r2,r3 + decl r1 + jeql Lend + +Loop: movl -(r6),r2 + ashq r8,r2,r4 + movl r5,-(r7) + movl r2,r3 + jsobgtr r1,Loop + +Lend: movl r4,-4(r7) + ret diff --git a/rts/gmp/mpn/vax/mul_1.s b/rts/gmp/mpn/vax/mul_1.s new file mode 100644 index 0000000000..708e8ca6ca --- /dev/null +++ b/rts/gmp/mpn/vax/mul_1.s @@ -0,0 +1,123 @@ +# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + +# One might want to combine the addl2 and the store below, but that +# is actually just slower according to my timing tests. 
(VAX 3600) + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + movl r2,(r9)+ +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + movl r10,(r9)+ + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + movl r2,(r9)+ +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + movl r10,(r9)+ + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r1,r3 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r1,r11 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/vax/rshift.s b/rts/gmp/mpn/vax/rshift.s new file mode 100644 index 0000000000..515813208d --- /dev/null +++ b/rts/gmp/mpn/vax/rshift.s @@ -0,0 +1,56 @@ +# VAX __gmpn_rshift -- right shift. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_rshift +___gmpn_rshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + movl (r6)+,r2 + subl3 r8,$32,r8 + ashl r8,r2,r0 + decl r1 + jeql Lend + +Loop: movl (r6)+,r3 + ashq r8,r2,r4 + movl r5,(r7)+ + movl r3,r2 + jsobgtr r1,Loop + +Lend: clrl r3 + ashq r8,r2,r4 + movl r5,(r7) + ret diff --git a/rts/gmp/mpn/vax/sub_n.s b/rts/gmp/mpn/vax/sub_n.s new file mode 100644 index 0000000000..eff4b1c044 --- /dev/null +++ b/rts/gmp/mpn/vax/sub_n.s @@ -0,0 +1,61 @@ +# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +# difference in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___gmpn_sub_n +___gmpn_sub_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + sobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/rts/gmp/mpn/vax/submul_1.s b/rts/gmp/mpn/vax/submul_1.s new file mode 100644 index 0000000000..be42286935 --- /dev/null +++ b/rts/gmp/mpn/vax/submul_1.s @@ -0,0 +1,126 @@ +# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
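The VAX mul_1 and addmul_1 above, like the submul_1 that follows, are built on emul, which is a signed 32x32->64 multiply; whenever an operand has its top bit set, the other operand therefore has to be added back into the high half of the product, which is what the extra adwc r6 / adwc r1 paths do.  A small C illustration of that correction (hypothetical function name):

  #include <stdint.h>

  /* Unsigned 32x32->64 product recovered from a signed multiply (emul).  */
  static uint64_t
  umul32 (uint32_t a, uint32_t b)
  {
    int64_t  sprod = (int64_t) (int32_t) a * (int32_t) b;   /* emul       */
    uint32_t lo = (uint32_t) sprod;
    uint32_t hi = (uint32_t) ((uint64_t) sprod >> 32);
    if ((int32_t) a < 0) hi += b;   /* signed view read a as a - 2^32     */
    if ((int32_t) b < 0) hi += a;   /* likewise for b                     */
    return ((uint64_t) hi << 32) | lo;
  }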
+ + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_submul_1 +___gmpn_submul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc r1,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README new file mode 100644 index 0000000000..3507548b8c --- /dev/null +++ b/rts/gmp/mpn/x86/README @@ -0,0 +1,40 @@ + + X86 MPN SUBROUTINES + + +This directory contains mpn functions for various 80x86 chips. + + +CODE ORGANIZATION + + x86 i386, i486, generic + x86/pentium Intel Pentium (P5, P54) + x86/pentium/mmx Intel Pentium with MMX (P55) + x86/p6 Intel Pentium Pro + x86/p6/mmx Intel Pentium II, III + x86/p6/p3mmx Intel Pentium III + x86/k6 AMD K6, K6-2, K6-3 + x86/k6/mmx + x86/k6/k62mmx AMD K6-2 + x86/k7 AMD Athlon + x86/k7/mmx + + +The x86 directory is also the main support for P6 at the moment, and +is something of a blended style, meant to be reasonable on all x86s. + + + +STATUS + +The code is well-optimized for AMD and Intel chips, but not so well +optimized for Cyrix chips. + + + +RELEVANT OPTIMIZATION ISSUES + +For implementations with slow double shift instructions (SHLD and +SHRD), it might be better to mimic their operation with SHL+SHR+OR. +(M2 is likely to benefit from that, but not Pentium due to its slow +plain SHL and SHR.) diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family new file mode 100644 index 0000000000..3bc73f58b0 --- /dev/null +++ b/rts/gmp/mpn/x86/README.family @@ -0,0 +1,333 @@ + + X86 CPU FAMILY MPN SUBROUTINES + + +This file has some notes on things common to all the x86 family code. + + + +ASM FILES + +The x86 .asm files are BSD style x86 assembler code, first put through m4 +for macro processing. The generic mpn/asm-defs.m4 is used, together with +mpn/x86/x86-defs.m4. Detailed notes are in those files. + +The code is meant for use with GNU "gas" or a system "as". There's no +support for assemblers that demand Intel style, and with gas freely +available and easy to use that shouldn't be a problem. + + + +STACK FRAME + +m4 macros are used to define the parameters passed on the stack, and these +act like comments on what the stack frame looks like too. For example, +mpn_mul_1() has the following. 
+ + defframe(PARAM_MULTIPLIER, 16) + defframe(PARAM_SIZE, 12) + defframe(PARAM_SRC, 8) + defframe(PARAM_DST, 4) + +Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others +similarly. The return address is at offset 0, but there's not normally any +need to access that. + +FRAME is redefined as necessary through the code so it's the number of bytes +pushed on the stack, and hence the offsets in the parameter macros stay +correct. At the start of a routine FRAME should be zero. + + deflit(`FRAME',0) + ... + deflit(`FRAME',4) + ... + deflit(`FRAME',8) + ... + +Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and +FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions, +and can be used instead of explicit definitions if preferred. +defframe_pushl() is a combination FRAME_pushl() and defframe(). + +There's generally some slackness in redefining FRAME. If new values aren't +going to get used, then the redefinitions are omitted to keep from +cluttering up the code. This happens for instance at the end of a routine, +where there might be just four register pops and then a ret, so FRAME isn't +getting used. + +Local variables and saved registers can be similarly defined, with negative +offsets representing stack space below the initial stack pointer. For +example, + + defframe(SAVE_ESI, -4) + defframe(SAVE_EDI, -8) + defframe(VAR_COUNTER,-12) + + deflit(STACK_SPACE, 12) + +Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the +space, and that instruction must be followed by a redefinition of FRAME +(setting it equal to STACK_SPACE) to reflect the change in %esp. + +Definitions for pushed registers are only put in when they're going to be +used. If registers are just saved and restored with pushes and pops then +definitions aren't made. + + + +ASSEMBLER EXPRESSIONS + +Only addition and subtraction seem to be universally available, certainly +that's all the Solaris 8 "as" seems to accept. If expressions are wanted +then m4 eval() should be used. + +In particular note that a "/" anywhere in a line starts a comment in Solaris +"as", and in some configurations of gas too. + + addl $32/2, %eax <-- wrong + + addl $eval(32/2), %eax <-- right + +Binutils gas/config/tc-i386.c has a choice between "/" being a comment +anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select +the latter, and as of 2.9.5 it's the default for GNU/Linux too. + + + +ASSEMBLER COMMENTS + +Solaris "as" doesn't support "#" commenting, using /* */ instead, +unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and +the intermediate ".s" files have no comments. + + + +ZERO DISPLACEMENTS + +In a couple of places addressing modes like 0(%ebx) with a byte-sized zero +displacement are wanted, rather than (%ebx) with no displacement. These are +either for computed jumps or to get desirable code alignment. Explicit +.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into +(%ebx). The Zdisp() macro in x86-defs.m4 is used for this. + +Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas +1.92.3 changes it. In general changing would be the sort of "optimization" +an assembler might perform, hence explicit ".byte"s are used where +necessary. + + + +SHLD/SHRD INSTRUCTIONS + +The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx" +must be written "shldl %eax,%ebx" for some assemblers. 
gas takes either, +Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is +gas), and omits %cl elsewhere. + +For GMP an autoconf test is used to determine whether %cl should be used and +the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass +through or omit %cl as necessary. See comments with those macros for usage. + + + +DIRECTION FLAG + +The x86 calling conventions say that the direction flag should be clear at +function entry and exit. (See iBCS2 and SVR4 ABI books, references below.) + +Although this has been so since the year dot, it's not absolutely clear +whether it's universally respected. Since it's better to be safe than +sorry, gmp follows glibc and does a "cld" if it depends on the direction +flag being clear. This happens only in a few places. + + + +POSITION INDEPENDENT CODE + +Defining the symbol PIC in m4 processing selects position independent code. +This mainly affects computed jumps, and these are implemented in a +self-contained fashion (without using the global offset table). The few +calls from assembly code to global functions use the normal procedure +linkage table. + +PIC is necessary for ELF shared libraries because they can be mapped into +different processes at different virtual addresses. Text relocations in +shared libraries are allowed, but that presumably means a page with such a +relocation isn't shared. The use of the PLT for PIC adds a fixed cost to +every function call, which is small but might be noticeable when working with +small operands. + +Calls from one library function to another don't need to go through the PLT, +since of course the call instruction uses a displacement, not an absolute +address, and the relative locations of object files are known when libgmp.so +is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls +this way, so that there's no jump through the PLT, but of course leaving +setups of the GOT address in %ebx that may be unnecessary. + +The %ebx setup could be avoided in assembly if a separate option controlled +PIC for calls as opposed to computed jumps etc. But there's only ever +likely to be a handful of calls out of assembler, and getting the same +optimization for C intra-library calls would be more important. There seems +no easy way to tell gcc that certain functions can be called non-PIC, and +unfortunately many gmp functions use the global memory allocation variables, +so they need the GOT anyway. Object files with no global data references +and only intra-library calls could go into the library as non-PIC under +-Bsymbolic. Integrating this into libtool and automake is left as an +exercise for the reader. + + + +SIMPLE LOOPS + +The overheads in setting up for an unrolled loop can mean that at small +sizes a simple loop is faster. Making small sizes go fast is important, +even if it adds a cycle or two to bigger sizes. To this end various +routines choose between a simple loop and an unrolled loop according to +operand size. The path to the simple loop, or to special case code for +small sizes, is always as fast as possible. + +Adding a simple loop requires a conditional jump to choose between the +simple and unrolled code. The size of a branch misprediction penalty +affects whether a simple loop is worthwhile. + +The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover +point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >= +UNROLL_THRESHOLD using the unrolled loop. 
If position independent code adds +a couple of cycles to an unrolled loop setup, the threshold will vary with +PIC or non-PIC. Something like the following is typical. + + ifdef(`PIC',` + deflit(UNROLL_THRESHOLD, 10) + ',` + deflit(UNROLL_THRESHOLD, 8) + ') + +There's no automated way to determine the threshold. Setting it to a small +value and then to a big value makes it possible to measure the simple and +unrolled loops each over a range of sizes, from which the crossover point +can be determined. Alternately, just adjust the threshold up or down until +there's no more speedups. + + + +UNROLLED LOOP CODING + +The x86 addressing modes allow a byte displacement of -128 to +127, making +it possible to access 256 bytes, which is 64 limbs, without adjusting +pointer registers within the loop. Dword sized displacements can be used +too, but they increase code size, and unrolling to 64 ought to be enough. + +When unrolling to the full 64 limbs/loop, the limb at the top of the loop +will have a displacement of -128, so pointers have to have a corresponding ++128 added before entering the loop. When unrolling to 32 limbs/loop +displacements 0 to 127 can be used with 0 at the top of the loop and no +adjustment needed to the pointers. + +Where 64 limbs/loop is supported, the +128 adjustment is done only when 64 +limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or +16 is small, so support for 64 limbs/loop is generally only for comparison. + + + +COMPUTED JUMPS + +When working from least significant limb to most significant limb (most +routines) the computed jump and pointer calculations in preparation for an +unrolled loop are as follows. + + S = operand size in limbs + N = number of limbs per loop (UNROLL_COUNT) + L = log2 of unrolling (UNROLL_LOG2) + M = mask for unrolling (UNROLL_MASK) + C = code bytes per limb in the loop + B = bytes per limb (4 for x86) + + computed jump (-S & M) * C + entrypoint + subtract from pointers (-S & M) * B + initial loop counter (S-1) >> L + displacements 0 to B*(N-1) + +The loop counter is decremented at the end of each loop, and the looping +stops when the decrement takes the counter to -1. The displacements are for +the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax". + +Usually the multiply by "C" can be handled without an imul, using instead an +leal, or a shift and subtract. + +When working from most significant to least significant limb (eg. mpn_lshift +and mpn_copyd), the calculations change as follows. + + add to pointers (-S & M) * B + displacements 0 to -B*(N-1) + + + +OLD GAS 1.92.3 + +This version comes with FreeBSD 2.2.8 and has a couple of gremlins that +affect gmp code. + +Firstly, an expression involving two forward references to labels comes out +as zero. For example, + + addl $bar-foo, %eax + foo: + nop + bar: + +This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax". +When only one forward reference is involved, it works correctly, as for +example, + + foo: + addl $bar-foo, %eax + nop + bar: + +Secondly, an expression involving two labels can't be used as the +displacement for an leal. For example, + + foo: + nop + bar: + leal bar-foo(%eax,%ebx,8), %ecx + +A slightly cryptic error is given, "Unimplemented segment type 0 in +parse_operand". When only one label is used it's ok, and the label can be a +forward reference too, as for example, + + leal foo(%eax,%ebx,8), %ecx + nop + foo: + +These problems only affect PIC computed jump calculations. 
The workarounds +are just to do an leal without a displacement and then an addl, and to make +sure the code is placed so that there's at most one forward reference in the +addl. + + + +REFERENCES + +"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999, +order numbers 243190, 243191 and 243192. Available on-line, + + ftp://download.intel.com/design/PentiumII/manuals/243190.htm + ftp://download.intel.com/design/PentiumII/manuals/243191.htm + ftp://download.intel.com/design/PentiumII/manuals/243192.htm + +"Intel386 Family Binary Compatibility Specification 2", Intel Corporation, +published by McGraw-Hill, 1991, ISBN 0-07-031219-2. + +"System V Application Binary Interface", Unix System Laboratories Inc, 1992, +published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor +Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF +shared library PIC coding.) + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S new file mode 100644 index 0000000000..fe6f648f53 --- /dev/null +++ b/rts/gmp/mpn/x86/addsub_n.S @@ -0,0 +1,174 @@ +/* Currently not working and not used. */ + +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r +#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r + + .globl mpn_addsub_n_0 + .globl mpn_addsub_n_1 + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1. + We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. 
*/ +// r1 = r2 + r1 edi = esi + edi +// r2 = r2 - r1 esi = esi - edi +// s1 s2 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_0: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop0: // L=load E=execute S=store + movl (%esi),%ebx // sub 0 L + movl 4(%esi),%ecx // sub 1 L + sbbl (%edi),%ebx // sub 0 LE + sbbl 4(%edi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl %eax,8(%esi) // sub 2 LES + movl 12(%edi),%eax // sub 3 L + sbbl %eax,12(%esi) // sub 3 LES + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop0 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2. + We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. */ +// r1 = r1 + r2 edi = edi + esi +// r2 = r1 - r2 esi = edi - esi +// s2 s1 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_1: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop1: // L=load E=execute S=store + movl (%edi),%ebx // sub 0 L + sbbl (%esi),%ebx // sub 0 LE + movl 4(%edi),%ecx // sub 1 L + sbbl 4(%esi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl 8(%esi),%eax // sub 2 LES + movl %eax,8(%esi) // sub 2 S + movl 12(%edi),%eax // sub 3 L + sbbl 12(%esi),%eax // sub 3 LE + movl %eax,12(%esi) // sub 3 S + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop1 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + .globl mpn_copy +mpn_copy: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 28(%esp),%ebp /* size */ + + shrl $2,%ebp + .align 4 +Loop2: + movl (%esi),%eax + movl 4(%esi),%ebx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl %eax,8(%edi) + movl %ebx,12(%edi) + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop2 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm new file mode 100644 index 0000000000..18ef816b4d --- /dev/null +++ b/rts/gmp/mpn/x86/aors_n.asm @@ -0,0 +1,187 @@ +dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_nc) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz LF(M4_function_n,oopgo) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal + call L(0a) +L(0a): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $LF(M4_function_n,oop)-L(0a)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax +') + + C These lines initialize carry from the 5th parameter. Should be + C possible to simplify. + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + jmp *%eax C jump into loop + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz L(oop) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. 
Due to limitations in + C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal + call L(0b) +L(0b): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $L(oop)-L(0b)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal L(oop)-3(%eax,%eax,8),%eax +') + jmp *%eax C jump into loop + +L(oopgo): + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + ALIGN(8) +L(oop): movl (%esi),%eax + M4_inst (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + M4_inst 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + M4_inst 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + M4_inst 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + M4_inst 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + M4_inst 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + M4_inst 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + M4_inst 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm new file mode 100644 index 0000000000..f32ad83989 --- /dev/null +++ b/rts/gmp/mpn/x86/aorsmul_1.asm @@ -0,0 +1,134 @@ +dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a +dnl limb and add the result to a second limb vector. + + +dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +define(PARAM_MULTIPLIER, `FRAME+16(%esp)') +define(PARAM_SIZE, `FRAME+12(%esp)') +define(PARAM_SRC, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') + + TEXT + ALIGN(8) + +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + M4_inst %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + M4_inst %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm new file mode 100644 index 0000000000..439640e836 --- /dev/null +++ b/rts/gmp/mpn/x86/copyd.asm @@ -0,0 +1,80 @@ +dnl x86 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl Future: On P6 an MMX loop should be able to go faster than this code. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from high to low addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 2.4 cycles/limb, approx 40 cycles startup. 
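+C
+C For reference, the operation is just the following loop (a sketch added
+C for illustration, not the library's generic code; the name "copyd_ref"
+C is an assumption of the sketch).  The decrementing direction matches the
+C description above.
+C
+C	void
+C	copyd_ref (mp_limb_t *dst, const mp_limb_t *src, long size)
+C	{
+C	  long i;
+C	  for (i = size - 1; i >= 0; i--)
+C	    dst[i] = src[i];
+C	}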
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + + std + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm new file mode 100644 index 0000000000..5bc4e36689 --- /dev/null +++ b/rts/gmp/mpn/x86/copyi.asm @@ -0,0 +1,79 @@ +dnl x86 mpn_copyi -- copy limb vector, incrementing. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from low to high addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be +C slower than a rep movs in all cases. The fastest MMX found was 0.8 +C cycles/limb (when fully aligned). A rep movs seems to have a startup +C time of about 15 cycles, but doing something special for small sizes +C could lead to a branch misprediction that would destroy any saving. +C For now a plain rep movs seems ok for P6. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + +PROLOGUE(mpn_copyi) + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README.family + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm new file mode 100644 index 0000000000..df879da9e1 --- /dev/null +++ b/rts/gmp/mpn/x86/diveby3.asm @@ -0,0 +1,115 @@ +dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl The following all have their own optimized versions of this routine, +dnl but for reference the code here runs as follows. +dnl +dnl cycles/limb +dnl P54 18.0 +dnl P55 17.0 +dnl P6 14.5 +dnl K6 14.0 +dnl K7 10.0 + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and ceil(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebp + pushl %edi FRAME_pushl() + + movl PARAM_DST, %edi + pushl %esi FRAME_pushl() + + movl $INVERSE_3, %esi + pushl %ebx FRAME_pushl() + + leal (%ecx,%ebp,4), %ecx + movl PARAM_CARRY, %ebx + + leal (%edi,%ebp,4), %edi + negl %ebp + + + ALIGN(8) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx &src[size] + C edx scratch, high product + C esi multiplier + C edi &dst[size] + C ebp counter, limbs, negative + + movl (%ecx,%ebp,4), %eax + + subl %ebx, %eax + + setc %bl + + imull %esi + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi,%ebp,4) + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %eax + + sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3) + incl %ebp + + jnz L(top) + + + movl %ebx, %eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm new file mode 100644 index 0000000000..12f14676d6 --- /dev/null +++ b/rts/gmp/mpn/x86/divrem_1.asm @@ -0,0 +1,232 @@ +dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 43 maybe +dnl +dnl +dnl The following have their own optimized divrem_1 implementations, but +dnl for reference the code here runs as follows. 
+dnl +dnl cycles/limb +dnl P6MMX 39 +dnl K7 42 + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C +C Divide src,size by divisor and store the quotient in dst+xsize,size. +C Extend the division to fractional quotient limbs in dst,xsize. Return the +C remainder. Either or both xsize and size can be 0. +C +C mpn_divrem_1c takes a carry parameter which is an initial high limb, +C effectively one extra limb at the top of src,size. Must have +C carry<divisor. +C +C +C Essentially the code is the same as the division based part of +C mpn/generic/divrem_1.c, but has the following advantages. +C +C - If gcc isn't being used then divrem_1.c will get the generic C +C udiv_qrnnd() and be rather slow. +C +C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't +C generate that instruction (as of gcc 2.95.2 at least). +C +C A test is done to see if the high limb is less the the divisor, and if so +C one less div is done. A div is between 20 and 40 cycles on the various +C x86s, so assuming high<divisor about half the time, then this test saves +C half that amount. The branch misprediction penalty on each chip is less +C than half a div. +C +C +C K6: Back-to-back div instructions run at 20 cycles, the same as the loop +C here, so it seems there's nothing to gain by rearranging the loop. +C Pairing the mov and loop instructions was found to gain nothing. (The +C same is true of the mpn/x86/mod_1.asm loop.) +C +C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles. +C The loop_or_decljnz macro is an easy way to get a 10% speedup. +C +C The fast K6 multiply might be thought to suit a multiply-by-inverse, +C but that algorithm has been found to suffer from the releatively poor +C carry handling on K6 and too many auxiliary instructions. The +C fractional part however could be done at about 13 c/l. +C +C P5: Moving the load down to pair with the store might save 1 cycle, but +C that doesn't seem worth bothering with, since it'd be only a 2.2% +C saving. 
+C +C Again here the auxiliary instructions hinder a multiply-by-inverse, +C though there might be a 10-15% speedup available + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + orl %ecx, %ecx + + movl PARAM_CARRY, %edx + jz LF(mpn_divrem_1,fraction) + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jmp LF(mpn_divrem_1,integer_top) + +EPILOGUE() + + +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx,%ecx + + jz L(size_zero) + pushl %ebx FRAME_pushl() + + movl -4(%edi,%ecx,4), %eax C src high limb + xorl %edx, %edx + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + cmpl %esi, %eax + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jae L(integer_entry) + + + C high<divisor, so high of dst is zero, and avoid one div + + movl %edx, (%ebx,%ecx,4) + decl %ecx + + movl %eax, %edx + jz L(fraction) + + +L(integer_top): + C eax scratch (quotient) + C ebx dst+4*xsize-4 + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi src + C ebp xsize + + movl -4(%edi,%ecx,4), %eax +L(integer_entry): + + divl %esi + + movl %eax, (%ebx,%ecx,4) + loop_or_decljnz L(integer_top) + + +L(fraction): + orl %ebp, %ecx + jz L(done) + + movl PARAM_DST, %ebx + + +L(fraction_top): + C eax scratch (quotient) + C ebx dst + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi + C ebp + + xorl %eax, %eax + + divl %esi + + movl %eax, -4(%ebx,%ecx,4) + loop_or_decljnz L(fraction_top) + + +L(done): + popl %ebp + movl %edx, %eax + popl %ebx + popl %esi + popl %edi + ret + + +L(size_zero): +deflit(`FRAME',8) + movl PARAM_XSIZE, %ecx + xorl %eax, %eax + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README.family + + rep + stosl + + popl %esi + popl %edi + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README new file mode 100644 index 0000000000..3ad96c8b89 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/README @@ -0,0 +1,237 @@ + + AMD K6 MPN SUBROUTINES + + + +This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and +K6-3. + +The mmx and k62mmx subdirectories have routines using MMX instructions. All +K6s have MMX, the separate directories are just so that ./configure can omit +them if the assembler doesn't support MMX. + + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. 
+ + cycles/limb + + mpn_add_n/sub_n 3.25 normal, 2.75 in-place + + mpn_mul_1 6.25 + mpn_add/submul_1 7.65-8.4 (varying with data values) + + mpn_mul_basecase 9.25 cycles/crossproduct (approx) + mpn_sqr_basecase 4.7 cycles/crossproduct (approx) + or 9.2 cycles/triangleproduct (approx) + + mpn_divrem_1 20.0 + mpn_mod_1 20.0 + mpn_divexact_by3 11.0 + + mpn_l/rshift 3.0 + + mpn_copyi/copyd 1.0 + + mpn_com_n 1.5-1.85 \ + mpn_and/andn/ior/xor_n 1.5-1.75 | varying with + mpn_iorn/xnor_n 2.0-2.25 | data alignment + mpn_nand/nior_n 2.0-2.25 / + + mpn_popcount 12.5 + mpn_hamdist 13.0 + + +K6-2 and K6-3 have dual-issue MMX and get the following improvements. + + mpn_l/rshift 1.75 + + mpn_copyi/copyd 0.56 or 1.0 \ + | + mpn_com_n 1.0-1.2 | varying with + mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment + mpn_iorn/xnor_n 1.5-2.0 | + mpn_nand/nior_n 1.75-2.0 / + + mpn_popcount 9.0 + mpn_hamdist 11.5 + + +Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch" +instruction, code seems to run slower, and with just "mov" loads it doesn't +seem faster. Results so far are inconsistent. The K6 does a hardware +prefetch of the second cache line in a sector, so the penalty for not +prefetching in software is reduced. + + + + +NOTES + +All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow. + +Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can +execute them in both X and Y (and together). + +Branch misprediction penalty is 1 to 4 cycles (Optimization Manual +chapter 6 table 12). + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. +Store queue is 7 entries of 64 bits each. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines, up to 64 for some. + +Sometimes computed jumps into the unrolling are used to handle sizes not a +multiple of the unrolling. An attractive feature of this is that times +smoothly increase with operand size, but an indirect jump is about 6 cycles +and the setups about another 6, so it depends on how much the unrolled code +is faster than a simple loop as to whether a computed jump ought to be used. + +Position independent code is implemented using a call to get eip for +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. Such a call however still costs 4 to 7 +cycles. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +MMX + +Putting emms or femms as late as possible in a routine seems to be fastest. +Perhaps an emms or femms stalls until all outstanding MMX instructions have +completed, so putting it later gives them a chance to complete on their own, +in parallel with other operations (like register popping). + +The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3 +at the start of a routine, in case it's been preceded by x87 floating point +operations. This isn't done because in gmp programs it's expected that x87 +floating point won't be much used and that chances are an mpn routine won't +have been preceded by any x87 code. 
+ + + +CODING + +Instructions in general code are shown paired if they can decode and execute +together, meaning two short decode instructions with the second not +depending on the first, only the first using the shifter, no more than one +load, and no more than one store. + +K6 does some out of order execution so the pairings aren't essential, they +just show what slots might be available. When decoding is the limiting +factor things can be scheduled that might not execute until later. + + + +NOTES + +Code alignment + +- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary, + short decode is inhibited. The cross.pl script detects this. + +- loops and branch targets should be aligned to 16 bytes, or ensure at least + 2 instructions before a 32 byte boundary. This makes use of the 16 byte + cache in the BTB. + +Addressing modes + +- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this + problem, and can be used as an equivalent, or easier is just to use a + different register, like %ebx. + +- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3 + have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F). + + If more than 3 bytes are needed to determine instruction length then + decoding degrades from direct to long, or from long to vector. This + happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since + with mod=00 the sib determines whether there's a displacement. + + This affects all MMX and 3DNow instructions, and others with an 0F prefix + like movzbl. The modes affected are anything with an index and no + displacement, or an index but no base, and this includes (%esp) which is + really (,%esp,1). + + The cross.pl script detects problem cases. The workaround is to always + use a displacement, and to do this with Zdisp if it's zero so the + assembler doesn't discard it. + + See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages + 13-14 and 36-37. + +Calls + +- indirect jumps and calls are not branch predicted, they measure about 6 + cycles. + +Various + +- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe +- bsf 12-27 cycles +- emms 5 cycles +- femms 3 cycles +- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken) +- divl 20 cycles back-to-back +- imull 2 decode, 2 execute +- mull 2 decode, 3 execute (optimization manual decoding sample) +- prefetch 2 cycles +- rcll/rcrl implicit by one bit: 2 cycles + immediate or %cl count: 11 + 2 per bit for dword + 13 + 4 per bit for byte +- setCC 2 cycles +- xchgl %eax,reg 1.5 cycles, back-to-back (strange) + reg,reg 2 cycles, back-to-back + + + + +REFERENCES + +"AMD-K6 Processor Code Optimization Application Note", AMD publication +number 21924, revision D amendment 0, January 2000. This describes K6-2 and +K6-3. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21924.pdf + +"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD +publication number 21828, revision A amendment 0, August 1997. This is an +older edition of the above document, describing plain K6. Available +on-line, + + http://www.amd.com/K6/k6docs/pdf/21828.pdf + +"3DNow Technology Manual", AMD publication number 21928F/0-August 1999. +This describes the femms and prefetch instructions, but nothing else from +3DNow has been used. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21928.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. 
This has some notes on general K6 optimizations as well as +3DNow. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm new file mode 100644 index 0000000000..31b05ada51 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/aors_n.asm @@ -0,0 +1,329 @@ +dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction. +dnl +dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result +C (1 or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and +C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of +C loop control, which with 4 limbs/loop means an extra 0.25 c/l. 
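+C
+C As a plain C sketch (for illustration only; "add_nc_ref" and the 32-bit
+C uint32_t limb type are assumptions of the sketch), the _nc form of the
+C addition variant computes the following, with the subtraction variant
+C identical except it subtracts and returns the borrow:
+C
+C	uint32_t
+C	add_nc_ref (uint32_t *dst, const uint32_t *src1, const uint32_t *src2,
+C	            long size, uint32_t carry)	/* carry is 0 or 1 */
+C	{
+C	  long i;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      uint32_t a = src1[i];
+C	      uint32_t sum = a + src2[i] + carry;
+C	      carry = carry ? (sum <= a) : (sum < a);
+C	      dst[i] = sum;
+C	    }
+C	  return carry;
+C	}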
+ +define(PARAM_CARRY, `FRAME+20(%esp)') +define(PARAM_SIZE, `FRAME+16(%esp)') +define(PARAM_SRC2, `FRAME+12(%esp)') +define(PARAM_SRC1, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') +deflit(`FRAME',0) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp LF(M4_function_n,start) +EPILOGUE() + + +PROLOGUE(M4_function_n) + xorl %eax, %eax +L(start): + movl PARAM_SIZE, %ecx + pushl %ebx +FRAME_pushl() + + movl PARAM_SRC1, %ebx + pushl %edi +FRAME_pushl() + + movl PARAM_SRC2, %edx + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + jae L(unroll) + + + shrl %eax C initial carry flag + + C offset 0x21 here, close enough to aligned +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + C + C The store to (%edi) could be done with a stosl; it'd be smaller + C code, but there's no speed gain and a cld would have to be added + C (per mpn/x86/README.family). + + movl (%ebx), %eax + leal 4(%ebx), %ebx + + M4_inst (%edx), %eax + + movl %eax, (%edi) + leal 4(%edi), %edi + + leal 4(%edx), %edx + loop L(simple) + + + movl $0, %eax + popl %edi + + setc %al + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- +L(unroll): + C eax carry + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + cmpl %edi, %ebx + pushl %esi + + je L(inplace) + +ifdef(`OPERATION_add_n',` + cmpl %edi, %edx + + je L(inplace_reverse) +') + + movl %ecx, %esi + + andl $-4, %ecx + andl $3, %esi + + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + leal (%edi,%ecx,4), %edi + + negl %ecx + shrl %eax + + ALIGN(32) +L(normal_top): + C eax counter, qwords, negative + C ebx src1 + C ecx scratch + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + leal 5(%ecx), %ecx + M4_inst -20(%edx,%ecx,4), %eax + movl %eax, -20(%edi,%ecx,4) + + movl 4-20(%ebx,%ecx,4), %eax + M4_inst 4-20(%edx,%ecx,4), %eax + movl %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%ebx,%ecx,4), %eax + M4_inst 8-20(%edx,%ecx,4), %eax + movl %eax, 8-20(%edi,%ecx,4) + + movl 12-20(%ebx,%ecx,4), %eax + M4_inst 12-20(%edx,%ecx,4), %eax + movl %eax, 12-20(%edi,%ecx,4) + + loop L(normal_top) + + + decl %esi + jz L(normal_finish_one) + js L(normal_done) + + C two or three more limbs + + movl (%ebx), %eax + M4_inst (%edx), %eax + movl %eax, (%edi) + + movl 4(%ebx), %eax + M4_inst 4(%edx), %eax + decl %esi + movl %eax, 4(%edi) + + jz L(normal_done) + movl $2, %ecx + +L(normal_finish_one): + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + +L(normal_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + + +C ----------------------------------------------------------------------------- + +ifdef(`OPERATION_add_n',` +L(inplace_reverse): + C dst==src2 + + movl %ebx, %edx +') + +L(inplace): + C eax initial carry + C ebx + C ecx size + C edx src + C esi + C edi dst + C ebp + + leal -1(%ecx), %esi + decl %ecx + + andl $-4, %ecx + andl $3, %esi + + movl (%edx), %ebx C src low limb + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + + shrl %eax + + + ALIGN(32) +L(inplace_top): + C eax + C ebx next src limb + C ecx size + C edx src + C esi + C edi dst + C ebp + + M4_inst %ebx, (%edi,%ecx,4) + + movl 4(%edx,%ecx,4), %eax + leal 5(%ecx), %ecx + + M4_inst %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%edx,%ecx,4), %eax + movl 12-20(%edx,%ecx,4), %ebx + + M4_inst %eax, 
8-20(%edi,%ecx,4) + M4_inst %ebx, 12-20(%edi,%ecx,4) + + movl 16-20(%edx,%ecx,4), %ebx + loop L(inplace_top) + + + C now %esi is 0 to 3 representing respectively 1 to 4 limbs more + + M4_inst %ebx, (%edi) + + decl %esi + jz L(inplace_finish_one) + js L(inplace_done) + + C two or three more limbs + + movl 4(%edx), %eax + movl 8(%edx), %ebx + M4_inst %eax, 4(%edi) + M4_inst %ebx, 8(%edi) + + decl %esi + movl $2, %ecx + + jz L(normal_done) + +L(inplace_finish_one): + movl 4(%edx,%ecx,4), %eax + M4_inst %eax, 4(%edi,%ecx,4) + +L(inplace_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm new file mode 100644 index 0000000000..da4120fe2f --- /dev/null +++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm @@ -0,0 +1,372 @@ +dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data), +dnl PIC adds about 6 cycles at the start. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: large multpliers small multpliers +dnl UNROLL_COUNT cycles/limb cycles/limb +dnl 4 9.5 7.78 +dnl 8 9.0 7.78 +dnl 16 8.4 7.65 +dnl 32 8.4 8.2 +dnl +dnl Maximum possible unrolling with the current code is 32. +dnl +dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 +dnl byte block, which might explain the good speed at that unrolling. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C The jadcl0()s in the unrolled loop makes the speed data dependent. Small +C multipliers (most significant few bits clear) result in few carry bits and +C speeds up to 7.65 cycles/limb are attained. 
Large multipliers (most +C significant few bits set) make the carry bits 50/50 and lead to something +C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.) +C +C It's important that the gains for jadcl0 on small multipliers don't come +C at the cost of slowing down other data. Tests on uniformly distributed +C random data, designed to confound branch prediction, show about a 7% +C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all +C overheads included). +C +C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus +C 11.0 cycles/limb), and hence isn't used. +C +C In the simple loop, note that running ecx from negative to zero and using +C it as an index in the two movs wouldn't help. It would save one +C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired +C that would be collapsed by this. +C +C +C jadcl0 +C ------ +C +C jadcl0() being faster than adcl $0 seems to be an artifact of two things, +C firstly the instruction decoding and secondly the fact that there's a +C carry bit for the jadcl0 only on average about 1/4 of the time. +C +C The code in the unrolled loop decodes something like the following. +C +C decode cycles +C mull %ebp 2 +C M4_inst %esi, disp(%edi) 1 +C adcl %eax, %ecx 2 +C movl %edx, %esi \ 1 +C jnc 1f / +C incl %esi \ 1 +C 1: movl disp(%ebx), %eax / +C --- +C 7 +C +C In a back-to-back style test this measures 7 with the jnc not taken, or 8 +C with it taken (both when correctly predicted). This is opposite to the +C measurements showing small multipliers running faster than large ones. +C Watch this space for more info ... +C +C It's not clear how much branch misprediction might be costing. The K6 +C doco says it will be 1 to 4 cycles, but presumably it's near the low end +C of that range to get the measured results. +C +C +C In the code the two carries are more or less the preceding mul product and +C the calculation is roughly +C +C x*y + u*b+v +C +C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and +C v are the two limbs it's added to (being the low of the next mul, and a +C limb from the destination). +C +C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and +C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of +C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 +C and b-1, then the total probability can be summed over x and y, +C +C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) +C --- * sum sum --- = --- * ------- * ------- = 1/4 +C b^2 x=0 y=1 b^2 b^4 2 2 +C +C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, +C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. 
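+C
+C The 1/4 figure is easy to check numerically.  The following throwaway C
+C program (an illustration only, not part of the library; the tiny limb
+C size b=64 is just to keep the exact summation cheap) evaluates the
+C corresponding double sum and prints ((b-1)/(2b))^2, a little under 0.25:
+C
+C	#include <stdio.h>
+C	int
+C	main (void)
+C	{
+C	  double b = 64.0, sum = 0.0, x, y;
+C	  for (x = 0; x < b; x++)
+C	    for (y = 0; y < b; y++)
+C	      sum += x * y / (b * b);	/* P(carry) for this x,y pair */
+C	  printf ("%f\n", sum / (b * b));	/* prints 0.242249 for b=64 */
+C	  return 0;
+C	}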
+ + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %ebx +deflit(`FRAME',8) + + movl PARAM_SRC, %ebx + pushl %edi +deflit(`FRAME',12) + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_DST, %edi + + pushl %ebp +deflit(`FRAME',16) + jae L(unroll) + + + C simple loop + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl $4, %edi + addl %esi, %eax + + adcl $0, %edx + + M4_inst %eax, -4(%edi) + + adcl $0, %edx + + movl %edx, %esi + loop L(simple) + + + popl %ebp + popl %edi + + popl %ebx + movl %esi, %eax + + popl %esi + ret + + + +C ----------------------------------------------------------------------------- +C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop +C the carries are ecx=lo, esi=hi, then they swap for each limb processed. +C For the computed jump an odd size means they start one way around, an even +C size the other. +C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. 
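+C
+C In rough C terms the computed-jump setup below comes out as (a sketch
+C only; UNROLL_LOG2 and UNROLL_MASK are the usual log2/mask companions of
+C UNROLL_COUNT, presumably 4 and 15 for the 16 limb unroll):
+C
+C      skip        = (-(size-1)) & UNROLL_MASK;  /* limbs skipped in first pass */
+C      VAR_COUNTER = (size-2) >> UNROLL_LOG2;    /* further full passes */
+C      VAR_JUMP    = L(entry) + 15*skip;         /* 15 code bytes per limb */
+C
+C One limb is multiplied before entering the loop, the first pass covers
+C UNROLL_COUNT-skip limbs, and each further pass covers UNROLL_COUNT.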
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl new file mode 100644 index 0000000000..21734f3e52 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/cross.pl @@ -0,0 +1,141 @@ +#! /usr/bin/perl + +# Copyright (C) 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# Usage: cross.pl [filename.o]... +# +# Produce an annotated disassembly of the given object files, indicating +# certain code alignment and addressing mode problems afflicting K6 chips. +# "ZZ" is used on all annotations, so this can be searched for. +# +# With no arguments, all .o files corresponding to .asm files are processed. +# This is good in the mpn object directory of a k6*-*-* build. 
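+#
+# A typical run (hypothetical invocation) might be
+#
+#	perl cross.pl aorsmul_1.o mul_1.o | grep ZZ
+#
+# to see just the flagged lines for a couple of objects.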
+# +# As far as fixing problems goes, any cache line crossing problems in loops +# get attention, but as a rule it's too tedious to rearrange code or slip in +# nops to fix every problem in setup or finishup code. +# +# Bugs: +# +# Instructions without mod/rm bytes or which are already vector decoded are +# unaffected by cache line boundary crossing, but not all of these have yet +# been put in as exceptions. All that occur in practice in GMP are present +# though. +# +# There's no messages for using the vector decoded addressing mode (%esi), +# but that mode is easy to avoid when coding. + +use strict; + +sub disassemble { + my ($file) = @_; + my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm); + + open (IN, "objdump -Srfh $file |") + || die "Cannot open pipe from objdump\n"; + while (<IN>) { + print; + + if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) { + if ($1 < 5) { + print "ZZ need at least 2**5 for predictable cache line crossing\n"; + } + } + + if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,''); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,'',''); + + } else { + next; + } + + if ($b1 =~ /0f/) { + $prefix = $b1; + $opcode = $b2; + $modrm = $b3; + } else { + $prefix = ''; + $opcode = $b1; + $modrm = $b2; + } + + # modrm of the form 00-xxx-100 with an 0F prefix is the problem case + # for K6 and pre-CXT K6-2 + if ($prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm =~ /^[0-3][4c]/) { + print "ZZ ($file) >3 bytes to determine instruction length\n"; + } + + # with just an opcode, starting 1f mod 20h + if ($addr =~ /[13579bdf]f$/ + && $prefix !~ /0f/ + && $opcode !~ /1[012345]/ # adc + && $opcode !~ /1[89abcd]/ # sbb + && $opcode !~ /68/ # push $imm32 + && $opcode !~ /^7/ # jcond disp8 + && $opcode !~ /a[89]/ # test+imm + && $opcode !~ /a[a-f]/ # stos/lods/scas + && $opcode !~ /b8/ # movl $imm32,%eax + && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz + && $opcode !~ /e[b9]/ # jmp disp8/disp32 + && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std + && !($opcode =~ /f[67]/ # grp 1 + && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv + && $modrm !~ /^$/) { + print "ZZ ($file) opcode/modrm cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything starting at 1f mod 20h + if ($addr =~ /[13579bdf][f]$/ + && $prefix =~ /0f/) { + print "ZZ ($file) prefix/opcode cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything with mod/rm starting at 1e mod 20h + if ($addr =~ /[13579bdf][e]$/ + && $prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm !~ /^$/) { + print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n"; + } + } + close IN || die "Error from objdump (or objdump not available)\n"; +} + + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + @files = glob "*.asm"; + map {s/.asm/.o/} @files; +} + +foreach (@files) { + disassemble($_); +} diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm new file mode 100644 index 0000000000..ffb97bc380 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/diveby3.asm @@ -0,0 +1,110 @@ +dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K6: 11.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); +C +C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't +C lead to vector decoding, unlike plain (%esi) does. + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_SRC, %esi + pushl %edi defframe_pushl(SAVE_EDI) + + movl PARAM_DST, %edi + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + leal (%esi,%ecx,4), %esi + + pushl $3 defframe_pushl(VAR_THREE) + leal (%edi,%ecx,4), %edi + + negl %ecx + + + C Need 32 alignment for claimed speed, to avoid the movl store + C opcode/modrm crossing a cache line boundary + + ALIGN(32) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx counter, limbs, negative + C edx scratch, high product + C esi &src[size] + C edi &dst[size] + C ebp + C + C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax" + C doesn't cross a 32 byte boundary, saving a couple of cycles + C (that's a fixed couple, not per loop). + +Zdisp( movl, 0,(%esi,%ecx,4), %eax) + subl %ebx, %eax + + setc %bl + + imull $INVERSE_3, %eax + + movl %eax, (%edi,%ecx,4) + addl $2, %ecx + + mull VAR_THREE + + addl %edx, %ebx + loop L(top) + + + movl SAVE_ESI, %esi + movl %ebx, %eax + + movl SAVE_EBX, %ebx + + movl SAVE_EDI, %edi + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h new file mode 100644 index 0000000000..77f3948d77 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h @@ -0,0 +1,97 @@ +/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 3 /* cycles */ +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME 20 /* cycles */ +#endif + +/* bsfl takes 12-27 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-04. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 130 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 34 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 68 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 67 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 472 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 4352 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 544 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm new file mode 100644 index 0000000000..20a33e6ccf --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm @@ -0,0 +1,179 @@ +dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, processing limbs from high to low addresses. +C +C The comments in copyi.asm apply here too. 
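+C
+C The operation itself is just a decrementing limb copy, in illustrative C
+C (not code used here):
+C
+C      void
+C      ref_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C      {
+C        mp_size_t  i;
+C        for (i = size; i != 0; i--)
+C          dst[i-1] = src[i-1];     /* highest limb first */
+C      }
+C
+C Going from the top limb down is what lets dst overlap src from above.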
+ + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + std + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + ja L(unroll) + +L(simple): + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jnz L(already_aligned) + + movsl + + decl %ecx +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + C offset 0x3D here, but gets full speed without further alignment +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src, incrementing + C edi dst, incrementing + C ebp + C + C `disp' is never 0, so don't need to force 0(%esi). + +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + leal -UNROLL_BYTES(%esi), %esi + subl $UNROLL_COUNT, %ecx + + leal -UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + subl $eval(UNROLL_BYTES/2), %esi + subl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm new file mode 100644 index 0000000000..215d805f2e --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm @@ -0,0 +1,196 @@ +dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The MMX loop is faster than a rep movs when src and dst are both 0mod8. +C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is +C used instead. +C +C mod8 +C src dst +C 0 0 both aligned, use mmx +C 0 4 unaligned, use rep movs +C 4 0 unaligned, use rep movs +C 4 4 do one movs, then both aligned, use mmx +C +C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2 +C cycles/loop, which is 0.0625 c/l at 32 limbs/loop. +C +C A pattern of two movq loads and two movq stores (or four and four) was +C tried, but found to be the same speed as just one of each. +C +C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx +C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep +C movs. +C +C Enhancement: +C +C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd +C make it possible to avoid incrementing %esi and %edi in the loop and hence +C get loop overhead down to 1 cycle. Care would be needed to avoid bad +C cache line crossings since the "movq"s would then be 5 code bytes rather +C than 4. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyi) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + cld + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + ja L(unroll) + +L(simple): + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jz L(already_aligned) + + decl %ecx + + movsl +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + C this is offset 0x34, no alignment needed +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src, incrementing + C edi dst, incrementing + C ebp + C + C Zdisp gets 0(%esi) left that way to avoid vector decode, and with + C 0(%edi) keeps code aligned to 16 byte boundaries. 
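+C
+C Each iteration of the forloop() below expands to one aligned 8-byte copy;
+C for example the i=0 case becomes (with the zero displacements written out
+C by Zdisp, as noted above)
+C
+C      movq    0(%esi), %mm0
+C      movq    %mm0, 0(%edi)
+C
+C so one pass moves UNROLL_COUNT limbs in UNROLL_COUNT/2 such pairs.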
+ +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) +Zdisp( movq, disp,(%esi), %mm0) +Zdisp( movq, %mm0, disp,(%edi)) +') + + addl $UNROLL_BYTES, %esi + subl $UNROLL_COUNT, %ecx + + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + addl $eval(UNROLL_BYTES/2), %esi + addl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm new file mode 100644 index 0000000000..f6d54f97a8 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm @@ -0,0 +1,286 @@ +dnl AMD K6-2 mpn_lshift -- mpn left shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl used after src has been fetched +define(VAR_RETVAL,`PARAM_SRC') + +dnl minimum 9, because unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + movl %edx, VAR_RETVAL + jae L(unroll) + + + movd %ecx, %mm7 + movl %eax, %ecx + + movl PARAM_DST, %eax + +L(simple): + C eax dst + C ebx src + C ecx counter, size-1 to 1 + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%ecx,4), %mm0 + + psrlq %mm7, %mm0 + +Zdisp( movd, %mm0, 0,(%eax,%ecx,4)) + loop L(simple) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + + movd %mm0, (%eax) + movl %edx, %eax + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval (but instead VAR_RETVAL is used) + C + C mm6 shift + + addl $32, %ecx + movl PARAM_DST, %edx + + movd %ecx, %mm7 + subl $7, %eax C size-8 + + leal (%edx,%eax,4), %ecx C alignment of dst + + movq 32-8(%ebx,%eax,4), %mm2 C src high qword + testb $4, %cl + + jz L(dst_aligned) + psllq %mm6, %mm2 + + psrlq $32, %mm2 + decl %eax + + movd %mm2, 32(%edx,%eax,4) C dst high limb + movq 32-8(%ebx,%eax,4), %mm2 C new src high qword +L(dst_aligned): + + movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of size-8 lets the loop stop when %eax goes negative and + C leaves -4 to -1 which can be tested with test $1 and $2. + +L(top): + C eax counter, size-8 step by -4 until <0 + C ebx src + C ecx + C edx dst + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psllq %mm6, %mm2 + subl $4, %eax + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + movq 24(%ebx,%eax,4), %mm0 + + psllq %mm6, %mm1 + movq %mm2, 40(%edx,%eax,4) + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm0, %mm1 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm1, 32(%edx,%eax,4) + jnc L(top) + + + C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4. + C + C 8(%ebx) is the next source, and 24(%edx) is the next destination. + C %eax is between -4 and -1, representing respectively 0 to 3 extra + C limbs that must be read. + + + testl $2, %eax C testl to avoid bad cache line crossing + jz L(finish_nottwo) + + C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes + C new mm2 and a new mm0 is loaded. 
+ + psllq %mm6, %mm2 + movq %mm0, %mm1 + + psrlq %mm7, %mm0 + subl $2, %eax + + por %mm0, %mm2 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm2, 32(%edx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0 + + testb $1, %al + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + psllq %mm6, %mm1 + + movq %mm2, 24(%edx,%eax,4) + jz L(finish_even) + + + C Size is odd, so mm1 and one extra limb to process. + + movd (%ebx), %mm0 C src[0] + popl %ebx +deflit(`FRAME',0) + + movq %mm0, %mm2 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + psllq %mm6, %mm2 + por %mm0, %mm1 + + movq %mm1, 4(%edx) C dst[1,2] + movd %mm2, (%edx) C dst[0] + + movl VAR_RETVAL, %eax + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): +deflit(`FRAME',4) + C Size is even, so only mm1 left to process. + + movq %mm1, (%edx) C dst[0,1] + movl VAR_RETVAL, %eax + + popl %ebx + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm new file mode 100644 index 0000000000..8a8c144241 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm @@ -0,0 +1,285 @@ +dnl AMD K6-2 mpn_rshift -- mpn right shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 9, because the unrolled loop can't handle less. +dnl +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + + + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + movl PARAM_DST, %ecx + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + C This loop runs at about 3 cycles/limb, which is the amount of + C decoding, and this is despite every second access being unaligned. + +L(simple): + C eax counter, -(size-1) to -1 + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + addl $32, %ecx + subl $7, %eax C size-8 + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + + movq (%ebx), %mm2 C src low qword + leal (%ebx,%eax,4), %ebx C src end - 32 + + testb $4, %cl + leal (%ecx,%eax,4), %ecx C dst end - 32 + + notl %eax C -(size-7) + jz L(dst_aligned) + + psrlq %mm6, %mm2 + incl %eax + +Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb + movq 4(%ebx,%eax,4), %mm2 C new src low qword +L(dst_aligned): + + movq 12(%ebx,%eax,4), %mm0 C src second lowest qword + nop C avoid bad cache line crossing + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and + C and leaves 0 to 3 which can be tested with test $1 and $2. + +L(top): + C eax counter, -(size-7) step by +4 until >=0 + C ebx src end - 32 + C ecx dst end - 32 + C edx retval + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psrlq %mm6, %mm2 + addl $4, %eax + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + movq 4(%ebx,%eax,4), %mm0 + + psrlq %mm6, %mm1 + movq %mm2, -12(%ecx,%eax,4) + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm1, -4(%ecx,%eax,4) + ja L(top) C jump if no carry and not zero + + + + C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0 + C to 3 representing respectively 3 to 0 further limbs. + + testl $2, %eax C testl to avoid bad cache line crossings + jnz L(finish_nottwo) + + C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0 + C becomes new mm2 and a new mm0 is loaded. 
+ + psrlq %mm6, %mm2 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + addl $2, %eax + + por %mm0, %mm2 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm2, -4(%ecx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + testb $1, %al + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + psrlq %mm6, %mm1 + + movq %mm2, 4(%ecx,%eax,4) + jnz L(finish_even) + + + C one further extra limb to process + + movd 32-4(%ebx), %mm0 C src[size-1], most significant limb + popl %ebx + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + psrlq %mm6, %mm2 + + movq %mm1, 32-12(%ecx) C dst[size-3,size-2] + movd %mm2, 32-4(%ecx) C dst[size-1] + + movl %edx, %eax C retval + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): + C no further extra limbs + + movq %mm1, 32-8(%ecx) C dst[size-2,size-1] + movl %edx, %eax C retval + + popl %ebx + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm new file mode 100644 index 0000000000..8915080f0f --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm @@ -0,0 +1,91 @@ +dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb +dnl K6 1.5 1.85 1.75 1.85 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_com_n) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx + jnz L(two_or_more) + + movl (%eax), %eax + notl %eax + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx +FRAME_pushl() + movl %ecx, %ebx + + pcmpeqd %mm7, %mm7 C all ones + + + ALIGN(16) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C esi + C edi + C ebp + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) + + + jnc L(no_extra) + movl (%eax,%ebx,8), %eax + notl %eax + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000000..46cb3b7ea5 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,212 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. 
+dnl +dnl alignment dst/src1/src2, A=0mod8, N=4mod8 +dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +dnl +dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +dnl +dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). + +M4_choose_op( and_n, pand,0,0, andl,0,0) +M4_choose_op( andn_n, pandn,0,0, andl,0,1) +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. +C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. 
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. + +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx + FRAME_pushl() + movl PARAM_SRC1, %eax + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + movl PARAM_DST, %edx + ja L(two_or_more) + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + pushl %esi + FRAME_pushl() + testl $4, %eax + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl %esi') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl %esi') + movl %esi, (%edx) + addl $4, %edx + decl %ecx + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl %ebx') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl %ebx') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + + popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000000..f1dc83db46 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. 
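+C
+C In illustrative C (not code used here) the operation is, for 1<=shift<=31,
+C
+C      mp_limb_t
+C      ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
+C      {
+C        mp_limb_t  ret = src[size-1] >> (32-shift);   /* bits shifted out */
+C        mp_size_t  i;
+C        for (i = size-1; i > 0; i--)
+C          dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
+C        dst[0] = src[0] << shift;
+C        return ret;
+C      }
+C
+C working from the top limb downwards, the same direction as the loop below.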
+ + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) + + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000000..2c619252bb --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,238 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. +dnl +dnl popcount hamdist +dnl K6-2: 9.0 11.5 cycles/limb +dnl K6: 12.5 13.0 + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. 
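+C
+C The loop below follows the usual bit-slicing scheme, shown here for one
+C 32-bit limb in illustrative C (the same masks are held in mm7..mm4):
+C
+C      x = x - ((x & 0xAAAAAAAA) >> 1);                  /* 16 two-bit counts */
+C      x = (x & 0x33333333) + ((x >> 2) & 0x33333333);   /* 8 nibble counts  */
+C      x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);   /* 4 byte counts    */
+C      x += x >> 8;
+C      x += x >> 16;
+C      count = x & 0xFF;                                  /* 0 to 32 */
+C
+C mpn_hamdist does the same with src[i]^src2[i] in place of x, and the
+C per-limb counts are summed over the whole operand.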
+ + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + DATA + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +LS(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + .text + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq LS(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +L(zero): + movl $0, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000000..cc5948f26c --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. 
+dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm new file mode 100644 index 0000000000..c2220fe4ca --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mul_1.asm @@ -0,0 +1,272 @@ +dnl AMD K6 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K6: 6.25 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the result. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) + +PROLOGUE(mpn_mul_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(mpn_mul_1,start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + mov PARAM_SIZE, %ecx + push %ebx +FRAME_pushl() + + movl PARAM_SRC, %ebx + push %edi +FRAME_pushl() + + movl PARAM_DST, %edi + pushl %ebp +FRAME_pushl() + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_MULTIPLIER, %ebp + + jae L(unroll) + + + C code offset 0x22 here, close enough to aligned +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + C + C this loop 8 cycles/limb + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(simple) + + + popl %ebp + + popl %edi + popl %ebx + + movl %esi, %eax + popl %esi + + ret + + +C ----------------------------------------------------------------------------- +C The code for each limb is 6 cycles, with instruction decoding being the +C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25 +C cycles/limb in total. +C +C The secret ingredient to get 6.25 is to start the loop with the mul and +C have the load/store pair at the end. Rotating the load/store to the top +C is an 0.5 c/l slowdown. (Some address generation effect probably.) +C +C The whole unrolled loop fits nicely in exactly 80 bytes. 
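+C
+C In other words each limb in the body costs 6 cycles and the loop control
+C adds 1 cycle per pass of 4 limbs: (4*6 + 1)/4 = 6.25 cycles/limb.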
+ + + ALIGN(16) C already aligned to 16 here actually +L(unroll): + movl (%ebx), %eax + leal -16(%ebx,%ecx,4), %ebx + + leal -16(%edi,%ecx,4), %edi + subl $4, %ecx + + negl %ecx + + + ALIGN(16) C one byte nop for this alignment +L(top): + C eax scratch + C ebx &src[size-4] + C ecx counter + C edx scratch + C esi carry + C edi &dst[size-4] + C ebp multiplier + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi,%ecx,4) + movl 12(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 12(%edi,%ecx,4) + movl 16(%ebx,%ecx,4), %eax + + + addl $4, %ecx + js L(top) + + + + C eax next src limb + C ebx &src[size-4] + C ecx 0 to 3 representing respectively 4 to 1 further limbs + C edx + C esi carry + C edi &dst[size-4] + + testb $2, %cl + jnz L(finish_not_two) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + addl $2, %ecx +L(finish_not_two): + + + testb $1, %cl + jnz L(finish_not_one) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi) + movl 12(%ebx), %eax +L(finish_not_one): + + + mull %ebp + + addl %esi, %eax + popl %ebp + + adcl $0, %edx + + movl %eax, 12(%edi) + popl %edi + + popl %ebx + movl %edx, %eax + + popl %esi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm new file mode 100644 index 0000000000..1f5a3a4b4b --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm @@ -0,0 +1,600 @@ +dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop +dnl unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: UNROLL_COUNT cycles/product (approx) +dnl 8 9.75 +dnl 16 9.3 +dnl 32 9.3 +dnl Maximum possible with the current code is 32. +dnl +dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which +dnl might explain it's good performance. 
+ +deflit(UNROLL_COUNT, 16) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() entry code only +C once. The saving is about 10-20% on typical sizes coming from the +C Karatsuba multiply code. +C +C Future: +C +C The unrolled loop could be shared by mpn_addmul_1, with some extra stack +C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be +C 256 bytes. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two_limbs) + je L(two_by_something) + + + C one limb by one limb + + movl (%edx), %edx C xp low limb + movl PARAM_WP, %ecx + + mull %edx + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): + decl PARAM_YSIZE + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_WP, %ebx + pushl %esi +deflit(`FRAME',8) + + movl %eax, %ecx C yp low limb + movl (%edx), %eax C xp low limb + + movl %edx, %esi C xp + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + movl %esi, 4(%ebx) + + adcl $0, %edx + + movl %edx, 8(%ebx) + popl %esi + + popl %ebx + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp +deflit(`FRAME',8) + + mull %ecx C xp[0] * yp[0] + + push %edi +deflit(`FRAME',12) + movl %eax, (%ebx) + + movl 4(%esi), %eax + movl %edx, %edi C carry, for wp[1] + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + + movl %edi, 4(%ebx) + movl 4(%ecx), %ecx C yp[1] + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + + movl (%esi), %eax C xp[0] + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + adcl $0, %esi + + movl %edi, 8(%ebx) + popl %edi + + movl %esi, 12(%ebx) + popl %esi + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two_limbs): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 10-20 +C limb operations the Karatsuba code calls here with. 
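+C
+C In plain C the overall structure is as follows (a sketch only, written
+C with the real mpn entry points for clarity and a made-up ref_ name; the
+C code here inlines both the mul_1 and the addmul_1 loops):
+C
+C	void
+C	ref_mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
+C	                  mp_srcptr yp, mp_size_t ysize)
+C	{
+C	  mp_size_t  i;
+C	  /* first row: wp[0..xsize] = xp * yp[0] */
+C	  wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C
+C	  /* remaining rows: add xp * yp[i], shifted up i limbs */
+C	  for (i = 1; i < ysize; i++)
+C	    wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);
+C	}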
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp +deflit(`FRAME',0) + + pushl %edi defframe_pushl(SAVE_EDI) + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_WP, %edi + pushl %esi defframe_pushl(SAVE_ESI) + + movl %eax, %ebp + pushl %ebx defframe_pushl(SAVE_EBX) + + leal (%edx,%ecx,4), %ebx C xp end + xorl %esi, %esi + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of mul1 + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + incl %ecx + + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) C final carry + + movl PARAM_XSIZE, %ecx + decl %edx + + jnz L(ysize_more_than_one_limb) + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +L(ysize_more_than_one_limb): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- +C Simple addmul loop. +C +C Using ebx and edi pointing at the ends of their respective locations saves +C a couple of instructions in the outer loop. The inner loop is still 11 +C cycles, the same as the simple loop in aorsmul_1.asm. + + C eax yp + C ebx xp end + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 + C ebp + + movl 4(%eax), %ebp C multiplier + negl %ecx + + movl %ecx, PARAM_XSIZE C -xsize + xorl %esi, %esi C initial carry + + leal 4(%eax,%edx,4), %eax C yp end + negl %edx + + movl %eax, PARAM_YP + movl %edx, PARAM_YSIZE + + jmp L(simple_outer_entry) + + + C aligning here saves a couple of cycles + ALIGN(16) +L(simple_outer_top): + C edx ysize counter, negative + + movl PARAM_YP, %eax C yp end + xorl %esi, %esi C carry + + movl PARAM_XSIZE, %ecx C -xsize + movl %edx, PARAM_YSIZE + + movl (%eax,%edx,4), %ebp C yp limb multiplier +L(simple_outer_entry): + addl $4, %edi + + +L(simple_inner): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of this addmul + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl $0, %edx + addl %eax, (%edi,%ecx,4) + adcl %edx, %esi + + incl %ecx + jnz L(simple_inner) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) + + incl %edx + jnz L(simple_outer_top) + + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +C ----------------------------------------------------------------------------- +C Unrolled loop. +C +C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for +C some comments. +C +C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to +C 0, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop +C is entered. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. This can't just be fetched through the xp +C pointer because of the offset applied to it. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. +C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. 
+C +C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added +C to give the starting point in the destination for each unrolled loop (this +C point is one limb upwards for each limb of yp processed). +C +C Having PARAM_YSIZE count negative to zero means it's not necessary to +C store new values of PARAM_YP and PARAM_WP on each loop. Those values on +C the stack remain constant and on each loop an leal adjusts them with the +C PARAM_YSIZE counter value. + + +defframe(VAR_COUNTER, -20) +defframe(VAR_COUNTER_INIT, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_STACK_SPACE, 16) + +dnl For some strange reason using (%esp) instead of 0(%esp) is a touch +dnl slower in this code, hence the defframe empty-if-zero feature is +dnl disabled. +dnl +dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the +dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects +dnl anything isn't clear. +dnl +define(`defframe_empty_if_zero_disabled',1) + +L(unroll): + C eax yp (not used) + C ebx xp end (not used) + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 (not used) + C ebp +deflit(`FRAME', 16) + + leal -2(%ecx), %ebp C one limb processed at start, + decl %ecx C and ebp is one less + + shrl $UNROLL_LOG2, %ebp + negl %ecx + + subl $VAR_STACK_SPACE, %esp +deflit(`FRAME', 16+VAR_STACK_SPACE) + andl $UNROLL_MASK, %ecx + + movl %ecx, %esi + shll $4, %ecx + + movl %ebp, VAR_COUNTER_INIT + negl %esi + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%esi,1), %ecx +') + + movl PARAM_XP, %ebx + movl %ebp, VAR_COUNTER + + movl PARAM_WP, %edi + movl %ecx, VAR_JMP + + movl (%ebx), %eax + leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1 + + leal (%ebx,%esi,4), %ebx C xp adjust for unrolling + + movl %eax, VAR_XP_LOW + + movl %ebx, PARAM_XP + movl PARAM_YP, %ebx + + leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing + movl 4(%ebx), %ebp C multiplier (yp second limb) + + leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing + + movl %ecx, PARAM_WP + + leal 1(%esi), %ecx C adjust parity for decl %ecx above + + movl %ebx, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%esi,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C ----------------------------------------------------------------------------- + C Aligning here saves a couple of cycles per loop. Using 32 doesn't + C cost any extra space, since the inner unrolled loop below is + C aligned to 32. 
+ ALIGN(32) +L(unroll_outer_top): + C edx ysize + + movl PARAM_YP, %eax + movl %edx, PARAM_YSIZE C incremented ysize counter + + movl PARAM_WP, %edi + + movl VAR_COUNTER_INIT, %ebx + movl (%eax,%edx,4), %ebp C next multiplier + + movl PARAM_XSIZE, %ecx + leal (%edi,%edx,4), %edi C adjust wp for where we are in yp + + movl VAR_XP_LOW, %eax + movl %ebx, VAR_COUNTER + +L(unroll_outer_entry): + mull %ebp + + C using testb is a tiny bit faster than testl + testb $1, %cl + + movl %eax, %ecx C low carry + movl VAR_JMP, %eax + + movl %edx, %esi C high carry + movl PARAM_XP, %ebx + + jnz L(unroll_noswap) + movl %ecx, %esi C high,low carry other way around + + movl %edx, %ecx +L(unroll_noswap): + + jmp *%eax + + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax scratch + C ebx xp + C ecx carry low + C edx scratch + C esi carry high + C edi wp + C ebp multiplier + C VAR_COUNTER loop counter + C + C 15 code bytes each limb + + leal UNROLL_BYTES(%edi), %edi + +L(unroll_entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4)) + deflit(`disp1', eval(disp0 + 4)) + deflit(`disp2', eval(disp1 + 4)) + + movl disp1(%ebx), %eax + mull %ebp +Zdisp( addl, %ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp2(%ebx), %eax + mull %ebp + addl %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(unroll_top) + + + movl PARAM_YSIZE, %edx + addl %ecx, UNROLL_BYTES(%edi) + + adcl $0, %esi + + incl %edx + movl %esi, UNROLL_BYTES+4(%edi) + + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + addl $FRAME, %esp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm new file mode 100644 index 0000000000..70d49b3e57 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm @@ -0,0 +1,672 @@ +dnl AMD K6 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular +dnl product (measured on the speed difference between 17 and 33 limbs, +dnl which is roughly the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this +dnl code supports. This value is used only by the tune program to know +dnl what it can go up to. 
(An attempt to compile with a bigger value will +dnl trigger some m4_assert()s in the code, making the build fail.) +dnl +dnl The value is determined by requiring the displacements in the unrolled +dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of +dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + + +dnl Allow a value from the tune program to override config.m4. + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + + +dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The +dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since +dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD. +dnl +dnl The first addmul is the biggest, and this takes the second least +dnl significant limb and multiplies it by the third least significant and +dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1 +dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3. + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed +C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 35x35 that do need all of it will +C at least be getting value for money, because 35x35 spends something like +C 5780 cycles here. +C +C Different values of UNROLL_COUNT give slightly different speeds, between +C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs. +C This isn't a big difference, but it's presumably some alignment effect +C which if understood could give a simple speedup. 
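+C
+C The overall scheme can be shown as a plain C sketch (a made-up ref_
+C routine using the real mpn entry points; the code below inlines and
+C unrolls all of this and handles the small sizes specially):
+C
+C	void
+C	ref_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t  i;
+C	  mp_limb_t  cy = 0;
+C
+C	  /* off-diagonal products src[i]*src[j] for i<j, as a triangle */
+C	  dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	  for (i = 1; i < size-1; i++)
+C	    dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
+C
+C	  /* double the triangle, the bit shifted out becomes the top limb */
+C	  dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C
+C	  /* add the squares src[i]^2 along the diagonal; only here does
+C	     dst[0] get set (it just receives the low limb of src[0]^2) */
+C	  dst[0] = 0;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      unsigned long long  p = (unsigned long long) src[i] * src[i];
+C	      unsigned long long  s =
+C	        (unsigned long long) dst[2*i] + (mp_limb_t) p + cy;
+C	      dst[2*i] = (mp_limb_t) s;
+C	      s = (unsigned long long) dst[2*i+1] + (mp_limb_t) (p>>32) + (s>>32);
+C	      dst[2*i+1] = (mp_limb_t) s;
+C	      cy = (mp_limb_t) (s >> 32);
+C	    }
+C	}
+C
+C (The sketch assumes size >= 2 and 32-bit limbs.)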
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + + cmpl $2, %ecx + je L(two_limbs) + + movl PARAM_DST, %edx + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src +deflit(`FRAME',4) + + movl (%ebx), %eax + movl PARAM_DST, %ecx + + mull %eax C src[0]^2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) + movl (%ebx), %eax + + movl %edx, 12(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0]*src[1] + + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + popl %ebx + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): +deflit(`FRAME',0) + cmpl $4, %ecx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src + + movl (%ebx), %eax + movl %edx, %ecx C dst + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + pushl %esi + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + + movl %edx, 12(%ecx) + pushl %edi + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + + movl %edx, 20(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + + movl %edx, %edi + movl 8(%ebx), %edx + + pushl %ebp + xorl %ebp, %ebp + + mull %edx C src[0] * src[2] + + addl %eax, %edi + movl 4(%ebx), %eax + + adcl %edx, %ebp + + movl 8(%ebx), %edx + + mull %edx C src[1] * src[2] + + addl %eax, %ebp + + adcl $0, %edx + + + C eax will be dst[5] + C ebx + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + xorl %eax, %eax + addl %esi, %esi + adcl %edi, %edi + adcl %ebp, %ebp + adcl %edx, %edx + adcl $0, %eax + + addl %esi, 4(%ecx) + adcl %edi, 8(%ecx) + adcl %ebp, 12(%ecx) + + popl %ebp + popl %edi + + adcl %edx, 16(%ecx) + + popl %esi + popl %ebx + + adcl %eax, 20(%ecx) + ASSERT(nc) + + ret + + +C ----------------------------------------------------------------------------- + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(STACK_SPACE, 24) + + ALIGN(16) +L(four_or_more): + + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C +C A test was done calling mpn_mul_1 here to get the benefit of its unrolled +C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off +C a 5780 cycle operation, which is not surprising since the loop here is 8 +C c/l and mpn_mul_1 is 6.25 c/l. 
+ + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + leal 4(%edx), %edi + + movl %ebx, SAVE_EBX + leal 4(%eax), %ebx + + movl %esi, SAVE_ESI + xorl %esi, %esi + + movl %ebp, SAVE_EBP + + C eax + C ebx src+4 + C ecx size + C edx + C esi + C edi dst+4 + C ebp + + movl (%eax), %ebp C multiplier + leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary + + + ALIGN(16) +L(mul_1): + C eax scratch + C ebx src ptr + C ecx counter + C edx scratch + C esi carry + C edi dst ptr + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(mul_1) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K6 doesn't do any branch prediction on indirect jumps, which is good +C actually because it's a different target each time. The unrolled addmul +C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of +C the indirect jump is quickly recovered. + + +dnl This value is also implicitly encoded in a shift and add. +dnl +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31. Above that an offset must be added to them. +dnl +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + + C eax + C ebx &src[size] + C ecx + C edx + C esi carry + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, (%edi) + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx +ifelse(OFFSET,0,, +` subl $OFFSET, %ebx') + + shll $4, %ecx +ifelse(OFFSET,0,, +` subl $OFFSET, %edi') + + negl %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + + + C The calculated jump mustn't be before the start of the available + C code. This is the limitation UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + C + ASSERT(ae,` + movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx + ') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx &src[size], constant + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi high limb to store + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand + + mull %ebp + + testb $1, %cl + + movl %edx, %esi C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + movl %edx, VAR_JMP + leal 4(%edi), %edi + + C A branch-free version of this using some xors was found to be a + C touch slower than just a conditional jump, despite the jump + C switching between taken and not taken on every loop. 
+ +ifelse(eval(UNROLL_COUNT%2),0, + jz,jnz) L(unroll_noswap) + movl %esi, %eax C high,low carry other way around + + movl %ecx, %esi + movl %eax, %ecx +L(unroll_noswap): + + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/esi should start. + C + C An attempt was made at padding here to get the end of the unrolled + C code to come out on a good alignment, to save padding before + C L(corner). This worked, but turned out to run slower than just an + C ALIGN(2). The reason for this is not clear, it might be related + C to the different speeds on different UNROLL_COUNTs noted above. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx src + C ecx carry low + C edx scratch + C esi carry high + C edi dst + C ebp multiplier + C + C 15 code bytes each limb + C ecx/esi swapped on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %esi, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) +') +') +L(unroll_inner_end): + + addl %esi, -4+OFFSET(%edi) + + movl VAR_COUNTER, %edx + jadcl0( %ecx) + + movl %ecx, m4_empty_if_zero(OFFSET)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %ebx + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C ebx &src[size] + C edi &dst[2*size-5] + + movl -12(%ebx), %ebp + + movl -8(%ebx), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + adcl $0, %edx + + movl -4(%ebx), %eax + movl %edx, %esi + movl %eax, %ebx + + mull %ebp + + addl %esi, %eax + adcl $0, %edx + + addl %eax, (%edi) + adcl $0, %edx + + movl %edx, %esi + movl %ebx, %eax + + mull %ecx + + addl %esi, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. +C The loop measures about 6 cycles/iteration, though it looks like it should +C decode in 5. + +L(lshift_start): + movl PARAM_SIZE, %ecx + + movl PARAM_DST, %edi + subl $1, %ecx C size-1 and clear carry + + movl PARAM_SRC, %ebx + movl %ecx, %edx + + xorl %eax, %eax C ready for adcl + + + ALIGN(16) +L(lshift): + C eax + C ebx src (for later use) + C ecx counter, decrementing + C edx size-1 (for later use) + C esi + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + leal 8(%edi), %edi + loop L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%ebx), %eax C src[0] + + leal 4(%ebx,%edx,4), %ebx C &src[size] + subl %edx, %ecx C -(size-1) + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + + ALIGN(16) +L(diag): + C eax scratch + C ebx &src[size] + C ecx counter, negative + C edx carry + C esi scratch + C edi dst[2*size-2] + C ebp + + movl (%ebx,%ecx,4), %eax + movl %edx, %esi + + mull %eax + + addl %esi, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README new file mode 100644 index 0000000000..c34315c401 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/README @@ -0,0 +1,145 @@ + + AMD K7 MPN SUBROUTINES + + +This directory contains code optimized for the AMD Athlon CPU. + +The mmx subdirectory has routines using MMX instructions. All Athlons have +MMX, the separate directory is just so that configure can omit it if the +assembler doesn't support MMX. + + + +STATUS + +Times for the loops, with all code and data in L1 cache. + + cycles/limb + mpn_add/sub_n 1.6 + + mpn_copyi 0.75 or 1.0 \ varying with data alignment + mpn_copyd 0.75 or 1.0 / + + mpn_divrem_1 17.0 integer part, 15.0 fractional part + mpn_mod_1 17.0 + mpn_divexact_by3 8.0 + + mpn_l/rshift 1.2 + + mpn_mul_1 3.4 + mpn_addmul/submul_1 3.9 + + mpn_mul_basecase 4.42 cycles/crossproduct (approx) + + mpn_popcount 5.0 + mpn_hamdist 6.0 + +Prefetching of sources hasn't yet been tried. + + + +NOTES + +cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available. + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + +Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on +the speed of the multiplication routines. The documentation shows mul +executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that, +to get near 3 cycles code has to be arranged so that nothing else is issued +to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other +apparently equivalent code takes 5. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines and up to 64 for some. +The K7 has 64k L1 code cache so quite big unrolling is allowable. + +Computed jumps into the unrolling are used to handle sizes not a multiple of +the unrolling. An attractive feature of this is that times increase +smoothly with operand size, but it may be that some routines should just +have simple loops to finish up, especially when PIC adds between 2 and 16 +cycles to get %eip. + +Position independent code is implemented using a call to get %eip for the +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. 
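+
+As an illustration of the computed jumps into the unrolling mentioned
+above, the effect is similar to a Duff's-device style entry into an
+unrolled loop, though the actual routines compute a code address and jump
+to it rather than using a switch.  A rough C analogy only, not how any of
+the routines here are written:
+
+	void
+	copy4 (unsigned *dst, const unsigned *src, long size)  /* size > 0 */
+	{
+	  long  n = (size + 3) / 4;    /* passes through the loop body */
+	  switch (size % 4)            /* enter at the leftover count */
+	    {
+	    case 0: do {  *dst++ = *src++;
+	    case 3:       *dst++ = *src++;
+	    case 2:       *dst++ = *src++;
+	    case 1:       *dst++ = *src++;
+	            } while (--n > 0);
+	    }
+	}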
+ + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three direct-path instructions which have no +successive dependencies. K7 always decodes three and has out-of-order +execution, but the groupings show what slots might be available and what +dependency chains exist. + +When there's vector-path instructions an effort is made to get triplets of +direct-path instructions in between them, even if there's dependencies, +since this maximizes decoding throughput and might save a cycle or two if +decoding is the limiting factor. + + + +INSTRUCTIONS + +adcl direct +divl 39 cycles back-to-back +lodsl,etc vector +loop 1 cycle vector (decl/jnz opens up one decode slot) +movd reg vector +movd mem direct +mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word +popl vector (use movl for more than one pop) +pushl direct, will pair with a load +shrdl %cl vector, 3 cycles, seems to be 3 decode too +xorl r,r false read dependency recognised + + + +REFERENCES + +"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number +22007, revision E, November 1999. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf + +"3DNow Technology Manual", AMD publication number 21928F/0-August 1999. +This describes the femms and prefetch instructions. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21928.pdf + +"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD +publication number 22466, revision B, August 1999. This describes +instructions added in the Athlon processor, such as pswapd and the extra +prefetch forms. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general Athlon optimizations as well as +3DNow. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm new file mode 100644 index 0000000000..85fa9d3036 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/aors_n.asm @@ -0,0 +1,250 @@ +dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. +dnl +dnl K7: 1.64 cycles/limb (at 16 limb/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 1.9 +dnl 16 1.64 +dnl 32 1.7 +dnl 64 2.0 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C This code runs at 1.64 cycles/limb, which is probably the best possible +C with plain integer operations. Each limb is 2 loads and 1 store, and in +C one cycle the K7 can do two loads, or a load and a store, leading to 1.5 +C c/l. + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +defframe(SAVE_EDI, -16) +deflit(STACK_SPACE, 16) + + .text + ALIGN(32) +deflit(`FRAME',0) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp LF(M4_function_n,start) +EPILOGUE() + +PROLOGUE(M4_function_n) + + xorl %eax, %eax C carry +L(start): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_SRC2, %edx + movl PARAM_SRC1, %ebx + jae L(unroll) + + movl PARAM_DST, %edi + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + shrl %eax + + C This loop in in a single 16 byte code block already, so no + C alignment necessary. +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + incl %ecx + jnz L(simple) + + movl $0, %eax + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + setc %al + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- + C This is at 0x55, close enough to aligned. 
+L(unroll): +deflit(`FRAME',STACK_SPACE) + movl %ebp, SAVE_EBP + andl $-2, %ecx C size low bit masked out + andl $1, PARAM_SIZE C size low bit kept + + movl %ecx, %edi + decl %ecx + movl PARAM_DST, %ebp + + shrl $UNROLL_LOG2, %ecx + negl %edi + movl %esi, SAVE_ESI + + andl $UNROLL_MASK, %edi + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edi,%edi,8), %esi C 9 bytes per +') + negl %edi + shrl %eax + + leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx + leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx + leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edi,%edi,8), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax zero + C ebx src1 + C ecx counter + C edx src2 + C esi scratch (was computed jump) + C edi dst + C ebp scratch + + leal UNROLL_BYTES(%edx), %edx + +L(entry): +deflit(CHUNK_COUNT, 2) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %esi) + movl disp1(%ebx), %ebp +Zdisp( M4_inst,disp0,(%edx), %esi) +Zdisp( movl, %esi, disp0,(%edi)) + M4_inst disp1(%edx), %ebp + movl %ebp, disp1(%edi) +') + + decl %ecx + leal UNROLL_BYTES(%ebx), %ebx + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + mov PARAM_SIZE, %esi + movl SAVE_EBP, %ebp + movl $0, %eax + + decl %esi + js L(even) + + movl (%ebx), %ecx + M4_inst UNROLL_BYTES(%edx), %ecx + movl %ecx, (%edi) +L(even): + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + setc %al + + movl SAVE_ESI, %esi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm new file mode 100644 index 0000000000..9f9c3daaf4 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/aorsmul_1.asm @@ -0,0 +1,364 @@ +dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K7: 3.9 cycles/limb. +dnl +dnl Future: It should be possible to avoid the separate mul after the +dnl unrolled loop by moving the movl/adcl to the top. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 4.42 +dnl 8 4.16 +dnl 16 3.9 +dnl 32 3.9 +dnl 64 3.87 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(SAVE_SIZE, 16) + + .text + ALIGN(32) +PROLOGUE(M4_function_1) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + xorl %ecx, %ecx + + decl %edx + jnz LF(M4_function_1c,start_1) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + M4_inst %eax, (%ecx) + adcl $0, %edx + movl %edx, %eax + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(M4_function_1c) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + decl %edx + jnz L(more_than_one_limb) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + addl PARAM_CARRY, %eax + + adcl $0, %edx + M4_inst %eax, (%ecx) + + adcl $0, %edx + movl %edx, %eax + + ret + + + C offset 0x44 so close enough to aligned +L(more_than_one_limb): + movl PARAM_CARRY, %ecx +L(start_1): + C eax src + C ecx initial carry + C edx size-1 + subl $SAVE_SIZE, %esp +deflit(`FRAME',16) + + movl %ebx, SAVE_EBX + movl %esi, SAVE_ESI + movl %edx, %ebx C size-1 + + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + cmpl $UNROLL_THRESHOLD, %edx + + movl PARAM_MULTIPLIER, %ebp + movl %edi, SAVE_EDI + + movl (%esi), %eax C src low limb + movl PARAM_DST, %edi + ja L(unroll) + + + C simple loop + + leal 4(%esi,%ebx,4), %esi C point one limb past last + leal (%edi,%ebx,4), %edi C point at last limb + negl %ebx + + C The movl to load the next source limb is done well ahead of the + C mul. This is necessary for full speed, and leads to one limb + C handled separately at the end. 
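+C
+C In C terms the operation performed is as follows (addmul flavour shown;
+C the submul flavour subtracts the products and returns a borrow instead).
+C This is a sketch assuming 32-bit limbs, with a made-up ref_ name, not
+C the actual GMP source:
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	              mp_limb_t mult)
+C	{
+C	  mp_limb_t  carry = 0;
+C	  mp_size_t  i;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      unsigned long long  p =
+C	        (unsigned long long) src[i] * mult + carry;
+C	      unsigned long long  s = (unsigned long long) dst[i] + (mp_limb_t) p;
+C	      dst[i] = (mp_limb_t) s;
+C	      carry  = (mp_limb_t) (p >> 32) + (mp_limb_t) (s >> 32);
+C	    }
+C	  return carry;
+C	}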
+ +L(simple): + C eax src limb + C ebx loop counter + C ecx carry limb + C edx scratch + C esi src + C edi dst + C ebp multiplier + + mull %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi,%ebx,4) + movl (%esi,%ebx,4), %eax + adcl $0, %edx + + incl %ebx + movl %edx, %ecx + jnz L(simple) + + + mull %ebp + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi) + adcl $0, %edx + movl SAVE_EDI, %edi + + addl $SAVE_SIZE, %esp + movl %edx, %eax + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax src low limb + C ebx size-1 + C ecx carry + C edx size-1 + C esi src + C edi dst + C ebp multiplier + +dnl overlapping with parameters no longer needed +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_MULTIPLIER') + + subl $2, %ebx C (size-2)-1 + decl %edx C size-2 + + shrl $UNROLL_LOG2, %ebx + negl %edx + + movl %ebx, VAR_COUNTER + andl $UNROLL_MASK, %edx + + movl %edx, %ebx + shll $4, %edx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ebx,1), %edx +') + negl %ebx + movl %edx, VAR_JUMP + + mull %ebp + + addl %eax, %ecx C initial carry, becomes low carry + adcl $0, %edx + testb $1, %bl + + movl 4(%esi), %eax C src second limb + leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi + + movl %edx, %ebx C high carry + cmovnz( %ecx, %ebx) C high,low carry other way around + cmovnz( %edx, %ecx) + + jmp *VAR_JUMP + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ebx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------------------------- +C This code uses a "two carry limbs" scheme. At the top of the loop the +C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For +C the computed jump an odd size means they start one way around, an even +C size the other. Either way one limb is handled separately at the start of +C the loop. +C +C The positioning of the movl to load the next source limb is important. +C Moving it after the adcl with a view to avoiding a separate mul at the end +C of the loop slows the code down. 
+ + ALIGN(32) +L(top): + C eax src limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src+8 + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 17 bytes each limb + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + + mull %ebp + +Zdisp( M4_inst,%ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ecx + + + mull %ebp + + M4_inst %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx + + movl disp1(%esi), %eax + adcl %edx, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jns L(top) + + + C eax src limb + C ebx carry high + C ecx carry low + C edx + C esi + C edi dst (points at second last limb) + C ebp multiplier +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + mull %ebp + + M4_inst %ecx, disp0(%edi) + movl SAVE_EBP, %ebp + + adcl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + adcl $0, %edx + M4_inst %eax, disp1(%edi) + movl SAVE_EDI, %edi + + adcl $0, %edx + addl $SAVE_SIZE, %esp + + movl %edx, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm new file mode 100644 index 0000000000..57684958a5 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/diveby3.asm @@ -0,0 +1,131 @@ +dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K7: 8.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_SIZE, %ebp + pushl %edi defframe_pushl(SAVE_EDI) + + movl (%ecx), %eax C src low limb + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_DST, %edi + movl $TWO_THIRDS_FLOOR, %esi + leal -4(%ecx,%ebp,4), %ecx C &src[size-1] + + subl %ebx, %eax + + setc %bl + decl %ebp + jz L(last) + + leal (%edi,%ebp,4), %edi C &dst[size-1] + negl %ebp + + + ALIGN(16) +L(top): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp counter, limbs, negative + + imull $INVERSE_3, %eax, %edx + + movl 4(%ecx,%ebp,4), %eax C next src limb + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %ebx C +1 if result>=ceil(b/3) + cmpl %edx, %esi + + sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3) + movl %edx, (%edi,%ebp,4) + incl %ebp + + setc %bl C new carry + jnz L(top) + + + +L(last): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi multiplier + C edi &dst[size-1] + C ebp + + imull $INVERSE_3, %eax + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi) + movl SAVE_EBP, %ebp + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl %eax, %esi + movl $0, %eax + + adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3) + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h new file mode 100644 index 0000000000..c3bba0afc4 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/gmp-mparam.h @@ -0,0 +1,100 @@ +/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* the low limb is ready after 4 cycles, but normally it's the high limb + which is of interest, and that comes out after 6 cycles */ +#ifndef UMUL_TIME +#define UMUL_TIME 6 /* cycles */ +#endif + +/* AMD doco says 40, but it measures 39 back-to-back */ +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +/* using bsf */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 177 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 173 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 76 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 114 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 34 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 736 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 6912 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 800 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 8448 +#endif diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm new file mode 100644 index 0000000000..33214daa1f --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/copyd.asm @@ -0,0 +1,136 @@ +dnl AMD K7 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The various comments in mpn/x86/k7/copyi.asm apply here too. 
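+C
+C The operation is simply a limb-by-limb copy done from the top limb
+C downwards, i.e. in C terms (a sketch; the code below uses 64-bit MMX
+C moves and handles the alignment cases):
+C
+C	for (i = size-1; i >= 0; i--)
+C	  dst[i] = src[i];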
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') +define(SAVE_ESI,`PARAM_SRC') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyd) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src + C ebx scratch + C ecx counter + C edx dst + C + C this loop is 2 cycles/limb + + movl -4(%eax,%ecx,4), %ebx + movl %ebx, -4(%edx,%ecx,4) + decl %ecx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %esi, SAVE_ESI + leal (%eax,%ecx,4), %ebx + leal (%edx,%ecx,4), %esi + + andl %esi, %ebx + movl SAVE_ESI, %esi + subl $4, %ecx C size-4 + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl 12(%eax,%ecx,4), %ebx + movl %ebx, 12(%edx,%ecx,4) + decl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, limbs + C edx dst + + movq 8(%eax,%ecx,4), %mm0 + movq (%eax,%ecx,4), %mm1 + subl $4, %ecx + movq %mm0, 16+8(%edx,%ecx,4) + movq %mm1, 16(%edx,%ecx,4) + jns L(top) + + + C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %cl + jz L(finish_not_two) + + movq 8(%eax,%ecx,4), %mm0 + movq %mm0, 8(%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jz L(done) + + movl (%eax), %ebx + movl %ebx, (%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm new file mode 100644 index 0000000000..b234a1628c --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/copyi.asm @@ -0,0 +1,147 @@ +dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size. +C +C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at +C 1.33 c/l. +C +C The K7 can do two loads, or two stores, or a load and a store, in one +C cycle, so if those are 64-bit operations then 0.5 c/l should be possible, +C however nothing under 0.7 c/l is known. 
+C +C If both source and destination are unaligned then one limb is processed at +C the start to make them aligned and so get 0.75 c/l, whereas if they'd been +C used unaligned it would be 1.5 c/l. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src, incrementing + C ebx scratch + C ecx counter + C edx dst, incrementing + C + C this loop is 2 cycles/limb + + movl (%eax), %ebx + movl %ebx, (%edx) + decl %ecx + leal 4(%eax), %eax + leal 4(%edx), %edx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %eax, %ebx + leal -12(%eax,%ecx,4), %eax C src end - 12 + subl $3, %ecx C size-3 + + andl %edx, %ebx + leal (%edx,%ecx,4), %edx C dst end - 12 + negl %ecx + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + incl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src end - 12 + C ebx + C ecx counter, negative, limbs + C edx dst end - 12 + + movq (%eax,%ecx,4), %mm0 + movq 8(%eax,%ecx,4), %mm1 + addl $4, %ecx + movq %mm0, -16(%edx,%ecx,4) + movq %mm1, -16+8(%edx,%ecx,4) + ja L(top) C jump no carry and not zero + + + C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %cl + jnz L(finish_not_two) + + movq (%eax,%ecx,4), %mm0 + movq %mm0, (%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jnz L(done) + + movl 8(%eax), %ebx + movl %ebx, 8(%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm new file mode 100644 index 0000000000..483ad6a9a1 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm @@ -0,0 +1,718 @@ +dnl AMD K7 mpn_divrem_1 -- mpn by limb division. +dnl +dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C The "and"s shown in the paper are done here with "cmov"s. "m" is written +C for m', and "d" for d_norm, which won't cause any confusion since it's +C only the normalized divisor that's of any use in the code. "b" is written +C for 2^N, the size of a limb, N being 32 here. +C +C mpn_divrem_1 avoids one division if the src high limb is less than the +C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal +C circumstances that will be a very rare event. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 42 c/l. +dnl +dnl At 3 limbs the mul is a touch faster than div on the integer part, and +dnl even more so on the fractional part. + +deflit(MUL_THRESHOLD, 3) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(32) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + jz L(no_skip_div) + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C one less div if high<divisor + jnb L(no_skip_div) + + movl $0, (%edi,%ecx,4) C dst high limb + decl %ecx C size-1 + movl %eax, %edx C src high limb as initial carry +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + +C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs. +C It'd be possible to write them out without the looping, but no speedup +C would be expected. 
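As a picture of what both the plain divl path here and the multiply-by-inverse path further down compute: divide the size source limbs, followed by xsize zero "fraction" limbs, by the divisor, storing quotient limbs from the most significant end down and returning the final remainder. A hedged C sketch, assuming 32-bit limbs and using a 64-bit temporary where the assembler uses the edx:eax pair (ref_divrem_1 is an illustrative name, not the generic GMP routine):

    #include <stdint.h>

    static uint32_t
    ref_divrem_1 (uint32_t *dst, long xsize,
                  const uint32_t *src, long size, uint32_t divisor)
    {
      uint32_t rem = 0;
      long i;

      for (i = size - 1; i >= 0; i--)       /* integer part */
        {
          uint64_t n = ((uint64_t) rem << 32) | src[i];
          dst[xsize + i] = (uint32_t) (n / divisor);
          rem = (uint32_t) (n % divisor);
        }
      for (i = xsize - 1; i >= 0; i--)      /* fraction part, source limbs are 0 */
        {
          uint64_t n = (uint64_t) rem << 32;
          dst[i] = (uint32_t) (n / divisor);
          rem = (uint32_t) (n % divisor);
        }
      return rem;
    }

mpn_divrem_1c differs only in seeding rem with the given carry instead of 0.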
+C +C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the +C integer part, but curiously not on the fractional part, where %ebp is a +C (fixed) couple of cycles faster. + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + movl %edx, %eax + + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + bsrl %ebp, %eax C 31-l + + leal 12(%edi), %ebx + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ebx, VAR_DST_STOP + + movl %ecx, %ebx C size + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + orl %ebx, %ebx C size + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + jz L(start_zero) + movl %eax, VAR_SRC + cmpl $1, %ebx + + movl 8(%eax), %esi C src high limb + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + movl %eax, VAR_SRC + jmp L(integer_one_left) + + +L(start_zero): + shll %cl, %edi C n2 = carry << l + movl $0, %esi C n10 = 0 + + C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C The multiply by inverse loop is 17 cycles, and relies on some out-of-order +C execution. The instruction scheduling is important, with various +C apparently equivalent forms running 1 to 5 cycles slower. +C +C A lower bound for the time would seem to be 16 cycles, based on the +C following successive dependencies. +C +C cycles +C n2+n1 1 +C mul 6 +C q1+1 1 +C mul 6 +C sub 1 +C addback 1 +C --- +C 16 +C +C This chain is what the loop has already, but 16 cycles isn't achieved. +C K7 has enough decode, and probably enough execute (depending maybe on what +C a mul actually consumes), but nothing running under 17 has been found. +C +C In theory n2+n1 could be done in the sub and addback stages (by +C calculating both n2 and n2+n1 there), but lack of registers makes this an +C unlikely proposition. +C +C The jz in the loop keeps the q1+1 stage to 1 cycle. 
Handling an overflow +C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent +C chain, and nothing better than 18 cycles has been found when using it. +C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will +C be an extremely rare event. +C +C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but +C if some special data is coming out with this always, the q1_ff special +C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to +C induce the q1_ff case, for speed measurements or testing. Note that +C 0xFFF...FFF divided by 1 or 2 doesn't induce it. +C +C The instruction groupings and empty comments show the cycles for a naive +C in-order view of the code (conveniently ignoring the load latency on +C VAR_INVERSE). This shows some of where the time is going, but is nonsense +C to the extent that out-of-order execution rearranges it. In this case +C there's 19 cycles shown, but it executes at 17. + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl VAR_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next limb and the one below it + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + movl VAR_DST, %ecx + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + + leal -4(%ecx), %ecx + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %eax, %ecx + + movl %ebx, (%ecx) + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 1 cycle (each). 
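Stripped of the scheduling, one trip around the loop above is a single invariant-integer division step. Written out in portable C it looks roughly as follows; this is a hedged sketch of the method as the comments describe it, not the GMP source. Here d is the normalized divisor (high bit set), inv is the precomputed floor((b*(b-d)-1)/d) with b=2^32, and n2:n10 is the current two-limb numerator with n2 < d.

    #include <stdint.h>

    /* One quotient limb by the multiply-by-inverse method; returns q and
       leaves the new remainder (the next n2) in *rem. */
    static uint32_t
    udiv_preinv_step (uint32_t *rem, uint32_t n2, uint32_t n10,
                      uint32_t d, uint32_t inv)
    {
      uint32_t n1 = n10 >> 31;                      /* top bit of n10 */
      uint32_t nadj = n10 + (n1 ? d : 0);           /* mod 2^32, as in the code */
      uint64_t prod = (uint64_t) inv * (n2 + n1) + nadj;
      uint32_t q1 = n2 + (uint32_t) (prod >> 32);   /* q1 is q or q-1 */

      uint64_t n = ((uint64_t) n2 << 32) | n10;
      uint64_t r = n - (uint64_t) d * (q1 + 1ULL);  /* try q1+1 first */

      if (r > 0xFFFFFFFFULL)                        /* borrow: q1+1 was one too big */
        {
          *rem = (uint32_t) (r + d);
          return q1;
        }
      *rem = (uint32_t) r;
      return q1 + 1;
    }

Because the trial product is formed in 64 bits, the q1+1 overflow that the jz/q1_ff special case handles in the assembler needs no separate branch here.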
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx dst + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl VAR_DST_STOP, %ecx + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + +L(integer_none): + cmpl $0, PARAM_XSIZE + jne L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl %ecx, VAR_DST + + movd %mm0, %esi C next n10 + + movl $-1, (%ecx) + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C Being the fractional part, the "source" limbs are all zero, meaning +C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. +C +C The loop runs at 15 cycles. The dependent chain is the same as the +C general case above, but without the n2+n1 stage (due to n1==0), so 15 +C would seem to be the lower bound. +C +C A not entirely obvious simplification is that q1+1 never overflows a limb, +C and so there's no need for the sbbl $0 or jz q1_ff from the general case. 
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. +C rnd() means rounding down to a multiple of d. +C +C m*n2 + b*n2 <= m*(d-1) + b*(d-1) +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b +C +C Unchanged from the general case is that the final quotient limb q can be +C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from +C equation 8.4 of the paper which simplifies as follows when n1==0 and +C n0==0. +C +C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b +C +C As before, the instruction groupings and empty comments show a naive +C in-order view of the code, which is made a nonsense by out of order +C execution. There's 17 cycles shown, but it executes at 15. +C +C Rotating the store q and remainder->n2 instructions up to the top of the +C loop gets the run time down from 16 to 15. + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + jmp L(fraction_entry) + + + ALIGN(16) +L(fraction_top): + C eax n2 carry, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi (will be n2) + C ebp divisor + + movl %ebx, (%ecx) C previous q + movl %eax, %edi C remainder->n2 + +L(fraction_entry): + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + C + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %esi, %ecx + + jne L(fraction_top) + + + movl %ebx, (%ecx) + jmp L(fraction_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm new file mode 100644 index 0000000000..4d17c881ec --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/lshift.asm @@ -0,0 +1,472 @@ +dnl AMD K7 mpn_lshift -- mpn left shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. The bits shifted out at the left are +C the return value. +C +C The comments in mpn_rshift apply here too. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx + + shldl( %cl, %edx, %eax) C eax was decremented to zero + + shll %cl, %edx + + movl %edx, (%edi) + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 + movd (%edx,%eax,4), %mm5 C src high limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + negl %ecx + movd (%edx), %mm4 C src low limb + + addl $32, %ecx + + movd %ecx, %mm7 + +L(simple_top): + C eax loop counter, limbs + C ebx + C ecx + C edx src + C esi + C edi dst + C ebp + C + C mm0 scratch + C mm4 src low limb + C mm5 src high limb + C mm6 shift + C mm7 32-shift + + movq -4(%edx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%edi,%eax,4) + jnz L(simple_top) + + + psllq %mm6, %mm5 + psllq %mm6, %mm4 + + psrlq $32, %mm5 + movd %mm4, (%edi) C dst low limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx (saved) + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src high limb, for return value + C mm6 lshift + + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + leal -4(%edx,%eax,4), %edx C &src[size-2] + + testb $4, %dl + movq (%edx), %mm1 C src high qword + + jz L(start_src_aligned) + + + C src isn't aligned, process high limb (marked xxx) separately to + C make it so + C + C source -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest -4(edi,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + psllq %mm6, %mm1 + subl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + psrlq $32, %mm1 + decl %eax C size-2 is new size-1 + + movd %mm1, 4(%edi,%eax,4) + movq (%edx), %mm1 C new src high qword +L(start_src_aligned): + + + leal -4(%edi,%eax,4), %edi C &dst[size-2] + psllq %mm6, %mm5 + + testl $4, %edi + psrlq $32, %mm5 C return value + + jz L(start_dst_aligned) + + + C dst isn't aligned, subtract 4 bytes to make it so, and pretend the + C shift is 32 bits extra. High limb of dst (marked xxx) handled + C here separately. 
+ C + C source %edx + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest %edi + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + psllq %mm6, %mm1 + addl $32, %ecx C shift+32 + + psrlq $32, %mm1 + + movd %mm1, 4(%edi) + movq %mm0, %mm1 + subl $4, %edi + + movd %ecx, %mm6 C new lshift +L(start_dst_aligned): + + decl %eax C size-2, two last limbs handled at end + movq %mm1, %mm2 C copy of src high qword + negl %ecx + + andl $-2, %eax C round size down to even + addl $64, %ecx + + movl %eax, %ebx + negl %eax + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C rshift = 64-lshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax size (for use at end) + C ebx loop counter + C ecx rshift + C edx src + C esi computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating, mm2 first) + C mm2 / + C mm6 lshift + C mm7 rshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 - 8)) + + movq disp0(%edx), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psllq %mm6, %mm1 + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + subl $UNROLL_BYTES, %edx + subl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + + +define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))') + +L(end): + testb $1, %al + movl SAVE_EBX, %ebx + psllq %mm6, %mm2 C wanted left shifted in all cases below + + movd %mm5, %eax + + movl SAVE_ESI, %esi + jz L(end_even) + + +L(end_odd): + + C Size odd, destination was aligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+-------+ + C | written | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at (%edi), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. 
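All the alignment cases being juggled here implement one simple operation; only the MMX packaging differs. For orientation, a behavioural C sketch, assuming 32-bit limbs and 1 <= shift <= 31 (ref_lshift is an illustrative name, and mpn_rshift later in this directory is the mirror image working from the low end up):

    #include <stdint.h>

    /* Shift src[0..size-1] left by shift bits into dst and return the bits
       shifted out of the top limb.  Working from the high limb down keeps it
       safe when dst overlaps src from above. */
    static uint32_t
    ref_lshift (uint32_t *dst, const uint32_t *src, long size, unsigned shift)
    {
      uint32_t retval = src[size - 1] >> (32 - shift);
      long i;

      for (i = size - 1; i > 0; i--)
        dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
      dst[0] = src[0] << shift;
      return retval;
    }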
+ + movd disp(4) (%edx), %mm0 + testb $32, %cl + + movq %mm0, %mm1 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + psllq %mm6, %mm1 + + por %mm2, %mm0 + + movq %mm0, disp(0) (%edi) + jz L(end_odd_unaligned) + movd %mm1, disp(-4) (%edi) +L(end_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi+4 + C --+---------------+-------+ + C | written | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movq for the aligned case overwrites the movd for the + C unaligned case. + + movq %mm2, %mm0 + psrlq $32, %mm2 + + testb $32, %cl + movd %mm2, disp(4) (%edi) + + jz L(end_even_unaligned) + movq %mm0, disp(0) (%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm new file mode 100644 index 0000000000..545ca56ddf --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm @@ -0,0 +1,457 @@ +dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. +dnl +dnl K7: 17.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is the same as mpn_divrem_1, but with the quotient +C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 41 c/l. +dnl +dnl Using mul or div is about the same speed at 3 limbs, so the threshold +dnl is set to 4 to get the smaller div code used at 3. 
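Since mpn_mod_1 is mpn_divrem_1 with the quotient discarded, the reference picture is correspondingly simpler. A hedged C sketch of the plain divl path, assuming 32-bit limbs (ref_mod_1 is an illustrative name, not the generic routine):

    #include <stdint.h>

    static uint32_t
    ref_mod_1 (const uint32_t *src, long size, uint32_t divisor)
    {
      uint32_t rem = 0;
      long i;

      for (i = size - 1; i >= 0; i--)
        {
          uint64_t n = ((uint64_t) rem << 32) | src[i];
          rem = (uint32_t) (n % divisor);
        }
      return rem;
    }

mpn_mod_1c starts rem at the given carry instead of 0.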
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(32) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(32) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + +C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations, +C but it's already fast and compact, and there's nothing to gain by +C expanding it out. +C +C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp. + + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + movl %edx, %eax + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + bsrl %ebp, %eax C 31-l + + movl %ebx, SAVE_EBX + leal -4(%esi), %ebx + + movl %ebx, VAR_SRC_STOP + movl %edi, SAVE_EDI + + movl %ecx, %ebx C size + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. +C Unfortunately this doesn't get the code down to the theoretical 16 c/l. +C +C There's four dummy instructions in the loop, all of which are necessary +C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed, +C or changed from load to store or vice versa. 
They're not completely +C random, since they correspond to what mpn_divrem_1 has, but there's no +C obvious reason why they're necessary. Presumably they induce something +C good in the out of order execution, perhaps through some load/store +C ordering and/or decoding effects. +C +C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On +C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at +C about 13.5 c/l. + + ALIGN(32) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SIZE, %ebx C dummy + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + movl %ecx, PARAM_SIZE C dummy + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + nop C dummy + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + leal 0(%ecx), %ecx C dummy + + C + + C + + subl %eax, %esi + movl VAR_SRC_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + cmpl %eax, %ecx + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + subl %eax, %esi + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%eax), %edx + movl 
SAVE_EBP, %ebp + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + movl VAR_SRC_STOP, %edx + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(inverse_top) + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm new file mode 100644 index 0000000000..fa7c8c04a5 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/popham.asm @@ -0,0 +1,239 @@ +dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on +dnl FreeBSD 3.3 and 3.4 doesn't recognise it. + +define(psadbw_mm4_mm0, +`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon', + `HAVE_TARGET_CPU_pentium3'),1, + `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0', + +`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only +') C this works enough for the sum of bytes done below, making it + C possible to test on an older cpu + leal -8(%esp), %esp + movq %mm4, (%esp) + movq %mm0, %mm4 +forloop(i,1,7, +` psrlq $ 8, %mm4 + paddb %mm4, %mm0 +') + pushl $ 0 + pushl $ 0xFF + pand (%esp), %mm0 + movq 8(%esp), %mm4 + leal 16(%esp), %esp +')') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here is almost certainly not optimal, but is already a 3x speedup +C over the generic C code. The main improvement would be to interleave +C processing of two qwords in the loop so as to fully exploit the available +C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). +C +C The loop is based on the example "Efficient 64-bit population count using +C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, +C page 158 of rev E (reference in mpn/x86/k7/README). 
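The per-qword reduction the loop performs is the classic bit-slicing population count, with psadbw standing in for the final sum of bytes. In portable C the same reduction is (a hedged sketch; the closing multiply is just one convenient way to add the eight byte counts):

    #include <stdint.h>

    static unsigned
    popcount64 (uint64_t x)
    {
      x -= (x >> 1) & 0x5555555555555555ULL;           /* 2-bit counts */
      x = (x & 0x3333333333333333ULL)
        + ((x >> 2) & 0x3333333333333333ULL);          /* 4-bit counts */
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;      /* byte counts */
      return (unsigned) ((x * 0x0101010101010101ULL) >> 56);  /* sum of bytes */
    }

mpn_hamdist applies the same reduction to src[i] XOR src2[i], which is exactly what the pxor ahead of L(loaded) below arranges.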
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + DATA + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + .text + ALIGN(32) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + +',` + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + pxor %mm4, %mm4 + +define(REG_AAAAAAAAAAAAAAAA,%mm7) +define(REG_3333333333333333,%mm6) +define(REG_0F0F0F0F0F0F0F0F,%mm5) +define(REG_0000000000000000,%mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + + movd (%eax,%ecx,8), %mm1 + +HAM(` movd 0(%edx,%ecx,8), %mm0 + pxor %mm0, %mm1 +') + orl %ecx, %ecx + jmp L(loaded) + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 + +HAM(` pxor -8(%edx,%ecx,8), %mm1') + decl %ecx + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 + + psubd %mm1, %mm0 C bit pairs + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + + psadbw_mm4_mm0 + + paddd %mm0, %mm2 C add to total + jnz L(top) + + + movd %mm2, %eax + emms + ret + + +L(zero): + movl $0, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm new file mode 100644 index 0000000000..abb546cd5b --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/rshift.asm @@ -0,0 +1,471 @@ +dnl AMD K7 mpn_rshift -- mpn right shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. The bits shifted out at the right are +C the return value. +C +C This code uses 64-bit MMX operations, which makes it possible to handle +C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer +C code, on the other hand, suffers from shrd being a vector path decode and +C running at 3 cycles back-to-back. +C +C Full speed depends on source and destination being aligned, and some hairy +C setups and finish-ups are done to arrange this for the loop. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx C src limb + + shrdl( %cl, %edx, %eax) C eax was decremented to zero + + shrl %cl, %edx + + movl %edx, (%edi) C dst limb + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 C rshift + movd (%edx), %mm5 C src low limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + leal (%edx,%eax,4), %edx C &src[size-1] + leal -4(%edi,%eax,4), %edi C &dst[size-2] + + movd (%edx), %mm4 C src high limb + negl %eax + + +L(simple_top): + C eax loop counter, limbs, negative + C ebx + C ecx shift + C edx carry + C edx &src[size-1] + C edi &dst[size-2] + C ebp + C + C mm0 scratch + C mm4 src high limb + C mm5 src low limb + C mm6 shift + + movq (%edx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edi,%eax,4) + jnz L(simple_top) + + + psllq $32, %mm5 + psrlq %mm6, %mm4 + + psrlq %mm6, %mm5 + movd %mm4, 4(%edi) C dst high limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src low limb + C mm6 rshift + + testb $4, %dl + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. 
+ C + C source edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edi + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%edx), %mm0 C src low two limbs + addl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + addl $4, %edi + decl %eax C size-2 is new size-1 + + psrlq %mm6, %mm0 + movl %edi, PARAM_DST C new dst + + movd %mm0, -4(%edi) +L(start_src_aligned): + + + movq (%edx), %mm1 C src low two limbs + decl %eax C size-2, two last limbs handled at end + testl $4, %edi + + psrlq %mm6, %mm5 + jz L(start_dst_aligned) + + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here separately. + C + C source edx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edi + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + psrlq %mm6, %mm1 + addl $32, %ecx C shift+32 + + movd %mm1, (%edi) + movq %mm0, %mm1 + addl $4, %edi C new dst + + movd %ecx, %mm6 +L(start_dst_aligned): + + + movq %mm1, %mm2 C copy of src low two limbs + negl %ecx + andl $-2, %eax C round size down to even + + movl %eax, %ebx + negl %eax + addl $64, %ecx + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C lshift = 64-rshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi + negl %eax +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + negl %eax + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(64) +L(top): + C eax size, for use at end + C ebx loop counter + C ecx lshift + C edx src + C esi was computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating) + C mm2 / + C mm6 rshift + C mm7 lshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 8)) + + movq disp0(%edx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psrlq %mm6, %mm1 + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + addl $UNROLL_BYTES, %edx + addl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 8)) + + testb $1, %al + psrlq %mm6, %mm2 C wanted rshifted in all cases below + movl SAVE_ESI, %esi + + movd %mm5, %eax C return value + + movl SAVE_EBX, %ebx + jz L(end_even) + + + C Size odd, destination was aligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +-------+---------------+---------------+-- + C | | | written | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. 
+ C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | written | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword to store, and in the aligned case there's + C a further extra limb of dst to be formed. + + + movd disp0(%edx), %mm0 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + testb $32, %cl + + por %mm2, %mm0 + psrlq %mm6, %mm1 + + movq %mm0, disp0(%edi) + jz L(finish_odd_unaligned) + + movd %mm1, disp1(%edi) +L(finish_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is the same data as the movq for + C the aligned case, it's just a choice between whether one or two + C limbs should be written. + + + testb $32, %cl + movd %mm2, disp0(%edi) + + jz L(end_even_unaligned) + + movq %mm2, disp0(%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm new file mode 100644 index 0000000000..07f7085b10 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mul_1.asm @@ -0,0 +1,265 @@ +dnl AMD K7 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K7: 3.4 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 3.9 +dnl 16 3.4 +dnl 32 3.4 +dnl 64 3.35 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the destination. 
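In C the whole operation is one carry-propagating loop; everything below is about making that loop issue well on the K7. A hedged sketch, assuming 32-bit limbs (ref_mul_1 is an illustrative name):

    #include <stdint.h>

    static uint32_t
    ref_mul_1 (uint32_t *dst, const uint32_t *src, long size, uint32_t multiplier)
    {
      uint32_t carry = 0;      /* mpn_mul_1c would start from the given carry */
      long i;

      for (i = 0; i < size; i++)
        {
          uint64_t p = (uint64_t) src[i] * multiplier + carry;
          dst[i] = (uint32_t) p;              /* low limb of the product */
          carry = (uint32_t) (p >> 32);       /* high limb becomes the carry */
        }
      return carry;
    }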
+C +C Variations on the unrolled loop have been tried, with the current +C registers or with the counter on the stack to free up ecx. The current +C code is the fastest found. +C +C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)" +C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code +C with this change can be tested on sizes of the form UNROLL_COUNT*n+1 +C without having to change the computed jump. There's obviously something +C fishy going on, perhaps with what execution units the mul needs. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_EDI, -8) +defframe(SAVE_ESI, -12) +defframe(SAVE_EBX, -16) +deflit(STACK_SPACE, 16) + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 7) +',` +deflit(UNROLL_THRESHOLD, 5) +') + + .text + ALIGN(32) +PROLOGUE(mpn_mul_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + jmp LF(mpn_mul_1,start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + xorl %edx, %edx C initial carry +L(start_nc): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME', STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + movl %edx, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + movl %ebp, SAVE_EBP + jae L(unroll) + + leal (%esi,%ecx,4), %esi + leal (%edi,%ecx,4), %edi + negl %ecx + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx carry + C ecx counter (negative) + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(simple) + + movl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- +C The mov to load the next source limb is done well ahead of the mul, this +C is necessary for full speed. It leads to one limb handled separately +C after the loop. +C +C When unrolling to 32 or more, an offset of +4 is used on the src pointer, +C to avoid having an 0x80 displacement in the code for the last limb in the +C unrolled loop. This is for a fair comparison between 16 and 32 unrolling. 
+ +ifelse(eval(UNROLL_COUNT >= 32),1,` +deflit(SRC_OFFSET,4) +',` +deflit(SRC_OFFSET,) +') + + C this is offset 0x62, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp +deflit(`FRAME', STACK_SPACE) + + leal -1(%ecx), %edx C one limb handled at end + leal -2(%ecx), %ecx C and ecx is one less than edx + movl %ebp, SAVE_EBP + + negl %edx + shrl $UNROLL_LOG2, %ecx C unrolled loop counter + movl (%esi), %eax C src low limb + + andl $UNROLL_MASK, %edx + movl PARAM_DST, %edi + + movl %edx, %ebp + shll $4, %edx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(add_eip_to_edx) +L(here): +',` + leal L(entry) (%edx,%ebp), %edx +') + negl %ebp + + leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi + movl PARAM_MULTIPLIER, %ebp + + jmp *%edx + + +ifdef(`PIC',` +L(add_eip_to_edx): + C See README.family about old gas bugs + leal (%edx,%ebp), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ---------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax next src limb + C ebx carry + C ecx counter + C edx scratch + C esi src+4 + C edi dst + C ebp multiplier + C + C 17 code bytes per limb processed + +L(entry): +forloop(i, 0, UNROLL_COUNT-1, ` + deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0))) + + mull %ebp + + addl %eax, %ebx +Zdisp( movl, disp_src,(%esi), %eax) +Zdisp( movl, %ebx, disp_dst,(%edi)) + + movl $0, %ebx + adcl %edx, %ebx +') + + decl %ecx + + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) + + mull %ebp + + addl %eax, %ebx + movl $0, %eax + movl SAVE_ESI, %esi + + movl %ebx, disp0(%edi) + movl SAVE_EBX, %ebx + movl SAVE_EDI, %edi + + adcl %edx, %eax + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm new file mode 100644 index 0000000000..c4be62e633 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mul_basecase.asm @@ -0,0 +1,593 @@ +dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16 +dnl limbs/loop unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7 UNROLL_COUNT cycles/product (at around 20x20) +dnl 8 4.67 +dnl 16 4.59 +dnl 32 4.42 +dnl Maximum possible with the current code is 32. 
+dnl +dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get +dnl done with a straight run through a block of code, no inner loop. Using +dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() startup +C calculations only once. The saving is 15-25% on typical sizes coming from +C the Karatsuba multiply code. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two) + je L(two_by_something) + + + C one limb by one limb + + mull (%edx) + + movl PARAM_WP, %ecx + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): +deflit(`FRAME',0) + decl PARAM_YSIZE + pushl %ebx defframe_pushl(`SAVE_EBX') + movl %eax, %ecx C yp low limb + + movl PARAM_WP, %ebx + pushl %esi defframe_pushl(`SAVE_ESI') + movl %edx, %esi C xp + + movl (%edx), %eax C xp low limb + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + + movl %esi, 4(%ebx) + movl SAVE_ESI, %esi + + adcl $0, %edx + + movl %edx, 8(%ebx) + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C Could load yp earlier into another register. + + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp + +dnl FRAME carries on from previous + + mull %ecx C xp[0] * yp[0] + + push %edi defframe_pushl(`SAVE_EDI') + movl %edx, %edi C carry, for wp[1] + + movl %eax, (%ebx) + movl 4(%esi), %eax + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + movl 4(%ecx), %ecx C yp[1] + movl %edi, 4(%ebx) + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + movl (%esi), %eax C xp[0] + + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + movl %edi, 8(%ebx) + + adcl $0, %esi + movl SAVE_EDI, %edi + movl %esi, 12(%ebx) + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 13-26 +C limb operations the Karatsuba code calls here with. 
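Structurally this is the schoolbook product: one mul_1-style pass for yp[0], then an addmul_1-style pass per remaining yp limb, each accumulating into wp at an increasing offset. A hedged C sketch of that structure, assuming 32-bit limbs (the ref_* names are illustrative; zeroing wp first lets one helper stand in for both passes, whereas the code below writes the first pass directly):

    #include <stdint.h>

    static uint32_t
    ref_addmul_1 (uint32_t *dst, const uint32_t *src, long size, uint32_t multiplier)
    {
      uint32_t carry = 0;
      long i;

      for (i = 0; i < size; i++)
        {
          uint64_t p = (uint64_t) src[i] * multiplier + dst[i] + carry;
          dst[i] = (uint32_t) p;
          carry = (uint32_t) (p >> 32);
        }
      return carry;
    }

    static void
    ref_mul_basecase (uint32_t *wp, const uint32_t *xp, long xsize,
                      const uint32_t *yp, long ysize)
    {
      long i, j;

      for (i = 0; i < xsize; i++)     /* clear the low half of the product area */
        wp[i] = 0;
      for (j = 0; j < ysize; j++)     /* one cross-product pass per yp limb */
        wp[xsize + j] = ref_addmul_1 (wp + j, xp, xsize, yp[j]);
    }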
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp + +dnl FRAME doesn't carry on from previous, no pushes yet here +defframe(`SAVE_EBX',-4) +defframe(`SAVE_ESI',-8) +defframe(`SAVE_EDI',-12) +defframe(`SAVE_EBP',-16) +deflit(`FRAME',0) + + subl $16, %esp +deflit(`FRAME',16) + + movl %edi, SAVE_EDI + movl PARAM_WP, %edi + + movl %ebx, SAVE_EBX + movl %ebp, SAVE_EBP + movl %eax, %ebp + + movl %esi, SAVE_ESI + xorl %ebx, %ebx + leal (%edx,%ecx,4), %esi C xp end + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi xp end + C edi wp end of mul1 + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl PARAM_XSIZE, %ecx + + movl %ebx, (%edi) C final carry + decl %edx + + jnz L(ysize_more_than_one) + + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + movl SAVE_EBP, %ebp + movl SAVE_ESI, %esi + addl $FRAME, %esp + + ret + + +L(ysize_more_than_one): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- + C simple addmul looping + C + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + leal 4(%eax,%edx,4), %ebp C yp end + negl %ecx + negl %edx + + movl (%esi,%ecx,4), %eax C xp low limb + movl %edx, PARAM_YSIZE C -(ysize-1) + incl %ecx + + xorl %ebx, %ebx C initial carry + movl %ecx, PARAM_XSIZE C -(xsize-1) + movl %ebp, PARAM_YP + + movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier + jmp L(simple_outer_entry) + + + C this is offset 0x121 so close enough to aligned +L(simple_outer_top): + C ebp ysize counter, negative + + movl PARAM_YP, %edx + movl PARAM_XSIZE, %ecx C -(xsize-1) + xorl %ebx, %ebx C carry + + movl %ebp, PARAM_YSIZE + addl $4, %edi C next position in wp + + movl (%edx,%ebp,4), %ebp C yp limb - multiplier + movl -4(%esi,%ecx,4), %eax C xp low limb + + +L(simple_outer_entry): + +L(simple_inner): + C eax xp limb + C ebx carry limb + C ecx loop counter (negative) + C edx scratch + C esi xp end + C edi wp end + C ebp multiplier + + mull %ebp + + addl %eax, %ebx + adcl $0, %edx + + addl %ebx, (%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + adcl $0, %edx + + incl %ecx + movl %edx, %ebx + jnz L(simple_inner) + + + mull %ebp + + movl PARAM_YSIZE, %ebp + addl %eax, %ebx + + adcl $0, %edx + addl %ebx, (%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, 4(%edi) + jnz L(simple_outer_top) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C +C The unrolled loop is the same as in mpn_addmul_1(), see that code for some +C comments. +C +C VAR_ADJUST is the negative of how many limbs the leals in the inner loop +C increment xp and wp. This is used to adjust back xp and wp, and rshifted +C to given an initial VAR_COUNTER at the top of the outer loop. +C +C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT +C up to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. 
+C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C The trick with VAR_ADJUST means it's only necessary to do one fetch in the +C outer loop to take care of xp, wp and the inner loop counter. + +defframe(VAR_COUNTER, -20) +defframe(VAR_ADJUST, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_EXTRA_SPACE, 16) + + +L(unroll): + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + movl PARAM_XP, %esi + movl 4(%eax), %ebp C multiplier (yp second limb) + leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing + + movl PARAM_WP, %edi + movl %eax, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 + decl %ecx C xsize-1 + + movl (%esi), %eax C xp low limb + andl $-UNROLL_MASK-1, %ebx + negl %ecx + + subl $VAR_EXTRA_SPACE, %esp +deflit(`FRAME',16+VAR_EXTRA_SPACE) + negl %ebx + andl $UNROLL_MASK, %ecx + + movl %ebx, VAR_ADJUST + movl %ecx, %edx + shll $4, %ecx + + sarl $UNROLL_LOG2, %ebx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%edx,1), %ecx +') + negl %edx + + movl %eax, VAR_XP_LOW + movl %ecx, VAR_JMP + leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling, + leal 4(%esi,%edx,4), %esi C and start at second limb + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%edx,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C -------------------------------------------------------------------------- + ALIGN(32) +L(unroll_outer_top): + C ebp ysize counter, negative + + movl VAR_ADJUST, %ebx + movl PARAM_YP, %edx + + movl VAR_XP_LOW, %eax + movl %ebp, PARAM_YSIZE C store incremented ysize counter + + leal 4(%edi,%ebx,4), %edi + leal (%esi,%ebx,4), %esi + sarl $UNROLL_LOG2, %ebx + + movl (%edx,%ebp,4), %ebp C yp next multiplier + movl VAR_JMP, %ecx + +L(unroll_outer_entry): + mull %ebp + + testb $1, %cl C and clear carry bit + movl %ebx, VAR_COUNTER + movl $0, %ebx + + movl $0, %ecx + cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb + cmovnz( %eax, %ebx) + + C Extra fetch of VAR_JMP is bad, but registers are tight + jmp *VAR_JMP + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax xp limb + C ebx carry high + C ecx carry low + C edx scratch + C esi xp+8 + C edi wp + C ebp yp multiplier limb + C + C VAR_COUNTER loop counter, negative + C + C 17 bytes each limb + +L(unroll_entry): + +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + + + movl disp1(%esi), %eax + adcl %edx, %ecx + + mull %ebp + + addl %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx +') + + + incl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jnz L(unroll_top) + + + C eax + C ebx zero + C ecx low + C edx high + C esi + C edi wp, pointing at second last limb) + C ebp + C + C carry flag to be added to high + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + movl PARAM_YSIZE, %ebp + adcl $0, %edx 
+ addl %ecx, disp0(%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, disp1(%edi) + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm new file mode 100644 index 0000000000..84861ea66b --- /dev/null +++ b/rts/gmp/mpn/x86/k7/sqr_basecase.asm @@ -0,0 +1,627 @@ +dnl AMD K7 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product +dnl (measured on the speed difference between 25 and 50 limbs, which is +dnl roughly the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for +dnl some comments. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes, +C which is quite a bit, but is considered good value since squares big +C enough to use most of the code will be spending quite a few cycles in it. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + cmpl $2, %ecx + + movl PARAM_DST, %edx + je L(two_limbs) + ja L(three_or_more) + + +C------------------------------------------------------------------------------ +C one limb only + C eax src + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %edx, 4(%ecx) + movl %eax, (%ecx) + ret + + +C------------------------------------------------------------------------------ +C +C Using the read/modify/write "add"s seems to be faster than saving and +C restoring registers. Perhaps the loads for the first set hide under the +C mul latency and the second gets store to load forwarding. 
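C
C (Sketch for orientation, not in the original source.)  The 2x2 case
C below forms src[0]^2 and src[1]^2 in place, then adds the single cross
C product src[0]*src[1] in twice, which is how its doubling is achieved:
C
C	umul_ppmm (dst[1], dst[0], src[0], src[0]);
C	umul_ppmm (dst[3], dst[2], src[1], src[1]);
C	umul_ppmm (hi, lo, src[0], src[1]);
C	/* add hi:lo into dst[1],dst[2],dst[3] twice, with carries */
C
C where umul_ppmm(h,l,a,b) is the usual longlong.h double-limb multiply
C and hi,lo are illustrative temporaries.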
+ + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) C dst[1] + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) C dst[2] + movl (%ebx), %eax + + movl %edx, 12(%ecx) C dst[3] + + mull 4(%ebx) C src[0]*src[1] + + popl %ebx + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + ret + + +C------------------------------------------------------------------------------ +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +L(three_or_more): + subl $STACK_SPACE, %esp + cmpl $4, %ecx + jae L(four_or_more) +deflit(`FRAME',STACK_SPACE) + + +C------------------------------------------------------------------------------ +C Three limbs +C +C Writing out the loads and stores separately at the end of this code comes +C out about 10 cycles faster than using adcls to memory. + + C eax src + C ecx size + C edx dst + + movl %ebx, SAVE_EBX + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + movl %edx, 4(%ecx) + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + movl %edx, 12(%ecx) + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + movl %edx, 20(%ecx) + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + movl %edx, %edi + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %ebp, SAVE_EBP + movl $0, %ebp + + movl 4(%ebx), %eax + adcl %edx, %ebp + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + adcl $0, %edx + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + movl 4(%ecx), %eax + + adcl %ebp, %ebp + + adcl %edx, %edx + + adcl $0, %ebx + addl %eax, %esi + movl 8(%ecx), %eax + + adcl %eax, %edi + movl 12(%ecx), %eax + movl %esi, 4(%ecx) + + adcl %eax, %ebp + movl 16(%ecx), %eax + movl %edi, 8(%ecx) + + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + adcl %eax, %edx + movl 20(%ecx), %eax + movl %ebp, 12(%ecx) + + adcl %ebx, %eax + ASSERT(nc) + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + movl %edx, 16(%ecx) + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + +C------------------------------------------------------------------------------ +L(four_or_more): + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C Further products are added in rather than stored. 
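C
C For orientation (a sketch, not part of the original source), the whole
C routine follows the usual three basecase squaring phases:
C
C	/* 1: off-diagonal products into dst[1..2*size-2] */
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C
C	/* 2: double them with a one bit left shift           */
C	/* 3: add the squares src[i]^2 along the diagonal     */
C
C The code below is phase 1 (with the bottom-right corner products done
C separately at L(corner)), and the lshift and diag loops at the end of
C the routine are phases 2 and 3.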
+ + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +defframe(`VAR_COUNTER',-20) +defframe(`VAR_JMP', -24) +deflit(EXTRA_STACK_SPACE, 8) + + movl %ebx, SAVE_EBX + movl %edi, SAVE_EDI + leal (%edx,%ecx,4), %edi C &dst[size] + + movl %esi, SAVE_ESI + movl %ebp, SAVE_EBP + leal (%eax,%ecx,4), %esi C &src[size] + + movl (%eax), %ebp C multiplier + movl $0, %ebx + decl %ecx + + negl %ecx + subl $EXTRA_STACK_SPACE, %esp +FRAME_subl_esp(EXTRA_STACK_SPACE) + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul_1) + + +C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two products, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as in mpn_addmul_1, see that routine for +C some comments. +C +C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K7 does branch prediction on indirect jumps, which is bad since it's a +C different target each time. There seems no way to avoid this. + +dnl This value also hard coded in some shifts and adds +deflit(CODE_BYTES_PER_LIMB, 17) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31, but above that an offset must be added to them. + +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + +dnl Because the last chunk of code is generated differently, a label placed +dnl at the end doesn't work. Instead calculate the implied end using the +dnl start and how many chunks of code there are. + +deflit(UNROLL_INNER_END, +`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)') + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %ebx, (%edi) + + subl $4, %ecx + jz L(corner) + + negl %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %edi') +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + + movl %ecx, %edx + shll $4, %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + + + C The calculated jump mustn't come out to before the start of the + C code available. This is the limit UNROLL_COUNT puts on the src + C operand size, but checked here directly using the jump address. 
+ ASSERT(ae, + `movl_text_address(L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C------------------------------------------------------------------------------ + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier + movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand + + movl %edx, VAR_COUNTER + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + movl %edx, %ebx C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + + leal CODE_BYTES_PER_LIMB(%edx), %eax + xorl %edx, %edx + leal 4(%edi), %edi + + movl %eax, VAR_JMP + + jmp *%eax + + +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx + addl %edx, %ecx + ret +') + + + C Must be an even address to preserve the significance of the low + C bit of the jump address indicating which way around ecx/ebx should + C start. + ALIGN(2) + +L(unroll_inner_start): + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp_dst,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +',` + dnl this bit comes out last +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ecx + + mull %ebp + +dnl Zdisp( addl %ebx, disp_src,(%edi)) + addl %ebx, disp_dst(%edi) +ifelse(forloop_last,0, +` movl $0, %ebx') + + adcl %eax, %ecx +') +') + + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + + adcl $0, %edx + addl %ecx, -4+OFFSET(%edi) + movl VAR_JMP, %ecx + + adcl $0, %edx + + movl %edx, m4_empty_if_zero(OFFSET) (%edi) + movl VAR_COUNTER, %edx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C------------------------------------------------------------------------------ +L(corner): + C esi &src[size] + C edi &dst[2*size-5] + + movl -12(%esi), %ebp + movl -8(%esi), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + movl -4(%esi), %eax + + adcl $0, %edx + movl %edx, %ebx + movl %eax, %esi + + mull %ebp + + addl %ebx, %eax + + adcl $0, %edx + addl %eax, (%edi) + movl %esi, %eax + + adcl $0, %edx + movl %edx, %ebx + + mull %ecx + + addl %ebx, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + movl %edx, 8(%edi) + + + +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
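C
C (Sketch, not in the original source.)  This shift is the doubling phase;
C with 32-bit limbs it is equivalent to
C
C	carry = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  { t = dst[i];  dst[i] = (t << 1) | carry;  carry = t >> 31; }
C	dst[2*size-1] = carry;
C
C done two limbs per iteration with rcll so the carry stays in the flags.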
+ +L(lshift_start): + movl PARAM_SIZE, %eax + movl PARAM_DST, %edi + xorl %ecx, %ecx C clear carry + + leal (%edi,%eax,8), %edi + notl %eax C -size-1, preserve carry + + leal 2(%eax), %eax C -(size-1) + +L(lshift): + C eax counter, negative + C ebx + C ecx + C edx + C esi + C edi dst, pointing just after last limb + C ebp + + rcll -4(%edi,%eax,8) + rcll (%edi,%eax,8) + incl %eax + jnz L(lshift) + + setc %al + + movl PARAM_SRC, %esi + movl %eax, -4(%edi) C dst most significant limb + + movl PARAM_SIZE, %ecx + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + + mull %eax + + leal (%esi,%ecx,4), %esi C src point just after last limb + negl %ecx + + movl %eax, (%edi,%ecx,8) C dst[0] + incl %ecx + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi src just after last limb + C edi dst just after last limb + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, -4(%edi,%ecx,8) + adcl %eax, (%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, -4(%edi) C dst most significant limb + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm new file mode 100644 index 0000000000..4735335cbe --- /dev/null +++ b/rts/gmp/mpn/x86/lshift.asm @@ -0,0 +1,90 @@ +dnl x86 mpn_lshift -- mpn left shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + subl $4,%esi C adjust src + + movl (%esi,%edx,4),%ebx C read most significant limb + xorl %eax,%eax + shldl( %cl, %ebx, %eax) C compute carry limb + decl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next lower limb + shldl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + decl %edx +L(1): movl (%esi,%edx,4),%eax + shldl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + decl %edx + jnz L(oop) + + shll %cl,%eax C compute least significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shll %cl,%ebx C compute least significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm new file mode 100644 index 0000000000..3908161b3e --- /dev/null +++ b/rts/gmp/mpn/x86/mod_1.asm @@ -0,0 +1,141 @@ +dnl x86 mpn_mod_1 -- mpn by limb remainder. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 42 maybe +dnl +dnl The following have their own optimized mod_1 implementations, but for +dnl reference the code here runs as follows. +dnl +dnl P6MMX 39 +dnl K7 41 + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C Divide src,size by divisor and return the remainder. The quotient is +C discarded. +C +C See mpn/x86/divrem_1.asm for some comments. 
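C
C A minimal C equivalent of the plain divl scheme used below (a sketch,
C not part of the original source):
C
C	mp_limb_t r = 0;   /* or the carry parameter for mpn_mod_1c */
C	for (i = size-1; i >= 0; i--)
C	  udiv_qrnnd (q, r, r, src[i], divisor);   /* one divl per limb */
C	return r;
C
C where udiv_qrnnd(q,r,nh,nl,d) is the longlong.h double-limb division and
C q is discarded.  The entry code skips the first division when the high
C source limb is already less than the divisor.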
+ +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx, %ecx + + movl PARAM_CARRY, %edx + jnz LF(mpn_mod_1,top) + + popl %esi + movl %edx, %eax + + popl %ebx + + ret + +EPILOGUE() + + +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + orl %ecx, %ecx + jz L(done_zero) + + movl PARAM_DIVISOR, %esi + movl -4(%ebx,%ecx,4), %eax C src high limb + + cmpl %esi, %eax + + sbbl %edx, %edx C -1 if high<divisor + + addl %edx, %ecx C skip one division if high<divisor + jz L(done_eax) + + andl %eax, %edx C carry if high<divisor + + +L(top): + C eax scratch (quotient) + C ebx src + C ecx counter + C edx carry (remainder) + C esi divisor + C edi + C ebp + + movl -4(%ebx,%ecx,4), %eax + + divl %esi + + loop_or_decljnz L(top) + + + movl %edx, %eax +L(done_eax): + popl %esi + + popl %ebx + + ret + + +L(done_zero): + popl %esi + xorl %eax, %eax + + popl %ebx + + ret + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm new file mode 100644 index 0000000000..8817f291bc --- /dev/null +++ b/rts/gmp/mpn/x86/mul_1.asm @@ -0,0 +1,130 @@ +dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector +dnl with a limb and store the result in a second limb vector. +dnl +dnl cycles/limb +dnl P6: 5.5 +dnl +dnl The following CPUs have their own optimized code, but for reference the +dnl code here runs as follows. +dnl +dnl cycles/limb +dnl P5: 12.5 +dnl K6: 10.5 +dnl K7: 4.5 + + +dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + movl %eax,(%edi) + movl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebp,4(%edi) + addl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,8(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl %ebp,12(%edi) + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm new file mode 100644 index 0000000000..3a9b73895b --- /dev/null +++ b/rts/gmp/mpn/x86/mul_basecase.asm @@ -0,0 +1,209 @@ +dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result +dnl in a third limb vector. + + +dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C This was written in a haste since the Pentium optimized code that was used +C for all x86 machines was slow for the Pentium II. This code would benefit +C from some cleanup. +C +C To shave off some percentage of the run-time, one should make 4 variants +C of the Louter loop, for the four different outcomes of un mod 4. That +C would avoid Loop0 altogether. Code expansion would be > 4-fold for that +C part of the function, but since it is not very large, that would be +C acceptable. +C +C The mul loop (at L(oopM)) might need some tweaking. It's current speed is +C unknown. 
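C
C (Orientation only, not in the original source.)  mpn_mul_1 above and the
C inner loops below share one unrolling pattern: a simple loop handles
C size mod 4 limbs, then the remaining groups of four go through the
C unrolled body.  Schematically,
C
C	for (i = 0; i < size % 4; i++)      /* L(oop0) */
C	  limb_step (i);
C	for (; i < size; i += 4)            /* L(oop) / L(oopX) */
C	  { limb_step (i); limb_step (i+1); limb_step (i+2); limb_step (i+3); }
C
C where limb_step stands in for the mul/accumulate work on one limb.  The
C four outer loop variants suggested above would instead enter the unrolled
C body at the point matching xsize mod 4, avoiding L(oop0) altogether.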
+ +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_MULTIPLIER, -4) +defframe(VAR_COUNTER, -8) +deflit(VAR_STACK_SPACE, 8) + + .text + ALIGN(8) + +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + subl $VAR_STACK_SPACE,%esp + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',eval(VAR_STACK_SPACE+12)) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + + leal 4(%esi),%esi + leal 4(%edi),%edi + +L(oopM): + movl (%esi),%eax C load next limb at xp[j] + leal 4(%esi),%esi + mull (%ebp) + addl %ebx,%eax + movl %edx,%ebx + adcl $0,%ebx + movl %eax,(%edi) + leal 4(%edi),%edi + decl %ecx + jnz L(oopM) + + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl PARAM_YSIZE,%eax C ysize + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + movl PARAM_YP,%ebp C yp + addl $4,%ebp C make ebp point to next v limb + movl %ebp,PARAM_YP + movl (%ebp),%eax C copy y limb ... + movl %eax,VAR_MULTIPLIER C ... to stack slot + movl PARAM_XSIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull VAR_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + addl %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_XSIZE,%ecx + shrl $2,%ecx + jz L(endX) + + ALIGN(8) +L(oopX): + movl (%esi),%eax + mull VAR_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull VAR_MULTIPLIER + addl %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + addl %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oopX) + +L(endX): + movl %ebx,(%edi) + addl $4,%edi + + C we incremented wp and xp in the loop above; compensate + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README new file mode 100644 index 0000000000..7dbc905a0d --- /dev/null +++ b/rts/gmp/mpn/x86/p6/README @@ -0,0 +1,95 @@ + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. 
+ + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 2.4 + + mpn_divrem_1 39.0 + mpn_mod_1 39.0 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + mpn_mod_1 24.0 + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. + +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too). +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000000..feb364ec0b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,300 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl P6: 6.35 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. 
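C
C For reference (a sketch, not part of the original source), the operation
C being unrolled is simply
C
C	mp_limb_t carry = 0;   /* or the carry parameter for the _1c forms */
C	for (i = 0; i < size; i++)
C	  {
C	    umul_ppmm (hi, lo, src[i], mult);
C	    lo += carry;  hi += (lo < carry);
C	    /* addmul: dst[i] += lo;  submul: dst[i] -= lo;
C	       the carry or borrow out of that folds into hi */
C	    carry = hi;
C	  }
C	return carry;
C
C with umul_ppmm from longlong.h and hi,lo illustrative temporaries.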
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm new file mode 100644 index 0000000000..a77703ea89 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/diveby3.asm @@ -0,0 +1,37 @@ +dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P6: 8.5 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +dnl The P5 code runs well on P6, in fact better than anything else found so +dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the +dnl dependent path are taking 4.5 cycles. +dnl +dnl The destination cache line prefetching is unnecessary on P6, but +dnl removing it is a 2 cycle slowdown (approx), so it must be inducing +dnl something good in the out of order execution. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_divexact_by3c) +include_mpn(`x86/pentium/diveby3.asm') diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000000..d7bfb6d60c --- /dev/null +++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,96 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 5 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 23 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 139 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 166 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 116 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 66 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 20 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 608 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 5888 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 672 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 5888 +#endif diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000000..f1b011b623 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,677 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. +dnl +dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. 
+ + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's likely what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. 
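dnl
dnl  (Sketch for orientation, not in the original source.)  In the plain
dnl  divl scheme the "integer part" divides the size source limbs and the
dnl  "fraction part" develops xsize further quotient limbs from zeros:
dnl
dnl	for (i = size-1; i >= 0; i--)      /* integer part */
dnl	  udiv_qrnnd (dst[xsize+i], r, r, src[i], divisor);
dnl	for (i = xsize-1; i >= 0; i--)     /* fraction part */
dnl	  udiv_qrnnd (dst[i], r, r, 0, divisor);
dnl	return r;
dnl
dnl  with r starting at zero (or the carry parameter) and udiv_qrnnd from
dnl  longlong.h.  The threshold below trades this against the
dnl  multiply-by-inverse loops that follow.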
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) + + movl -4(%esi,%ecx,4), %eax C src high limb + cmpl %ebp, %eax C one less div if high<divisor + jnb L(no_skip_div) + + movl $0, (%edi,%ecx,4) C dst high limb + decl %ecx C size-1 + movl %eax, %edx C src high limb as initial carry +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal 12(%edi), %ebx + + movl %ebx, VAR_DST_STOP + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ecx, %ebx C size + + bsrl %ebp, %ecx C 31-l + movl %edx, %edi C carry + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + movl $-1, %edx + + shll %cl, %ebp C d normalized + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + movl %eax, VAR_INVERSE + orl %ebx, %ebx C size + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl %eax, VAR_SRC + jz L(start_zero) + + movl 8(%eax), %esi C src high limb + cmpl $1, %ebx + jz 
L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + jmp L(integer_one_left) + + +L(start_zero): + shll %cl, %edi C n2 = carry << l + movl $0, %esi C n10 = 0 + + C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. + + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). 
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. 
+ + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm new file mode 100644 index 0000000000..e7d8d94d33 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm @@ -0,0 +1,444 @@ +dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder. +dnl +dnl P6MMX: 24.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here very similar to mpn_divrem_1, but with the quotient +C discarded. What's here probably isn't optimal. +C +C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some +C comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. 
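Below the threshold the remainder is formed with one divl per limb, working down from the most significant end.  A C sketch of that path (udiv_qrnnd is the longlong.h macro; the helper name and the initial-carry handling are illustrative):

/* carry < divisor on entry; for mpn_mod_1 it starts at 0 and a division
   is skipped when the high limb is already smaller than the divisor.  */
static mp_limb_t
plain_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t carry)
{
  mp_limb_t q;
  if (carry == 0 && size != 0 && src[size-1] < d)
    carry = src[--size];
  while (size != 0)
    {
      size--;
      udiv_qrnnd (q, carry, carry, src[size], d);   /* q is discarded */
    }
  return carry;
}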
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl $0, %edx C initial carry (if can't skip a div) + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + movl %ebx, SAVE_EBX + leal -4(%esi), %ebx + + movl %ebx, VAR_SRC_STOP + movl %ecx, %ebx C size + + movl %edi, SAVE_EDI + movl %edx, %edi C carry + + bsrl %ebp, %ecx C 31-l + movl $-1, %edx + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + shll %cl, %ebp C d normalized + + movd %eax, %mm7 + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. This +C gets it down to 24 c/l, which is still a bit away from a theoretical 19 +C c/l. 
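The setup just above gets the inverse with a single division.  In C terms (a sketch; count_leading_zeros and udiv_qrnnd are the longlong.h macros, b = 2^32, and the variable names are illustrative):

/* Normalize d so its high bit is set, then m = floor((b*(b-d)-1)/d).
   Note ~d = (b-d)-1 and ~0 = b-1, so the two-limb numerator b*(b-d)-1
   is simply ~d:~0, which is what the movl $-1 / subl above build.  */
count_leading_zeros (norm, d);                   /* VAR_NORM */
d <<= norm;
udiv_qrnnd (m, dummy, ~d, ~(mp_limb_t) 0, d);    /* VAR_INVERSE */

The loop that follows then repeats the per-limb step described for mpn_divrem_1, with the quotient limb simply not stored.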
+ + ALIGN(16) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + movl VAR_SRC_STOP, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + cmpl %ebx, %ecx + + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%esi), %edx + movl SAVE_EBP, %ebp + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C 
ebp divisor + + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl VAR_SRC_STOP, %edx + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + cmpl %ecx, %edx + jne L(inverse_top) + + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000000..50f9a11218 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,31 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. +dnl +dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb +dnl (approx) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm new file mode 100644 index 0000000000..e63fbf334b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Haven't actually measured it, but the K7 code with the psadbw should be +dnl good on P-III. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm new file mode 100644 index 0000000000..174c78406a --- /dev/null +++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm @@ -0,0 +1,641 @@ +dnl Intel P6 mpn_sqr_basecase -- square an mpn number. 
+dnl +dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular +dnl product (measured on the speed difference between 20 and 40 limbs, +dnl which is the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for +dnl a description. The only difference here is that UNROLL_COUNT can go up +dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed so +C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 40x40 that do use the full +C unrolling will least be making good use of it, because 40x40 will take +C something like 7000 cycles. 
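For sizes beyond the special cases (size >= 2 assumed here), the shape of the computation is the triangle of cross products, a doubling, then the diagonal squares, as in the generic C routine mentioned above.  A rough model using the mpn entry points follows; it is illustrative only, the real code below inlines and unrolls all of this.

static void
sqr_basecase_model (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i;
  mp_limb_t cy;

  /* cross products src[i]*src[j], i<j, summed into dst[1..2*size-2] */
  dst[size] = mpn_mul_1 (dst + 1, src + 1, size - 1, src[0]);
  for (i = 1; i < size - 1; i++)
    dst[size + i] = mpn_addmul_1 (dst + 2*i + 1, src + i + 1,
                                  size - i - 1, src[i]);

  /* each cross product appears twice in the square: shift left one bit,
     the bit shifted out becoming dst[2*size-1] */
  dst[2*size - 1] = mpn_lshift (dst + 1, dst + 1, 2*size - 2, 1);

  /* add the diagonal squares src[i]^2 at dst[2*i], dst[2*i+1];
     dst[0] just gets the low limb of src[0]^2 */
  dst[0] = 0;
  cy = 0;
  for (i = 0; i < size; i++)
    {
      mp_limb_t hi, lo;
      umul_ppmm (hi, lo, src[i], src[i]);
      lo += cy;            cy  = (lo < cy);
      lo += dst[2*i];      cy += (lo < dst[2*i]);
      hi += cy;            cy  = (hi < cy);
      hi += dst[2*i + 1];  cy += (hi < dst[2*i + 1]);
      dst[2*i] = lo;
      dst[2*i + 1] = hi;
    }
}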
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src limb + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx + +defframe(SAVE_ESI, -4) +defframe(SAVE_EBX, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(`STACK_SPACE',16) + + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl %eax, %esi + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%esi), %eax + + movl %ebx, SAVE_EBX + movl %edx, %ebx C dst[1] + + mull %eax C src[1]^2 + + movl %edi, SAVE_EDI + movl %eax, %edi C dst[2] + movl (%esi), %eax + + movl %ebp, SAVE_EBP + movl %edx, %ebp C dst[3] + + mull 4(%esi) C src[0]*src[1] + + addl %eax, %ebx + movl SAVE_ESI, %esi + + adcl %edx, %edi + + adcl $0, %ebp + addl %ebx, %eax + movl SAVE_EBX, %ebx + + adcl %edi, %edx + movl SAVE_EDI, %edi + + adcl $0, %ebp + + movl %eax, 4(%ecx) + + movl %ebp, 12(%ecx) + movl SAVE_EBP, %ebp + + movl %edx, 8(%ecx) + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size +deflit(`FRAME',0) + + pushl %esi defframe_pushl(`SAVE_ESI') + cmpl $4, %edx + + movl PARAM_SRC, %esi + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + + C eax src low limb + C ebx + C ecx dst + C edx + C esi src + C edi + C ebp + + pushl %ebp defframe_pushl(`SAVE_EBP') + pushl %edi defframe_pushl(`SAVE_EDI') + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%esi), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + movl 8(%esi), %eax + + pushl %ebx defframe_pushl(`SAVE_EBX') + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%esi), %eax + + mull 4(%esi) C src[0] * src[1] + + movl %eax, %ebx + movl %edx, %edi + + movl (%esi), %eax + + mull 8(%esi) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%esi), %eax + + mull 8(%esi) C src[1] * src[2] + + xorl %esi, %esi + addl %eax, %ebp + + C eax + C ebx dst[1] + C ecx dst + C edx dst[4] + C esi zero, will be dst[5] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %ebx, %ebx + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %esi + addl %ebx, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %ebx + + adcl %ebp, %ebx + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl SAVE_EBP, %ebp + + movl %ebx, 12(%ecx) + movl SAVE_EBX, %ebx + + adcl %edx, %edi + movl 20(%ecx), %eax + + movl %edi, 16(%ecx) + movl SAVE_EDI, %edi + + adcl %esi, %eax C no carry out of this + movl SAVE_ESI, %esi + + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(`STACK_SPACE',24) + +L(four_or_more): + C eax src low 
limb + C ebx + C ecx + C edx size + C esi src + C edi + C ebp +deflit(`FRAME',4) dnl %esi already pushed + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + + subl $STACK_SPACE-FRAME, %esp +deflit(`FRAME',STACK_SPACE) + movl $1, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + subl %edx, %ecx C -(size-1) + + movl %ebp, SAVE_EBP + movl $0, %ebx C initial carry + + leal (%esi,%edx,4), %esi C &src[size] + movl %eax, %ebp C multiplier + + leal -4(%edi,%edx,4), %edi C &dst[size-1] + + +C This loop runs at just over 6 c/l. + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter, limbs, negative, -(size-1) to -1 + C edx scratch + C esi &src[size] + C edi &dst[size-1] + C ebp multiplier + + movl %ebp, %eax + + mull (%esi,%ecx,4) + + addl %ebx, %eax + movl $0, %ebx + + adcl %edx, %ebx + movl %eax, 4(%edi,%ecx,4) + + incl %ecx + jnz L(mul_1) + + + movl %ebx, 4(%edi) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. + +dnl This is also hard-coded in the address calculation below. +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With &src[size] and &dst[size-1] pointers, the displacements in the +dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above +dnl that an offset must be added to them. +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. 
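For reference, the arithmetic behind that calculation: with k = size-4, the negl/shll $4 and the two-register leal work out to L(unroll_inner_end) - 2*CODE_BYTES_PER_LIMB - (16*k - k), the multiply by CODE_BYTES_PER_LIMB = 15 being done as 16*k - k.  The outer loop adds a further CODE_BYTES_PER_LIMB before jumping, so the first pass appears to enter 15*(size-3) bytes before L(unroll_inner_end), executing size-3 unrolled chunks (the first multiplicand limb having been handled by the mull in the outer loop header), and each later pass enters one chunk further in.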
+ + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. + + subl $1, %ecx C size-1 + xorl %eax, %eax C ready for final adcl, and clear carry + + movl %ecx, %edx + movl PARAM_SRC, %esi + + +L(lshift): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx size-1 (for later use) + C esi src (for later use) + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + + leal 8(%edi), %edi + decl %ecx + jnz L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%esi), %eax C src[0] + + leal 4(%esi,%edx,4), %esi C &src[size] + subl %edx, %ecx C -(size-1) + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi &src[size] + C edi dst[2*size-2] + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README new file mode 100644 index 0000000000..3b9ec8ac6f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/README @@ -0,0 +1,77 @@ + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has code for Pentium with MMX (P55). + + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_copyi/copyd 1.0 + + mpn_divrem_1 44.0 + mpn_mod_1 44.0 + mpn_divexact_by3 15.0 + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_mul_1 13.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + +1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. + +3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run with it as 9. + + + +RELEVANT OPTIMIZATION ISSUES + +1. Pentium doesn't allocate cache lines on writes, unlike most other modern +processors. Since the functions in the mpn class do array writes, we have to +handle allocating the destination cache lines by reading a word from it in the +loops, to achieve the best performance. + +2. Pairing of memory operations requires that the two issued operations refer +to different cache banks. The simplest way to insure this is to read/write +two words from the same object. If we make operations on different objects, +they might or might not be to the same cache bank. + + + +REFERENCES + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000000..a61082a456 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,196 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl +dnl P5: 2.375 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. 
+dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx 
+L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000000..147b55610f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,99 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. +dnl +dnl P5: 14.0 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. */ + + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + M4_inst %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm new file mode 100644 index 0000000000..dbac81642f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/diveby3.asm @@ -0,0 +1,183 @@ +dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P5: 15.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + movl PARAM_SIZE, %edx + + decl %edx + jnz L(two_or_more) + + movl (%ecx), %edx + movl PARAM_CARRY, %eax C risk of cache bank clash here + + movl PARAM_DST, %ecx + subl %eax, %edx + + sbbl %eax, %eax C 0 or -1 + + imull $INVERSE_3, %edx, %edx + + negl %eax C 0 or 1 + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b*2/3) + movl %edx, (%ecx) + + ret + + +L(two_or_more): + C eax + C ebx + C ecx src + C edx size-1 + C esi + C edi + C ebp + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_DST, %edi + movl PARAM_CARRY, %esi + + movl (%ecx), %eax C src low limb + xorl %ebx, %ebx + + sub %esi, %eax + movl $TWO_THIRDS_FLOOR, %esi + + leal (%ecx,%edx,4), %ecx C &src[size-1] + leal (%edi,%edx,4), %edi C &dst[size-1] + + adcl $0, %ebx C carry, 0 or 1 + negl %edx C -(size-1) + + +C The loop needs a source limb ready at the top, which leads to one limb +C handled separately at the end, and the special case above for size==1. +C There doesn't seem to be any scheduling that would keep the speed but move +C the source load and carry subtract up to the top. +C +C The destination cache line prefetching adds 1 cycle to the loop but is +C considered worthwhile. The slowdown is a factor of 1.07, but will prevent +C repeated write-throughs if the destination isn't in L1. A version using +C an outer loop to prefetch only every 8 limbs (a cache line) proved to be +C no faster, due to unavoidable branch mispreditions in the inner loop. +C +C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx +C could be avoided then the src limb fetch could pair up and save a cycle. +C This would probably mean going to a two limb loop with the carry limb +C alternately positive or negative, since an sbbl %ebx,%ebx will leave a +C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax. +C +C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as +C "cmpl %edx, $n" with the immediate as the second operand. +C +C The "4" source displacement is in the loop rather than the setup because +C this gets L(top) aligned to 8 bytes at no cost. 
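The method itself, in plain C (a sketch of what the generic diveby3 code amounts to; the loop below restructures it for pairing but computes the same values, and 32-bit limbs are assumed):

/* Exact division by 3: q = (s - c)*INVERSE_3 mod 2^32 is the quotient
   limb, and the outgoing carry is the borrow from the subtract plus one
   for each of ONE_THIRD_CEIL, TWO_THIRDS_CEIL that q reaches.  The
   function name is illustrative.  */
static mp_limb_t
divexact_by3c_model (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
{
  mp_size_t i;
  for (i = 0; i < size; i++)
    {
      mp_limb_t s = src[i];
      mp_limb_t q = s - c;
      c = (s < c);                     /* borrow */
      q *= 0xAAAAAAAB;                 /* INVERSE_3 */
      dst[i] = q;
      c += (q >= 0x55555556);          /* ONE_THIRD_CEIL */
      c += (q >= 0xAAAAAAAB);          /* TWO_THIRDS_CEIL */
    }
  return c;
}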
+ + ALIGN(8) +L(top): + C eax source limb, carry subtracted + C ebx carry (0 or 1) + C ecx &src[size-1] + C edx counter, limbs, negative + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp scratch (result limb) + + imull $INVERSE_3, %eax, %ebp + + cmpl $ONE_THIRD_CEIL, %ebp + movl (%edi,%edx,4), %eax C dst cache line prefetch + + sbbl $-1, %ebx C +1 if ebp>=ceil(b/3) + cmpl %ebp, %esi + + movl 4(%ecx,%edx,4), %eax C next src limb + + sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3) + movl $0, %ebx + + adcl $0, %ebx C new carry + movl %ebp, (%edi,%edx,4) + + incl %edx + jnz L(top) + + + + imull $INVERSE_3, %eax, %edx + + cmpl $ONE_THIRD_CEIL, %edx + movl %edx, (%edi) + + sbbl $-1, %ebx C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3) + popl %ebp + + movl %ebx, %eax + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000000..d3ed3d73ce --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 179 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 153 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 46 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 110 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 3840 +#endif diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000000..e1e35d4c57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/lshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
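The shift-by-1 idea, overlap conditions aside, is to double the limbs from the least significant end with add-with-carry instead of shldl.  As a C fragment (illustrative only; each dst[i] = 2*src[i] + carry below becomes an addl/adcl pair):

cy = 0;
for (i = 0; i < size; i++)
  {
    mp_limb_t s = src[i];
    dst[i] = (s << 1) | cy;
    cy = s >> 31;
  }
return cy;                             /* the bit shifted out at the top */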
+ cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) C jump if s_ptr + 1 >= res_ptr + leal (%esi,%ebp,4),%eax + cmpl %eax,%edi + jnc L(special) C jump if res_ptr >= s_ptr + size + +L(normal): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl( %cl, %ebx, %edx) + shldl( %cl, %eax, %ebx) + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl( %cl, %edx, %eax) + shldl( %cl, %ebx, %edx) + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl( %cl,%eax,%edx) + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shll %cl,%edx C compute least significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. + +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000000..2379077d0c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 99 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 40 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000000..2225438658 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,455 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. 
+ C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 48(%ebx,%eax,4) + C mm3 dst qword ready to store to 56(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 48(%ebx,%eax,4) + C mm3 dst qword, for 56(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. 
+ C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. + + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm new file mode 100644 index 0000000000..587a07ab3d --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000000..7672630d57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,460 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. 
+C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too. + +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx return value + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately. 
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. 
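Expressed in C, the qword combine that both the unrolled loop above and these finish-ups are built from looks roughly like the sketch below. The helper name, the uint32_t/uint64_t types and the 32-bit limb size are my assumptions for illustration, not GMP's code.

    #include <stdint.h>

    /* Rough model of one MMX step of mpn_rshift, valid for 1 <= shift <= 31:
       two limbs are treated as a 64-bit qword, shifted right, and the bits
       vacated at the top are supplied from the next qword (psrlq/psllq/por).
       The loop guarantees that src[2..3], the next qword, are available. */
    static void rshift_qword_model(uint32_t *dst, const uint32_t *src,
                                   unsigned shift)
    {
        uint64_t lo  = ((uint64_t) src[1] << 32) | src[0];
        uint64_t hi  = ((uint64_t) src[3] << 32) | src[2];
        uint64_t out = (lo >> shift) | (hi << (64 - shift));
        dst[0] = (uint32_t) out;
        dst[1] = (uint32_t) (out >> 32);
    }

In this model the value handed back through %mm5 and %eax is src[0] << (32-shift), the bits shifted out of the least significant limb.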
+ + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written. + + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000000..08639eca09 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,79 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. +dnl +dnl P5: 13.0 cycles/limb + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
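mpn_mul_1 multiplies the source vector by a single limb, stores the low limbs of the products and returns the final carry limb. The following is a rough C model of that contract (my sketch, assuming GMP's 32-bit limbs on x86, not the shipped code):

    #include <stdint.h>

    /* Sketch of mpn_mul_1: dst[i] gets the low half of src[i]*multiplier
       plus the incoming carry; the high half carries into the next limb,
       and the last carry is the return value. */
    static uint32_t mul_1_model(uint32_t *dst, const uint32_t *src,
                                long size, uint32_t multiplier)
    {
        uint32_t carry = 0;
        for (long i = 0; i < size; i++) {
            uint64_t p = (uint64_t) src[i] * multiplier + carry;
            dst[i] = (uint32_t) p;
            carry  = (uint32_t) (p >> 32);
        }
        return carry;
    }

In the assembler below the 64-bit product comes from mull, the low-half store and the carry handoff are split between %ebx and %edx, and the carry flag left by the addl is folded back in by the adcl $0 at the top of the loop.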
*/ + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000000..d9f79a0831 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm @@ -0,0 +1,135 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. +dnl +dnl P5: 14.2 cycles/crossproduct (approx) + + +dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
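mpn_mul_basecase is the schoolbook product: one mul_1-style pass for yp[0] writes the first row, then one addmul-style pass per remaining yp limb accumulates into the result at the next offset, which is exactly the L(oop1) / L(outer) / L(oop2) structure below. A hedged C sketch of the same shape (function and variable names are mine; wp must hold xsize+ysize limbs):

    #include <stdint.h>

    /* Schoolbook multiplication sketch, 32-bit limbs assumed; not GMP's code. */
    static void mul_basecase_model(uint32_t *wp,
                                   const uint32_t *xp, long xsize,
                                   const uint32_t *yp, long ysize)
    {
        uint32_t carry = 0;

        /* First row: wp[0..xsize] = xp * yp[0]. */
        for (long j = 0; j < xsize; j++) {
            uint64_t p = (uint64_t) xp[j] * yp[0] + carry;
            wp[j] = (uint32_t) p;
            carry = (uint32_t) (p >> 32);
        }
        wp[xsize] = carry;

        /* Remaining rows: add xp * yp[i] into wp at offset i. */
        for (long i = 1; i < ysize; i++) {
            carry = 0;
            for (long j = 0; j < xsize; j++) {
                uint64_t p = (uint64_t) xp[j] * yp[i] + wp[i + j] + carry;
                wp[i + j] = (uint32_t) p;
                carry = (uint32_t) (p >> 32);
            }
            wp[i + xsize] = carry;
        }
    }

The assembler relies on xsize >= ysize >= 1, as the "If xsize = 1, ysize = 1 too" comment at the single-limb early exit indicates.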
+ + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $4,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() + diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000000..e8f5ae8ec8 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/rshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
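Whatever the implementation, mpn_rshift has the same limb-level contract: each destination limb combines a source limb with bits from the next higher limb (one shrdl in the loops below), zeros enter at the top, and the bits dropped from the bottom limb are returned left-aligned in a limb. A C sketch of that contract (mine, 32-bit limbs assumed):

    #include <stdint.h>

    /* Sketch of the mpn_rshift contract, 1 <= shift <= 31, size >= 1. */
    static uint32_t rshift_model(uint32_t *dst, const uint32_t *src,
                                 long size, unsigned shift)
    {
        uint32_t retval = src[0] << (32 - shift);        /* bits shifted out */
        for (long i = 0; i < size - 1; i++)              /* one shrdl each   */
            dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
        dst[size - 1] = src[size - 1] >> shift;          /* zeros shifted in */
        return retval;
    }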
+ + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) C jump if res_ptr + 1 >= s_ptr + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc L(special) C jump if s_ptr >= res_ptr + size + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl( %cl, %ebx, %edx) + shrdl( %cl, %eax, %ebx) + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl( %cl, %edx, %eax) + shrdl( %cl, %ebx, %edx) + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl( %cl,%eax,%edx) C compute result limb + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shrl %cl,%edx C compute most significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. 
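In C terms, the test at the top of the routine that chooses between L(normal) and this L(special) path is roughly the following; the names mirror the res_ptr/s_ptr of the comments, and this is only a sketch of the two jnc tests above, expressed as raw address comparisons the way the assembler does them:

    #include <stdint.h>

    /* Sketch of the shift-by-1 dispatch: the special path is taken only for
       shift == 1 and only when one of the two address comparisons coded with
       jnc above holds (see the "jump if ..." comments). */
    static int use_special_path(const uint32_t *res_ptr, const uint32_t *s_ptr,
                                long size, unsigned shift)
    {
        return shift == 1
            && (res_ptr + 1 >= s_ptr            /* res_ptr + 1 >= s_ptr    */
                || s_ptr >= res_ptr + size);    /* s_ptr >= res_ptr + size */
    }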
+ +L(special): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax + rcrl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000000..c8584df13c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,520 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. +dnl +dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +dnl product at around 20x20 limbs. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
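As the header comment says, the structure mirrors mpn/generic/sqr_basecase.c: form the off-diagonal products src[i]*src[j] (i < j) once, double them, then add the squares src[i]^2 along the diagonal. A compact C sketch of that shape (my code, 32-bit limbs assumed; the real routine writes the first row with a plain mul_1 pass rather than accumulating into zeroed memory, but the result is the same):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: dst must hold 2*n limbs, n >= 1.  Not GMP's code. */
    static void sqr_basecase_model(uint32_t *dst, const uint32_t *src, long n)
    {
        memset(dst, 0, 2 * n * sizeof *dst);

        /* Off-diagonal products, each pair counted once. */
        for (long i = 0; i < n - 1; i++) {
            uint32_t carry = 0;
            for (long j = i + 1; j < n; j++) {
                uint64_t p = (uint64_t) src[i] * src[j] + dst[i + j] + carry;
                dst[i + j] = (uint32_t) p;
                carry = (uint32_t) (p >> 32);
            }
            dst[i + n] = carry;
        }

        /* Double dst[1..2n-2]; the bit shifted out becomes dst[2n-1]. */
        uint32_t bit = 0;
        for (long k = 1; k <= 2 * n - 2; k++) {
            uint32_t next = dst[k] >> 31;
            dst[k] = (dst[k] << 1) | bit;
            bit = next;
        }
        dst[2 * n - 1] = bit;

        /* Diagonal: add the two-limb square src[i]^2 at position 2*i. */
        uint32_t carry = 0;
        for (long i = 0; i < n; i++) {
            uint64_t p = (uint64_t) src[i] * src[i] + dst[2 * i] + carry;
            dst[2 * i] = (uint32_t) p;
            uint64_t q = (uint64_t) dst[2 * i + 1] + (p >> 32);
            dst[2 * i + 1] = (uint32_t) q;
            carry = (uint32_t) (q >> 32);
        }
    }

The doubling pass corresponds to the rcll chain at L(lshift) further down, and the diagonal pass to the L(diag) loop.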
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 
+ +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2. + C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C src[size-1]*src[size-2] + + movl %ecx, 0(%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
+ +L(lshift): + C eax counter, negative + C ebx next limb + C ecx + C edx + C esi + C edi &dst[2*size-4] + C ebp + + movl 12(%edi,%eax,8), %ebx + + rcll %ebx + movl 16(%edi,%eax,8), %ecx + + rcll %ecx + movl %ebx, 12(%edi,%eax,8) + + movl %ecx, 16(%edi,%eax,8) + incl %eax + + jnz L(lshift) + + + adcl %eax, %eax C high bit out + movl PARAM_SRC, %esi + + movl PARAM_SIZE, %ecx C risk of cache bank clash + movl %eax, 12(%edi) C dst most significant limb + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + leal (%esi,%ecx,4), %esi C src end + + negl %ecx + + mull %eax + + movl %eax, 16(%edi,%ecx,8) C dst[0] + movl %edx, %ebx + + addl $1, %ecx C size-1 and clear carry + +L(diag): + C eax scratch (low product) + C ebx carry limb + C ecx counter, negative + C edx scratch (high product) + C esi &src[size] + C edi &dst[2*size-4] + C ebp scratch (fetched dst limbs) + + movl (%esi,%ecx,4), %eax + adcl $0, %ebx + + mull %eax + + movl 16-4(%edi,%ecx,8), %ebp + + addl %ebp, %ebx + movl 16(%edi,%ecx,8), %ebp + + adcl %eax, %ebp + movl %ebx, 16-4(%edi,%ecx,8) + + movl %ebp, 16(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(diag) + + + adcl $0, %edx + movl 16-4(%edi), %eax C dst most significant limb + + addl %eax, %edx + popl %ebp + + movl %edx, 16-4(%edi) + popl %esi C risk of cache bank clash + + popl %edi + popl %ebx + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm new file mode 100644 index 0000000000..c9881fd966 --- /dev/null +++ b/rts/gmp/mpn/x86/rshift.asm @@ -0,0 +1,92 @@ +dnl x86 mpn_rshift -- mpn right shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + leal -4(%edi,%edx,4),%edi + leal (%esi,%edx,4),%esi + negl %edx + + movl (%esi,%edx,4),%ebx C read least significant limb + xorl %eax,%eax + shrdl( %cl, %ebx, %eax) C compute carry limb + incl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next higher limb + shrdl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + incl %edx +L(1): movl (%esi,%edx,4),%eax + shrdl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + incl %edx + jnz L(oop) + + shrl %cl,%eax C compute most significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shrl %cl,%ebx C compute most significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm new file mode 100644 index 0000000000..9fe022b107 --- /dev/null +++ b/rts/gmp/mpn/x86/udiv.asm @@ -0,0 +1,44 @@ +dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low, +C mp_limb_t divisor); + +defframe(PARAM_DIVISOR, 16) +defframe(PARAM_LOW, 12) +defframe(PARAM_HIGH, 8) +defframe(PARAM_REMPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_udiv_qrnnd) +deflit(`FRAME',0) + movl PARAM_LOW, %eax + movl PARAM_HIGH, %edx + divl PARAM_DIVISOR + movl PARAM_REMPTR, %ecx + movl %edx, (%ecx) + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm new file mode 100644 index 0000000000..3d289d1784 --- /dev/null +++ b/rts/gmp/mpn/x86/umul.asm @@ -0,0 +1,43 @@ +dnl mpn_umul_ppmm -- 1x1->2 limb multiplication + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +defframe(PARAM_M2, 12) +defframe(PARAM_M1, 8) +defframe(PARAM_LOWPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_umul_ppmm) +deflit(`FRAME',0) + movl PARAM_LOWPTR, %ecx + movl PARAM_M1, %eax + mull PARAM_M2 + movl %eax, (%ecx) + movl %edx, %eax + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4 new file mode 100644 index 0000000000..2dad698002 --- /dev/null +++ b/rts/gmp/mpn/x86/x86-defs.m4 @@ -0,0 +1,713 @@ +divert(-1) + +dnl m4 macros for x86 assembler. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Notes: +dnl +dnl m4 isn't perfect for processing BSD style x86 assembler code, the main +dnl problems are, +dnl +dnl 1. Doing define(foo,123) and then using foo in an addressing mode like +dnl foo(%ebx) expands as a macro rather than a constant. This is worked +dnl around by using deflit() from asm-defs.m4, instead of define(). +dnl +dnl 2. Immediates in macro definitions need a space or `' to stop the $ +dnl looking like a macro parameter. For example, +dnl +dnl define(foo, `mov $ 123, %eax') +dnl +dnl This is only a problem in macro definitions, not in ordinary text, +dnl nor in macro parameters like text passed to forloop() or ifdef(). + + +deflit(BYTES_PER_MP_LIMB, 4) + + +dnl -------------------------------------------------------------------------- +dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking. +dnl Nesting and overlapping not allowed. +dnl + + +dnl Usage: PROLOGUE(functionname) +dnl +dnl Generate a function prologue. functionname gets GSYM_PREFIX added. 
+dnl Examples, +dnl +dnl PROLOGUE(mpn_add_n) +dnl PROLOGUE(somefun) + +define(`PROLOGUE', +m4_assert_numargs(1) +m4_assert_defined(`PROLOGUE_cpu') +`ifdef(`PROLOGUE_current_function', +`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1) +)')dnl +m4_file_seen()dnl +define(`PROLOGUE_current_function',`$1')dnl +PROLOGUE_cpu(GSYM_PREFIX`'$1)') + + +dnl Usage: EPILOGUE() +dnl +dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1 +dnl instead of the long PROLOGUE_current_function symbol. + +define(`EPILOGUE', +m4_assert_numargs(0) +m4_assert_defined(`EPILOGUE_cpu') +`ifdef(`PROLOGUE_current_function',, +`m4_error(`EPILOGUE'() with no `PROLOGUE'() +)')dnl +EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl +undefine(`PROLOGUE_current_function')') + +m4wrap_prepend( +`ifdef(`PROLOGUE_current_function', +`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen +')')') + + +dnl Usage: PROLOGUE_assert_inside() +dnl +dnl Use this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check the macro is only used inside a +dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is +dnl defined. + +define(PROLOGUE_assert_inside, +m4_assert_numargs(0) +``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '') + +define(PROLOGUE_assert_inside_internal, +m4_assert_numargs(1) +`ifdef(`PROLOGUE_current_function',, +`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair +')')') + + +dnl Usage: L(labelname) +dnl LF(functionname,labelname) +dnl +dnl Generate a local label in the current or given function. For LF(), +dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE(). +dnl +dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn), +dnl +dnl L(bar) => L__gmpn_add_n__bar +dnl LF(somefun,bar) => Lsomefun__bar +dnl +dnl The funtion name and label name get two underscores between them rather +dnl than one to guard against clashing with a separate external symbol that +dnl happened to be called functionname_labelname. (Though this would only +dnl happen if the local label prefix is is empty.) Underscores are used so +dnl the whole label will still be a valid C identifier and so can be easily +dnl used in gdb. + +dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the +dnl L macro and making an infinite recursion. +define(LF, +m4_assert_numargs(2) +m4_assert_defined(`LSYM_PREFIX') +`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2') + +define(`L', +m4_assert_numargs(1) +PROLOGUE_assert_inside() +`LF(PROLOGUE_current_function,`$1')') + + +dnl Called: PROLOGUE_cpu(gsym) +dnl EPILOGUE_cpu(gsym) + +define(PROLOGUE_cpu, +m4_assert_numargs(1) + `GLOBL $1 + TYPE($1,`function') +$1:') + +define(EPILOGUE_cpu, +m4_assert_numargs(1) +` SIZE($1,.-$1)') + + + +dnl -------------------------------------------------------------------------- +dnl Various x86 macros. +dnl + + +dnl Usage: ALIGN_OFFSET(bytes,offset) +dnl +dnl Align to `offset' away from a multiple of `bytes'. +dnl +dnl This is useful for testing, for example align to something very strict +dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)". +dnl +dnl Generally you wouldn't execute across the padding, but it's done with +dnl nop's so it'll work. 
+ +define(ALIGN_OFFSET, +m4_assert_numargs(2) +`ALIGN($1) +forloop(`i',1,$2,` nop +')') + + +dnl Usage: defframe(name,offset) +dnl +dnl Make a definition like the following with which to access a parameter +dnl or variable on the stack. +dnl +dnl define(name,`FRAME+offset(%esp)') +dnl +dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one +dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp). +dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the +dnl zero offset is wanted. +dnl +dnl The new macro also gets a check that when it's used FRAME is actually +dnl defined, and that the final %esp offset isn't negative, which would +dnl mean an attempt to access something below the current %esp. +dnl +dnl deflit() is used rather than a plain define(), so the new macro won't +dnl delete any following parenthesized expression. name(%edi) will come +dnl out say as 16(%esp)(%edi). This isn't valid assembler and should +dnl provoke an error, which is better than silently giving just 16(%esp). +dnl +dnl See README.family for more on the suggested way to access the stack +dnl frame. + +define(defframe, +m4_assert_numargs(2) +`deflit(`$1', +m4_assert_defined(`FRAME') +`defframe_check_notbelow(`$1',$2,FRAME)dnl +defframe_empty_if_zero(FRAME+($2))(%esp)')') + +dnl Called: defframe_empty_if_zero(expression) +define(defframe_empty_if_zero, +`ifelse(defframe_empty_if_zero_disabled,1, +`eval($1)', +`m4_empty_if_zero($1)')') + +dnl Called: defframe_check_notbelow(`name',offset,FRAME) +define(defframe_check_notbelow, +m4_assert_numargs(3) +`ifelse(eval(($3)+($2)<0),1, +`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes +')')') + + +dnl Usage: FRAME_pushl() +dnl FRAME_popl() +dnl FRAME_addl_esp(n) +dnl FRAME_subl_esp(n) +dnl +dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl +dnl %esp of n bytes. +dnl +dnl Using these macros is completely optional. Sometimes it makes more +dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's +dnl jumps and different sequences of FRAME values need to be used in +dnl different places. + +define(FRAME_pushl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+4))') + +define(FRAME_popl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-4))') + +define(FRAME_addl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-($1)))') + +define(FRAME_subl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+($1)))') + + +dnl Usage: defframe_pushl(name) +dnl +dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack +dnl location just pushed. This should come after a pushl instruction. +dnl Putting it on the same line works and avoids lengthening the code. For +dnl example, +dnl +dnl pushl %eax defframe_pushl(VAR_COUNTER) +dnl +dnl Notice the defframe() is done with an unquoted -FRAME thus giving its +dnl current value without tracking future changes. + +define(defframe_pushl, +`FRAME_pushl()defframe(`$1',-FRAME)') + + +dnl -------------------------------------------------------------------------- +dnl Assembler instruction macros. +dnl + + +dnl Usage: emms_or_femms +dnl femms_available_p +dnl +dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow +dnl femms instruction is available. emms_or_femms expands to femms if +dnl available, or emms if not. 
+dnl +dnl emms_or_femms is meant for use in the K6 directory where plain K6 +dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are +dnl supported together. +dnl +dnl On K7 femms is no longer faster and is just an alias for emms, so plain +dnl emms may as well be used. + +define(femms_available_p, +m4_assert_numargs(-1) +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_athlon')') + +define(emms_or_femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1,`femms',`emms')') + + +dnl Usage: femms +dnl +dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the +dnl following is a replacement using .byte. +dnl +dnl If femms isn't available, an emms is generated instead, for convenience +dnl when testing on a machine without femms. + +define(femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1, +`.byte 15,14 C AMD 3DNow femms', +`emms`'dnl +m4_warning(`warning, using emms in place of femms, use for testing only +')')') + + +dnl Usage: jadcl0(op) +dnl +dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact +dnl replacement, since it doesn't set the flags like adcl does. +dnl +dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and +dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch +dnl misprediction penalty is small, and the multiply algorithm used leads +dnl to a carry bit on average only 1/4 of the time. +dnl +dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for +dnl comparison. For example, +dnl +dnl define(`jadcl0_disabled',1) +dnl +dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is +dnl the same size as an adcl. This makes it possible to use the exact same +dnl computed jump code when testing the relative speed of jnc/incl and adcl +dnl with jadcl0_disabled. + +define(jadcl0, +m4_assert_numargs(1) +`ifelse(jadcl0_disabled,1, + `adcl $`'0, $1', + `jnc 1f + incl $1 +1:dnl')') + + +dnl Usage: cmov_available_p +dnl +dnl Expand to 1 if cmov is available, 0 if not. + +define(cmov_available_p, +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_pentiumpro', + `HAVE_TARGET_CPU_pentium2', + `HAVE_TARGET_CPU_pentium3', + `HAVE_TARGET_CPU_athlon')') + + +dnl Usage: x86_lookup(target, key,value, key,value, ...) +dnl x86_lookup_p(target, key,value, key,value, ...) +dnl +dnl Look for `target' among the `key' parameters. +dnl +dnl x86_lookup expands to the corresponding `value', or generates an error +dnl if `target' isn't found. +dnl +dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not. + +define(x86_lookup, +`ifelse(eval($#<3),1, +`m4_error(`unrecognised part of x86 instruction: $1 +')', +`ifelse(`$1',`$2', `$3', +`x86_lookup(`$1',shift(shift(shift($@))))')')') + +define(x86_lookup_p, +`ifelse(eval($#<3),1, `0', +`ifelse(`$1',`$2', `1', +`x86_lookup_p(`$1',shift(shift(shift($@))))')')') + + +dnl Usage: x86_opcode_reg32(reg) +dnl x86_opcode_reg32_p(reg) +dnl +dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given +dnl 32-bit register, eg. `%ebp' turns into 5. +dnl +dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0 +dnl if not. 
+ +define(x86_opcode_reg32, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_p, +m4_assert_onearg() +`x86_lookup_p(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_list, +``%eax',0, +`%ecx',1, +`%edx',2, +`%ebx',3, +`%esp',4, +`%ebp',5, +`%esi',6, +`%edi',7') + + +dnl Usage: x86_opcode_tttn(cond) +dnl +dnl Expand to the 4-bit "tttn" field value for the given x86 branch +dnl condition (like `c', `ae', etc). + +define(x86_opcode_tttn, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_ttn_list)') + +define(x86_opcode_tttn_list, +``o', 0, +`no', 1, +`b', 2, `c', 2, `nae',2, +`nb', 3, `nc', 3, `ae', 3, +`e', 4, `z', 4, +`ne', 5, `nz', 5, +`be', 6, `na', 6, +`nbe', 7, `a', 7, +`s', 8, +`ns', 9, +`p', 10, `pe', 10, `npo',10, +`np', 11, `npe',11, `po', 11, +`l', 12, `nge',12, +`nl', 13, `ge', 13, +`le', 14, `ng', 14, +`nle',15, `g', 15') + + +dnl Usage: cmovCC(srcreg,dstreg) +dnl +dnl Generate a cmov instruction if the target supports cmov, or simulate it +dnl with a conditional jump if not (the latter being meant only for +dnl testing). For example, +dnl +dnl cmovz( %eax, %ebx) +dnl +dnl cmov instructions are generated using .byte sequences, since only +dnl recent versions of gas know cmov. +dnl +dnl The source operand can only be a plain register. (m4 code implementing +dnl full memory addressing modes exists, believe it or not, but isn't +dnl currently needed and isn't included.) +dnl +dnl All the standard conditions are defined. Attempting to use one without +dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke +dnl an error. This ensures the necessary .byte sequences aren't +dnl accidentally missed. + +dnl Called: define_cmov_many(cond,tttn,cond,tttn,...) +define(define_cmov_many, +`ifelse(m4_length(`$1'),0,, +`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')') + +dnl Called: define_cmov(cond,tttn) +define(define_cmov, +m4_assert_numargs(2) +`define(`cmov$1', +m4_instruction_wrapper() +m4_assert_numargs(2) +`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl +m4_doublequote($`'1),m4_doublequote($`'2)))') + +define_cmov_many(x86_opcode_tttn_list) + + +dnl Called: cmov_internal(name,cond,tttn,src,dst) +define(cmov_internal, +m4_assert_numargs(5) +`ifelse(cmov_available_p,1, +`cmov_bytes_tttn(`$1',`$3',`$4',`$5')', +`m4_warning(`warning, simulating cmov with jump, use for testing only +')cmov_simulate(`$2',`$4',`$5')')') + +dnl Called: cmov_simulate(cond,src,dst) +dnl If this is going to be used with memory operands for the source it will +dnl need to be changed to do a fetch even if the condition is false, so as +dnl to trigger exceptions the same way a real cmov does. +define(cmov_simulate, +m4_assert_numargs(3) + `j$1 1f C cmov$1 $2, $3 + jmp 2f +1: movl $2, $3 +2:') + +dnl Called: cmov_bytes_tttn(name,tttn,src,dst) +define(cmov_bytes_tttn, +m4_assert_numargs(4) +`.byte dnl +15, dnl +eval(64+$2), dnl +eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl + C `$1 $3, $4'') + + +dnl Usage: loop_or_decljnz label +dnl +dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever +dnl is better. "loop" is better on K6 and probably on 386, on other chips +dnl separate decl/jnz is better. +dnl +dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where +dnl this loop_or_decljnz variation is enough to let the code be shared by +dnl all chips. 
+ +define(loop_or_decljnz, +`ifelse(loop_is_better_p,1, + `loop', + `decl %ecx + jnz')') + +define(loop_is_better_p, +`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6', + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_i386')') + + +dnl Usage: Zdisp(inst,op,op,op) +dnl +dnl Generate explicit .byte sequences if necessary to force a byte-sized +dnl zero displacement on an instruction. For example, +dnl +dnl Zdisp( movl, 0,(%esi), %eax) +dnl +dnl expands to +dnl +dnl .byte 139,70,0 C movl 0(%esi), %eax +dnl +dnl If the displacement given isn't 0, then normal assembler code is +dnl generated. For example, +dnl +dnl Zdisp( movl, 4,(%esi), %eax) +dnl +dnl expands to +dnl +dnl movl 4(%esi), %eax +dnl +dnl This means a single Zdisp() form can be used with an expression for the +dnl displacement, and .byte will be used only if necessary. The +dnl displacement argument is eval()ed. +dnl +dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is +dnl implemented with a table of instructions and encodings. A new entry is +dnl needed for any different operation or registers. + +define(Zdisp, +`define(`Zdisp_found',0)dnl +Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl +Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl +Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl +Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl +Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl +Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl +Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl +Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl +Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl +Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl +Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl +ifelse(Zdisp_found,0, +`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4 +')')') + +define(Zdisp_match, +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',0) + && m4_stringequal_p(`$3',`$8') + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$7'),0, +` .byte $5 C `$1 0$3, $4'', +` $6 $7$8, $9')', + +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',`$7') + && m4_stringequal_p(`$3',0) + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$8'),0, +` .byte $5 C `$1 $2, 0$4'', +` $6 $7, $8$9')')')') + + +dnl Usage: shldl(count,src,dst) +dnl 
shrdl(count,src,dst) +dnl shldw(count,src,dst) +dnl shrdw(count,src,dst) +dnl +dnl Generate a double-shift instruction, possibly omitting a %cl count +dnl parameter if that's what the assembler requires, as indicated by +dnl WANT_SHLDL_CL in config.m4. For example, +dnl +dnl shldl( %cl, %eax, %ebx) +dnl +dnl turns into either +dnl +dnl shldl %cl, %eax, %ebx +dnl or +dnl shldl %eax, %ebx +dnl +dnl Immediate counts are always passed through unchanged. For example, +dnl +dnl shrdl( $2, %esi, %edi) +dnl becomes +dnl shrdl $2, %esi, %edi +dnl +dnl +dnl If you forget to use the macro form "shldl( ...)" and instead write +dnl just a plain "shldl ...", an error results. This ensures the necessary +dnl variant treatment of %cl isn't accidentally bypassed. + +define(define_shd_instruction, +`define($1, +m4_instruction_wrapper() +m4_assert_numargs(3) +`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl +m4_doublequote($`'2),m4_doublequote($`'3)))') + +dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc +define_shd_instruction(shldl) +define_shd_instruction(shrdl) +define_shd_instruction(shldw) +define_shd_instruction(shrdw) + +dnl Called: shd_instruction(op,count,src,dst) +define(shd_instruction, +m4_assert_numargs(4) +m4_assert_defined(`WANT_SHLDL_CL') +`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1, +``$1' `$3', `$4'', +``$1' `$2', `$3', `$4'')') + + +dnl Usage: ASSERT(cond, instructions) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(ne, `cmpl %eax, %ebx') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(nc) +dnl +dnl When `instructions' is not empty, a pushf/popf is added to preserve the +dnl flags, but the instructions themselves must preserve any registers that +dnl matter. FRAME is adjusted for the push and pop, so the instructions +dnl given can use defframe() stack variables. + +define(ASSERT, +m4_assert_numargs_range(1,2) +`ifelse(WANT_ASSERT,1, + `C ASSERT +ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')') + $2 + j`$1' 1f + ud2 C assertion failed +1: +ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')') +')') + + +dnl Usage: movl_text_address(label,register) +dnl +dnl Get the address of a text segment label, using either a plain movl or a +dnl position-independent calculation, as necessary. For example, +dnl +dnl movl_code_address(L(foo),%eax) +dnl +dnl This macro is only meant for use in ASSERT()s or when testing, since +dnl the PIC sequence it generates will want to be done with a ret balancing +dnl the call on CPUs with return address branch predition. +dnl +dnl The addl generated here has a backward reference to 1b, and so won't +dnl suffer from the two forwards references bug in old gas (described in +dnl mpn/x86/README.family). + +define(movl_text_address, +`ifdef(`PIC', + `call 1f +1: popl $2 C %eip + addl `$'$1-1b, $2', + `movl `$'$1, $2')') + + +divert`'dnl diff --git a/rts/gmp/mpn/z8000/add_n.s b/rts/gmp/mpn/z8000/add_n.s new file mode 100644 index 0000000000..3a136107fe --- /dev/null +++ b/rts/gmp/mpn/z8000/add_n.s @@ -0,0 +1,53 @@ +! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! 
it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___gmpn_add_n +___gmpn_add_n: + pop r0,@r6 + pop r1,@r5 + add r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + adc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000/gmp-mparam.h b/rts/gmp/mpn/z8000/gmp-mparam.h new file mode 100644 index 0000000000..4216df673c --- /dev/null +++ b/rts/gmp/mpn/z8000/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 16 +#define BYTES_PER_MP_LIMB 2 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 16 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/z8000/mul_1.s b/rts/gmp/mpn/z8000/mul_1.s new file mode 100644 index 0000000000..20fadd340a --- /dev/null +++ b/rts/gmp/mpn/z8000/mul_1.s @@ -0,0 +1,68 @@ +! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! 
You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! size r5 +! s2_limb r4 + + unseg + .text + even + global ___gmpn_mul_1 +___gmpn_mul_1: + sub r2,r2 ! zero carry limb + and r4,r4 + jr mi,Lneg + +Lpos: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + and r1,r1 ! shift msb of loaded limb into cy + jr mi,Lp ! branch if loaded limb's msb is set + add r8,r4 ! hi_limb += sign_comp2 +Lp: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lpos + ret t + +Lneg: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + add r8,r1 ! hi_limb += sign_comp1 + and r1,r1 + jr mi,Ln + add r8,r4 ! hi_limb += sign_comp2 +Ln: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lneg + ret t diff --git a/rts/gmp/mpn/z8000/sub_n.s b/rts/gmp/mpn/z8000/sub_n.s new file mode 100644 index 0000000000..bd9a7ad409 --- /dev/null +++ b/rts/gmp/mpn/z8000/sub_n.s @@ -0,0 +1,54 @@ +! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___gmpn_sub_n +___gmpn_sub_n: + pop r0,@r6 + pop r1,@r5 + sub r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + sbc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000x/add_n.s b/rts/gmp/mpn/z8000x/add_n.s new file mode 100644 index 0000000000..7f130785c5 --- /dev/null +++ b/rts/gmp/mpn/z8000x/add_n.s @@ -0,0 +1,56 @@ +! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal, +! non-zero length. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! 
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + segm + .text + even + global ___gmpn_add_n +___gmpn_add_n: + popl rr0,@r6 + popl rr8,@r5 + addl rr0,rr8 + ldl @r7,rr0 + dec r4 + jr eq,Lend +Loop: popl rr0,@r6 + popl rr8,@r5 + adc r1,r9 + adc r0,r8 + inc r7,#4 + ldl @r7,rr0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + ld r3,r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000x/sub_n.s b/rts/gmp/mpn/z8000x/sub_n.s new file mode 100644 index 0000000000..f416d1d6eb --- /dev/null +++ b/rts/gmp/mpn/z8000x/sub_n.s @@ -0,0 +1,56 @@ +! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the +! same length > 0 and store difference in a third limb vector. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + segm + .text + even + global ___gmpn_sub_n +___gmpn_sub_n: + popl rr0,@r6 + popl rr8,@r5 + subl rr0,rr8 + ldl @r7,rr0 + dec r4 + jr eq,Lend +Loop: popl rr0,@r6 + popl rr8,@r5 + sbc r1,r9 + sbc r0,r8 + inc r7,#4 + ldl @r7,rr0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + ld r3,r4 + adc r2,r2 + ret t |
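
As a rough C reference for what the Z8000 routines above compute (a sketch only, assuming the 16-bit limb size declared in z8000/gmp-mparam.h): the names limb_t, ref_add_n and ref_mul_1 below are hypothetical helpers for illustration, not GMP API. The comment on ref_mul_1 records the sign-compensation identity that the Z8000 code appears to rely on to build an unsigned 16x16->32 product out of the signed `mult' instruction.

#include <stdint.h>
#include <stddef.h>

typedef uint16_t limb_t;              /* BITS_PER_MP_LIMB == 16 on z8000 */

/* res[i] = s1[i] + s2[i] with carry propagated between limbs; returns the
   final carry, mirroring the loop in z8000/add_n.s (sub_n.s is the same
   with borrow instead of carry).  */
static limb_t
ref_add_n (limb_t *res, const limb_t *s1, const limb_t *s2, size_t size)
{
  limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t t = (uint32_t) s1[i] + s2[i] + cy;
      res[i] = (limb_t) t;
      cy = (limb_t) (t >> 16);
    }
  return cy;
}

/* res[i] = low limb of s1[i] * s2_limb + carry; returns the carry limb left
   over, mirroring z8000/mul_1.s.  The assembly reaches the same 32-bit
   product from the signed `mult' instruction via
     u1*u2 == (int16_t)u1 * (int16_t)u2
              + ((uint32_t)u1 << 16, if the top bit of u2 is set)
              + ((uint32_t)u2 << 16, if the top bit of u1 is set)  (mod 2^32),
   which is what the "hi_limb += sign_comp1/2" adjustments correspond to.  */
static limb_t
ref_mul_1 (limb_t *res, const limb_t *s1, size_t size, limb_t s2_limb)
{
  limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t p = (uint32_t) s1[i] * s2_limb + cy;
      res[i] = (limb_t) p;
      cy = (limb_t) (p >> 16);
    }
  return cy;
}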