Diffstat (limited to 'rts/gmp/mpn')
347 files changed, 50022 insertions, 0 deletions
diff --git a/rts/gmp/mpn/Makefile.am b/rts/gmp/mpn/Makefile.am new file mode 100644 index 0000000000..1c49ccda25 --- /dev/null +++ b/rts/gmp/mpn/Makefile.am @@ -0,0 +1,94 @@ +## Process this file with automake to generate Makefile.in + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +AUTOMAKE_OPTIONS = gnu no-dependencies +SUBDIRS = tests + +CPP = @CPP@ + +# -DOPERATION_$* tells multi-function files which function to produce. +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +GENERIC_SOURCES = mp_bases.c +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +libmpn_la_SOURCES = $(GENERIC_SOURCES) +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \ + mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \ + sparc64 thumb vax x86 z8000 z8000x + +EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST) + +# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP? +COMPILE_FLAGS = \ + $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + +SUFFIXES = .s .S .asm + +# *.s are not preprocessed at all. +.s.o: + $(CCAS) $(COMPILE_FLAGS) $< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<` +.s.lo: + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $< + +# *.S are preprocessed with CPP. +.S.o: + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.S.obj: + $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s + +# We have to rebuild the static object file without passing -DPIC to +# preprocessor. The overhead cost is one extra assemblation. FIXME: +# Teach libtool how to assemble with a preprocessor pass (CPP or m4). + +.S.lo: + $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# *.m4 are preprocessed with m4. 
+.asm.o: + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.lo: + $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s diff --git a/rts/gmp/mpn/Makefile.in b/rts/gmp/mpn/Makefile.in new file mode 100644 index 0000000000..59ee958c92 --- /dev/null +++ b/rts/gmp/mpn/Makefile.in @@ -0,0 +1,472 @@ +# Makefile.in generated automatically by automake 1.4a from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. + +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_FLAG = +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : + +@SET_MAKE@ +build_alias = @build_alias@ +build_triplet = @build@ +host_alias = @host_alias@ +host_triplet = @host@ +target_alias = @target_alias@ +target_triplet = @target@ +AMDEP = @AMDEP@ +AMTAR = @AMTAR@ +AR = @AR@ +AS = @AS@ +AWK = @AWK@ +CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@ +CC = @CC@ +CCAS = @CCAS@ +CPP = @CPP@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +EXEEXT = @EXEEXT@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +M4 = @M4@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +RANLIB = @RANLIB@ +SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@ +STRIP = @STRIP@ +U = @U@ +VERSION = @VERSION@ +gmp_srclinks = @gmp_srclinks@ +install_sh = @install_sh@ +mpn_objects = @mpn_objects@ +mpn_objs_in_libgmp = @mpn_objs_in_libgmp@ + +# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +AUTOMAKE_OPTIONS = gnu no-dependencies +SUBDIRS = + +CPP = @CPP@ + +# -DOPERATION_$* tells multi-function files which function to produce. +INCLUDES = -I$(top_srcdir) -DOPERATION_$* + +GENERIC_SOURCES = mp_bases.c +OFILES = @mpn_objects@ + +noinst_LTLIBRARIES = libmpn.la +libmpn_la_SOURCES = $(GENERIC_SOURCES) +libmpn_la_LIBADD = $(OFILES) +libmpn_la_DEPENDENCIES = $(OFILES) + +TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \ + mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \ + sparc64 thumb vax x86 z8000 z8000x + + +EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST) + +# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP? +COMPILE_FLAGS = \ + $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + + +SUFFIXES = .s .S .asm +subdir = mpn +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_HEADER = ../config.h +CONFIG_CLEAN_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) + + +DEFS = @DEFS@ -I. -I$(srcdir) -I.. +CPPFLAGS = @CPPFLAGS@ +LDFLAGS = @LDFLAGS@ +LIBS = @LIBS@ +libmpn_la_LDFLAGS = +am_libmpn_la_OBJECTS = mp_bases.lo +libmpn_la_OBJECTS = $(am_libmpn_la_OBJECTS) +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CFLAGS = @CFLAGS@ +CCLD = $(CC) +LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(libmpn_la_SOURCES) +DIST_COMMON = README Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +GZIP_ENV = --best +depcomp = +SOURCES = $(libmpn_la_SOURCES) +OBJECTS = $(am_libmpn_la_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .asm .c .lo .o .obj .s +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu mpn/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +mostlyclean-noinstLTLIBRARIES: + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + +distclean-noinstLTLIBRARIES: + +maintainer-clean-noinstLTLIBRARIES: + +mostlyclean-compile: + -rm -f *.o core *.core + -rm -f *.$(OBJEXT) + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + +maintainer-clean-libtool: + +libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) + $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS) +.c.o: + $(COMPILE) -c $< +.c.obj: + $(COMPILE) -c `cygpath -w $<` +.c.lo: + $(LTCOMPILE) -c -o $@ $< + +# This directory's subdirectories are mostly independent; you can cd +# into them and run `make' without going through this Makefile. 
+# To change the values of `make' variables: instead of editing Makefiles, +# (1) if the variable is set in `config.status', edit `config.status' +# (which will cause the Makefiles to be regenerated when you run `make'); +# (2) otherwise, pass the desired values on the `make' command line. + +all-recursive install-data-recursive install-exec-recursive \ +installdirs-recursive install-recursive uninstall-recursive \ +check-recursive installcheck-recursive info-recursive dvi-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +mostlyclean-recursive clean-recursive distclean-recursive \ +maintainer-clean-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \ + rev="$$subdir $$rev"; \ + if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \ + done; \ + test "$$dot_seen" = "no" && rev=". $$rev"; \ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $$rev; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done && test -z "$$fail" +tags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . 
|| (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ + done + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -f$$here/ID $$unique $(LISP) + +TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \ + fi; \ + done; \ + list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || etags $(ETAGS_ARGS) $$tags $$unique $(LISP) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pR $$d/$$file $(distdir); \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile $(LTLIBRARIES) +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install +installdirs: installdirs-recursive +installdirs-am: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: + -rm -f Makefile.in +mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \ + mostlyclean-libtool mostlyclean-tags \ + mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \ + clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-noinstLTLIBRARIES distclean-compile \ + distclean-libtool distclean-tags distclean-generic \ + clean-am + -rm -f libtool + +distclean: distclean-recursive + +maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \ + maintainer-clean-compile maintainer-clean-libtool \ + maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-recursive + +.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \ +clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile mostlyclean-libtool distclean-libtool \ +clean-libtool maintainer-clean-libtool install-recursive \ +uninstall-recursive install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all install-strip \ +installdirs-am installdirs mostlyclean-generic distclean-generic \ +clean-generic maintainer-clean-generic clean mostlyclean distclean \ +maintainer-clean + + +# *.s are not preprocessed at all. +.s.o: + $(CCAS) $(COMPILE_FLAGS) $< +.s.obj: + $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<` +.s.lo: + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $< + +# *.S are preprocessed with CPP. +.S.o: + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.S.obj: + $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s + +# We have to rebuild the static object file without passing -DPIC to +# preprocessor. The overhead cost is one extra assemblation. FIXME: +# Teach libtool how to assemble with a preprocessor pass (CPP or m4). + +.S.lo: + $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# *.m4 are preprocessed with m4. +.asm.o: + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.obj: + $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + rm -f tmp-$*.s +.asm.lo: + $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s + $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@ + $(M4) -DOPERATION_$* $< >tmp-$*.s + $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o + rm -f tmp-$*.s + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/rts/gmp/mpn/README b/rts/gmp/mpn/README new file mode 100644 index 0000000000..7453c9d03e --- /dev/null +++ b/rts/gmp/mpn/README @@ -0,0 +1,13 @@ +This directory contains all code for the mpn layer of GMP. + +Most subdirectories contain machine-dependent code, written in assembly or C. +The `generic' subdirectory contains default code, used when there is no +machine-dependent replacement for a particular machine. + +There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC +and 64-bit SPARC are very different ISA's, and thus cannot share any code. + +A particular compile will only use code from one subdirectory, and the +`generic' subdirectory. 
The ISA-specific subdirectories contain hierachies of +directories for various architecture variants and implementations; the +top-most level contains code that runs correctly on all variants. diff --git a/rts/gmp/mpn/a29k/add_n.s b/rts/gmp/mpn/a29k/add_n.s new file mode 100644 index 0000000000..e3ee6dfa60 --- /dev/null +++ b/rts/gmp/mpn/a29k/add_n.s @@ -0,0 +1,120 @@ +; 29000 __gmpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + +; The 29k has no addition or subtraction instructions that doesn't +; affect carry, so we need to save and restore that as soon as we +; adjust the pointers. gr116 is used for this purpose. Note that +; gr116==0 means that carry should be set. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_add_n + .word 0x60000 +___gmpn_add_n: + srl gr117,lr5,3 + sub gr118,gr117,1 + jmpt gr118,Ltail + constn gr116,-1 ; init cy reg + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr104,lr4 + add lr4,lr4,32 + + subr gr116,gr116,0 ; restore carry + addc gr96,gr96,gr104 + addc gr97,gr97,gr105 + addc gr98,gr98,gr106 + addc gr99,gr99,gr107 + addc gr100,gr100,gr108 + addc gr101,gr101,gr109 + addc gr102,gr102,gr110 + addc gr103,gr103,gr111 + subc gr116,gr116,gr116 ; gr116 = not(cy) + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +; Code for the last up-to-7 limbs. +; This code might look very strange, but it's hard to write it +; differently without major slowdown. 
+ + and lr5,lr5,(8-1) +Ltail: sub gr118,lr5,1 ; count for CR + jmpt gr118,Lend + sub gr117,lr5,2 ; count for jmpfdec + + mtsr cr,gr118 + loadm 0,0,gr96,lr3 + mtsr cr,gr118 + loadm 0,0,gr104,lr4 + + subr gr116,gr116,0 ; restore carry + + jmpfdec gr117,L1 + addc gr96,gr96,gr104 + jmp Lstore + mtsr cr,gr118 +L1: jmpfdec gr117,L2 + addc gr97,gr97,gr105 + jmp Lstore + mtsr cr,gr118 +L2: jmpfdec gr117,L3 + addc gr98,gr98,gr106 + jmp Lstore + mtsr cr,gr118 +L3: jmpfdec gr117,L4 + addc gr99,gr99,gr107 + jmp Lstore + mtsr cr,gr118 +L4: jmpfdec gr117,L5 + addc gr100,gr100,gr108 + jmp Lstore + mtsr cr,gr118 +L5: jmpfdec gr117,L6 + addc gr101,gr101,gr109 + jmp Lstore + mtsr cr,gr118 +L6: addc gr102,gr102,gr110 + +Lstore: storem 0,0,gr96,lr2 + subc gr116,gr116,gr116 ; gr116 = not(cy) + +Lend: jmpi lr0 + add gr96,gr116,1 diff --git a/rts/gmp/mpn/a29k/addmul_1.s b/rts/gmp/mpn/a29k/addmul_1.s new file mode 100644 index 0000000000..f51b6d7af6 --- /dev/null +++ b/rts/gmp/mpn/a29k/addmul_1.s @@ -0,0 +1,113 @@ +; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and +; add the product to a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_addmul_1 + .word 0x60000 +___gmpn_addmul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + loadm 0,0,gr96,lr2 + + add gr104,gr96,gr104 + addc gr105,gr97,gr105 + addc gr106,gr98,gr106 + addc gr107,gr99,gr107 + addc gr108,gr100,gr108 + addc gr109,gr101,gr109 + addc gr110,gr102,gr110 + addc gr111,gr103,gr111 + addc gr120,gr120,0 + + mtsrim cr,(8-1) + storem 0,0,gr104,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + load 0,0,gr119,lr2 + add gr117,gr117,gr120 + addc gr118,gr118,0 + add gr117,gr117,gr119 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/lshift.s b/rts/gmp/mpn/a29k/lshift.s new file mode 100644 index 0000000000..93e1917127 --- /dev/null +++ b/rts/gmp/mpn/a29k/lshift.s @@ -0,0 +1,93 @@ +; 29000 __gmpn_lshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. 
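For reference, a rough plain-C rendering of what an mpn_lshift routine computes, assuming 32-bit limbs and a shift count strictly between 0 and 32 (the function and argument names here are illustrative, not taken from the file above):

    #include <stdint.h>

    /* Shift {sp, n} left by cnt bits into {rp, n}, working from the most
       significant limb downwards as the assembly does, and return the bits
       shifted out of the top limb. */
    uint32_t
    lshift_ref (uint32_t *rp, const uint32_t *sp, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;            /* complementary shift count */
      uint32_t high = sp[n - 1];
      uint32_t retval = high >> tnc;      /* bits pushed out at the top */
      long i;

      for (i = n - 1; i > 0; i--)
        {
          uint32_t low = sp[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;
      return retval;
    }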
+ + .sect .lit,lit + .text + .align 4 + .global ___gmpn_lshift + .word 0x60000 +___gmpn_lshift: + sll gr116,lr4,2 + add lr3,gr116,lr3 + add lr2,gr116,lr2 + sub lr3,lr3,4 + load 0,0,gr119,lr3 + + subr gr116,lr5,32 + srl gr96,gr119,gr116 ; return value + sub lr4,lr4,1 ; actual loop count is SIZE - 1 + + srl gr117,lr4,3 ; chuck count = (actual count) / 8 + cpeq gr118,gr117,0 + jmpt gr118,Ltail + mtsr fc,lr5 + + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: sub lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr100,lr3 + + extract gr109,gr119,gr107 + extract gr108,gr107,gr106 + extract gr107,gr106,gr105 + extract gr106,gr105,gr104 + extract gr105,gr104,gr103 + extract gr104,gr103,gr102 + extract gr103,gr102,gr101 + extract gr102,gr101,gr100 + + sub lr2,lr2,32 + mtsrim cr,(8-1) + storem 0,0,gr102,lr2 + jmpfdec gr117,Loop + or gr119,gr100,0 + +; Code for the last up-to-7 limbs. + + and lr4,lr4,(8-1) +Ltail: cpeq gr118,lr4,0 + jmpt gr118,Lend + sub lr4,lr4,2 ; count for jmpfdec + +Loop2: sub lr3,lr3,4 + load 0,0,gr116,lr3 + extract gr117,gr119,gr116 + sub lr2,lr2,4 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + or gr119,gr116,0 + +Lend: extract gr117,gr119,0 + sub lr2,lr2,4 + jmpi lr0 + store 0,0,gr117,lr2 diff --git a/rts/gmp/mpn/a29k/mul_1.s b/rts/gmp/mpn/a29k/mul_1.s new file mode 100644 index 0000000000..6bcf7ce0cf --- /dev/null +++ b/rts/gmp/mpn/a29k/mul_1.s @@ -0,0 +1,97 @@ +; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_mul_1 + .word 0x60000 +___gmpn_mul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + storem 0,0,gr104,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + add gr117,gr117,gr120 + store 0,0,gr117,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/rshift.s b/rts/gmp/mpn/a29k/rshift.s new file mode 100644 index 0000000000..ea163bff2b --- /dev/null +++ b/rts/gmp/mpn/a29k/rshift.s @@ -0,0 +1,89 @@ +; 29000 __gmpn_rshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_rshift + .word 0x60000 +___gmpn_rshift: + load 0,0,gr119,lr3 + add lr3,lr3,4 + + subr gr116,lr5,32 + sll gr96,gr119,gr116 ; return value + sub lr4,lr4,1 ; actual loop count is SIZE - 1 + + srl gr117,lr4,3 ; chuck count = (actual count) / 8 + cpeq gr118,gr117,0 + jmpt gr118,Ltail + mtsr fc,gr116 + + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. 
+Loop: mtsrim cr,(8-1) + loadm 0,0,gr100,lr3 + add lr3,lr3,32 + + extract gr98,gr100,gr119 + extract gr99,gr101,gr100 + extract gr100,gr102,gr101 + extract gr101,gr103,gr102 + extract gr102,gr104,gr103 + extract gr103,gr105,gr104 + extract gr104,gr106,gr105 + extract gr105,gr107,gr106 + + mtsrim cr,(8-1) + storem 0,0,gr98,lr2 + add lr2,lr2,32 + jmpfdec gr117,Loop + or gr119,gr107,0 + +; Code for the last up-to-7 limbs. + + and lr4,lr4,(8-1) +Ltail: cpeq gr118,lr4,0 + jmpt gr118,Lend + sub lr4,lr4,2 ; count for jmpfdec + +Loop2: load 0,0,gr100,lr3 + add lr3,lr3,4 + extract gr117,gr100,gr119 + store 0,0,gr117,lr2 + add lr2,lr2,4 + jmpfdec lr4,Loop2 + or gr119,gr100,0 + +Lend: srl gr117,gr119,lr5 + jmpi lr0 + store 0,0,gr117,lr2 diff --git a/rts/gmp/mpn/a29k/sub_n.s b/rts/gmp/mpn/a29k/sub_n.s new file mode 100644 index 0000000000..c6b64c5bee --- /dev/null +++ b/rts/gmp/mpn/a29k/sub_n.s @@ -0,0 +1,120 @@ +; 29000 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; s2_ptr lr4 +; size lr5 + +; We use the loadm/storem instructions and operate on chunks of 8 +; limbs/per iteration, until less than 8 limbs remain. + +; The 29k has no addition or subtraction instructions that doesn't +; affect carry, so we need to save and restore that as soon as we +; adjust the pointers. gr116 is used for this purpose. Note that +; gr116==0 means that carry should be set. + + .sect .lit,lit + .text + .align 4 + .global ___gmpn_sub_n + .word 0x60000 +___gmpn_sub_n: + srl gr117,lr5,3 + sub gr118,gr117,1 + jmpt gr118,Ltail + constn gr116,-1 ; init cy reg + sub gr117,gr117,2 ; count for jmpfdec + +; Main loop working 8 limbs/iteration. +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + mtsrim cr,(8-1) + loadm 0,0,gr104,lr4 + add lr4,lr4,32 + + subr gr116,gr116,0 ; restore carry + subc gr96,gr96,gr104 + subc gr97,gr97,gr105 + subc gr98,gr98,gr106 + subc gr99,gr99,gr107 + subc gr100,gr100,gr108 + subc gr101,gr101,gr109 + subc gr102,gr102,gr110 + subc gr103,gr103,gr111 + subc gr116,gr116,gr116 ; gr116 = not(cy) + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +; Code for the last up-to-7 limbs. +; This code might look very strange, but it's hard to write it +; differently without major slowdown. 
+ + and lr5,lr5,(8-1) +Ltail: sub gr118,lr5,1 ; count for CR + jmpt gr118,Lend + sub gr117,lr5,2 ; count for jmpfdec + + mtsr cr,gr118 + loadm 0,0,gr96,lr3 + mtsr cr,gr118 + loadm 0,0,gr104,lr4 + + subr gr116,gr116,0 ; restore carry + + jmpfdec gr117,L1 + subc gr96,gr96,gr104 + jmp Lstore + mtsr cr,gr118 +L1: jmpfdec gr117,L2 + subc gr97,gr97,gr105 + jmp Lstore + mtsr cr,gr118 +L2: jmpfdec gr117,L3 + subc gr98,gr98,gr106 + jmp Lstore + mtsr cr,gr118 +L3: jmpfdec gr117,L4 + subc gr99,gr99,gr107 + jmp Lstore + mtsr cr,gr118 +L4: jmpfdec gr117,L5 + subc gr100,gr100,gr108 + jmp Lstore + mtsr cr,gr118 +L5: jmpfdec gr117,L6 + subc gr101,gr101,gr109 + jmp Lstore + mtsr cr,gr118 +L6: subc gr102,gr102,gr110 + +Lstore: storem 0,0,gr96,lr2 + subc gr116,gr116,gr116 ; gr116 = not(cy) + +Lend: jmpi lr0 + add gr96,gr116,1 diff --git a/rts/gmp/mpn/a29k/submul_1.s b/rts/gmp/mpn/a29k/submul_1.s new file mode 100644 index 0000000000..ef97d8d4e5 --- /dev/null +++ b/rts/gmp/mpn/a29k/submul_1.s @@ -0,0 +1,116 @@ +; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and +; subtract the product from a second limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
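The borrow handling in the loop below is easier to follow against a plain-C sketch of the per-limb operation, assuming 32-bit limbs and a 64-bit intermediate product where the assembly uses the multiplu/multmu pair (names here are illustrative):

    #include <stdint.h>

    /* Multiply {up, n} by v, subtract the product from {rp, n}, and return
       the final borrow-and-carry limb. */
    uint32_t
    submul_1_ref (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
    {
      uint32_t cy = 0;                    /* carry limb between iterations */
      long i;

      for (i = 0; i < n; i++)
        {
          uint64_t prod = (uint64_t) up[i] * v;
          uint32_t lo = (uint32_t) prod;
          uint32_t hi = (uint32_t) (prod >> 32);
          uint32_t x, d;

          lo += cy;
          hi += lo < cy;                  /* carry from folding in cy */
          x = rp[i];
          d = x - lo;
          rp[i] = d;
          cy = hi + (d > x);              /* borrow from the subtraction */
        }
      return cy;
    }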
+ + +; INPUT PARAMETERS +; res_ptr lr2 +; s1_ptr lr3 +; size lr4 +; s2_limb lr5 + + .cputype 29050 + .sect .lit,lit + .text + .align 4 + .global ___gmpn_submul_1 + .word 0x60000 +___gmpn_submul_1: + sub lr4,lr4,8 + jmpt lr4,Ltail + const gr120,0 ; init cylimb reg + + srl gr117,lr4,3 ; divide by 8 + sub gr117,gr117,1 ; count for jmpfdec + +Loop: mtsrim cr,(8-1) + loadm 0,0,gr96,lr3 + add lr3,lr3,32 + + multiplu gr104,gr96,lr5 + multmu gr96,gr96,lr5 + multiplu gr105,gr97,lr5 + multmu gr97,gr97,lr5 + multiplu gr106,gr98,lr5 + multmu gr98,gr98,lr5 + multiplu gr107,gr99,lr5 + multmu gr99,gr99,lr5 + multiplu gr108,gr100,lr5 + multmu gr100,gr100,lr5 + multiplu gr109,gr101,lr5 + multmu gr101,gr101,lr5 + multiplu gr110,gr102,lr5 + multmu gr102,gr102,lr5 + multiplu gr111,gr103,lr5 + multmu gr103,gr103,lr5 + + add gr104,gr104,gr120 + addc gr105,gr105,gr96 + addc gr106,gr106,gr97 + addc gr107,gr107,gr98 + addc gr108,gr108,gr99 + addc gr109,gr109,gr100 + addc gr110,gr110,gr101 + addc gr111,gr111,gr102 + addc gr120,gr103,0 + + mtsrim cr,(8-1) + loadm 0,0,gr96,lr2 + + sub gr96,gr96,gr104 + subc gr97,gr97,gr105 + subc gr98,gr98,gr106 + subc gr99,gr99,gr107 + subc gr100,gr100,gr108 + subc gr101,gr101,gr109 + subc gr102,gr102,gr110 + subc gr103,gr103,gr111 + + add gr104,gr103,gr111 ; invert carry from previus sub + addc gr120,gr120,0 + + mtsrim cr,(8-1) + storem 0,0,gr96,lr2 + jmpfdec gr117,Loop + add lr2,lr2,32 + +Ltail: and lr4,lr4,(8-1) + sub gr118,lr4,1 ; count for CR + jmpt gr118,Lend + sub lr4,lr4,2 + sub lr2,lr2,4 ; offset res_ptr by one limb + +Loop2: load 0,0,gr116,lr3 + add lr3,lr3,4 + multiplu gr117,gr116,lr5 + multmu gr118,gr116,lr5 + add lr2,lr2,4 + load 0,0,gr119,lr2 + add gr117,gr117,gr120 + addc gr118,gr118,0 + sub gr119,gr119,gr117 + add gr104,gr119,gr117 ; invert carry from previus sub + store 0,0,gr119,lr2 + jmpfdec lr4,Loop2 + addc gr120,gr118,0 + +Lend: jmpi lr0 + or gr96,gr120,0 ; copy diff --git a/rts/gmp/mpn/a29k/udiv.s b/rts/gmp/mpn/a29k/udiv.s new file mode 100644 index 0000000000..fdd53a9a88 --- /dev/null +++ b/rts/gmp/mpn/a29k/udiv.s @@ -0,0 +1,30 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___udiv_qrnnd + .word 0x60000 +___udiv_qrnnd: + mtsr q,lr3 + dividu gr96,lr4,lr5 + mfsr gr116,q + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/rts/gmp/mpn/a29k/umul.s b/rts/gmp/mpn/a29k/umul.s new file mode 100644 index 0000000000..7741981167 --- /dev/null +++ b/rts/gmp/mpn/a29k/umul.s @@ -0,0 +1,29 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .sect .lit,lit + .text + .align 4 + .global ___umul_ppmm + .word 0x50000 +___umul_ppmm: + multiplu gr116,lr3,lr4 + multmu gr96,lr3,lr4 + jmpi lr0 + store 0,0,gr116,lr2 diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README new file mode 100644 index 0000000000..744260c7c5 --- /dev/null +++ b/rts/gmp/mpn/alpha/README @@ -0,0 +1,224 @@ +This directory contains mpn functions optimized for DEC Alpha processors. + +ALPHA ASSEMBLY RULES AND REGULATIONS + +The `.prologue N' pseudo op marks the end of instruction that needs +special handling by unwinding. It also says whether $27 is really +needed for computing the gp. The `.mask M' pseudo op says which +registers are saved on the stack, and at what offset in the frame. + +Cray code is very very different... + + +RELEVANT OPTIMIZATION ISSUES + +EV4 + +1. This chip has very limited store bandwidth. The on-chip L1 cache is + write-through, and a cache line is transfered from the store buffer to + the off-chip L2 in as much 15 cycles on most systems. This delay hurts + mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift. + +2. Pairing is possible between memory instructions and integer arithmetic + instructions. + +3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of + these cycles are pipelined. Thus, multiply instructions can be issued at + a rate of one each 21st cycle. + +EV5 + +1. The memory bandwidth of this chip seems excellent, both for loads and + stores. Even when the working set is larger than the on-chip L1 and L2 + caches, the performance remain almost unaffected. + +2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle. + umulh has a measured latency of 14 cycles and an issue rate of 1 each + 10th cycle. But the exact timing is somewhat confusing. + +3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12 + are memory operations. This will take at least + ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles + We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data + cache cycles, which should be completely hidden in the 19 issue cycles. + The computation is inherently serial, with these dependencies: + + ldq ldq + \ /\ + (or) addq | + |\ / \ | + | addq cmpult + \ | | + cmpult | + \ / + or + + I.e., 3 operations are needed between carry-in and carry-out, making 12 + cycles the absolute minimum for the 4 limbs. We could replace the `or' + with a cmoveq/cmovne, which could issue one cycle earlier that the `or', + but that might waste a cycle on EV4. The total depth remain unaffected, + since cmov has a latency of 2 cycles. 
+ + addq + / \ + addq cmpult + | \ + cmpult -> cmovne + +Montgomery has a slightly different way of computing carry that requires one +less instruction, but has depth 4 (instead of the current 3). Since the +code is currently instruction issue bound, Montgomery's idea should save us +1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25 +cycles/limb. Unfortunately, this method will not be good for the EV6. + +EV6 + +Here we have a really parallel pipeline, capable of issuing up to 4 integer +instructions per cycle. One integer multiply instruction can issue each +cycle. To get optimal speed, we need to pretend we are vectorizing the code, +i.e., minimize the iterative dependencies. + +There are two dependencies to watch out for. 1) Address arithmetic +dependencies, and 2) carry propagation dependencies. + +We can avoid serializing due to address arithmetic by unrolling the loop, so +that addresses don't depend heavily on an index variable. Avoiding +serializing because of carry propagation is trickier; the ultimate performance +of the code will be determined of the number of latency cycles it takes from +accepting carry-in to a vector point until we can generate carry-out. + +Most integer instructions can execute in either the L0, U0, L1, or U1 +pipelines. Shifts only execute in U0 and U1, and multiply only in U1. + +CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but +the execute efficiently. But CMOV split the mapping process (see pg 2-26 in +cmpwrgd.pdf), suggesting the CMOV should always be placed as the last +instruction of an aligned 4 instruction block (?). + +Perhaps the most important issue is the latency between the L0/U0 and L1/U1 +clusters; a result obtained on either cluster has an extra cycle of latency +for consumers in the opposite cluster. Because of the dynamic nature of the +implementation, it is hard to predict where an instruction will execute. + +The shift loops need (per limb): + 1 load (Lx pipes) + 1 store (Lx pipes) + 2 shift (Ux pipes) + 1 iaddlog (Lx pipes, Ux pipes) +Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb. + +For mpn_add_n, we currently have + 2 load (Lx pipes) + 1 store (Lx pipes) + 5 iaddlog (Lx pipes, Ux pipes) + +Again, we have a perfect balance and will be limited by carry propagation +delays, currently three cycles. The superoptimizer indicates that ther +might be sequences that--using a final cmov--have a carry propagation delay +of just two. Montgomery's subtraction sequence could perhaps be used, by +complementing some operands. All in all, we should get down to 2 cycles +without much problems. + +For mpn_mul_1, we could do, just like for mpn_add_n: + not newlo,notnewlo + addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout + addq cyout,newhi,cylimb +and get 2-cycle carry propagation. The instructions needed will be + 1 ld (Lx pipes) + 1 st (Lx pipes) + 2 mul (U1 pipe) + 4 iaddlog (Lx pipes, Ux pipes) +issue1: addq not mul ld +issue2: cmpult addq mul st +Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb! + +Last, we have mpn_addmul_1. Almost certainly, we will get down to 3 +cycles/limb, which would be absolutely awesome. 
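For concreteness, the per-limb recurrences whose carry chains are counted above look roughly like this in plain C (64-bit limbs; the hi/lo product pair stands in for mulq/umulh, and the 128-bit host type is only a convenience of the sketch):

    #include <stdint.h>

    typedef uint64_t mp_limb_t;

    /* mpn_add_n step: the addq/cmpult/addq/cmpult/bis sequence; cy is the
       only value carried from one limb to the next. */
    static inline mp_limb_t
    add_n_step (mp_limb_t *rp, mp_limb_t a, mp_limb_t b, mp_limb_t cy)
    {
      mp_limb_t s1 = b + cy;
      mp_limb_t c1 = s1 < cy;
      mp_limb_t s2 = a + s1;
      mp_limb_t c2 = s2 < a;
      *rp = s2;
      return c1 | c2;
    }

    /* mpn_mul_1 step: multiply, fold in the carry limb, emit a new one. */
    static inline mp_limb_t
    mul_1_step (mp_limb_t *rp, mp_limb_t u, mp_limb_t v, mp_limb_t cylimb)
    {
      unsigned __int128 p = (unsigned __int128) u * v;
      mp_limb_t lo = (mp_limb_t) p + cylimb;
      mp_limb_t cy = lo < cylimb;
      *rp = lo;
      return (mp_limb_t) (p >> 64) + cy;
    }

    /* mpn_addmul_1 step: as above, plus one more add/compare pair for the
       limb already in the result vector. */
    static inline mp_limb_t
    addmul_1_step (mp_limb_t *rp, mp_limb_t u, mp_limb_t v, mp_limb_t cylimb)
    {
      unsigned __int128 p = (unsigned __int128) u * v;
      mp_limb_t lo = (mp_limb_t) p + cylimb;
      mp_limb_t cy = lo < cylimb;
      mp_limb_t r = *rp + lo;
      cy += r < lo;
      *rp = r;
      return (mp_limb_t) (p >> 64) + cy;
    }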
+ +Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen): + + i + s + s i + u n + e s + d t + r + i u +l n c +i s t +v t i +e r o + u n +v c +a t t +l i y +u o p +e n e +s s s + issue + in + cycle + -1 ldq + / \ + 0 | \ + | \ + 1 | | + | | + 2 | | ldq + | | / \ + 3 | mulq | \ + | \ | \ + 4 umulh \ | | + | | | | + 5 | | | | ldq + | | | | / \ + 4calm 6 | | ldq | mulq | \ + | | / | \ | \ + 4casm 7 | | / umulh \ | | +6 | || | | | | + 3aal 8 | || | | | | ldq +7 | || | | | | / \ + 4calm 9 | || | | ldq | mulq | \ +9 | || | | / | \ | \ + 4casm 10 | || | | / umulh \ | | +9 | || | || | | | | + 3aal 11 | addq | || | | | | ldq +9 | // \ | || | | | | / \ + 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 13 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 14 addq | addq | || | | | | ldq +10 \ | // \ | || | | | | / \ + 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \ +13 \ / // \ | || | | / | \ | \ + 4casm 16 addq cmpult stq | || | | / umulh \ | | +11 \ / | || | || | | | | + 3aal 17 addq | addq | || | | | | +10 \ | // \ | || | | | | + 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq +13 \ / // \ | || | | / | \ + 4casm 19 addq cmpult stq | || | | / umulh \ +11 \ / | || | || | | + 3aal 20 addq | addq | || | | +10 \ | // \ | || | | + 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq + \ / // \ | || | | / + 22 addq cmpult stq | || | | / + \ / | || | || + 23 addq | addq | || + \ | // \ | || + 24 cy ----> \ cmpult addq<-cy | || + \ / // \ | || + 25 addq cmpult stq | || + \ / | || + 26 addq | addq + \ | // \ + 27 cy ----> \ cmpult addq<-cy + \ / // \ + 28 addq cmpult stq + \ / +As many as 6 consecutive points will be under execution simultaneously, or if we addq +schedule loads even further away, maybe 7 or 8. But the number of live quantities \ +is reasonable, and can easily be satisfied. cy ----> diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm new file mode 100644 index 0000000000..08d6a9f7b8 --- /dev/null +++ b/rts/gmp/mpn/alpha/add_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + addq r5,r6,r6 + cmpult r6,r5,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + addq r3,r4,r4 + cmpult r4,r3,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm new file mode 100644 index 0000000000..4ea900be6b --- /dev/null +++ b/rts/gmp/mpn/alpha/addmul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
+ +ASM_START() +PROLOGUE(mpn_addmul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + addq r5,r3,r3 + cmpult r3,r5,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: addq r5,r3,r3 + cmpult r3,r5,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm new file mode 100644 index 0000000000..febb3b70d9 --- /dev/null +++ b/rts/gmp/mpn/alpha/cntlz.asm @@ -0,0 +1,68 @@ +dnl Alpha auxiliary for longlong.h's count_leading_zeros + +dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl DISCUSSION: + +dnl Other methods have been tried, and using a 128-entry table actually trims +dnl about 10% of the execution time (on a 21164) when the table is in the L1 +dnl cache. But under non-benchmarking conditions, the table will hardly be in +dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables +dnl are also possible, but they require many more instructions than the current +dnl code. (But for count_trailing_zeros, such tricks are beneficial.) +dnl Finally, converting to floating-point and extracting the exponent is much +dnl slower. 
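A plain-C sketch of the successive-halving scheme the code below implements (64-bit operand; each if corresponds to a cmovne pair, and like the assembly it yields 63 for a zero argument, though callers are expected to pass a nonzero value):

    #include <stdint.h>

    int
    count_leading_zeros_ref (uint64_t x)
    {
      int cnt = 63;                  /* partial result, refined step by step */
      uint64_t t;

      t = x >> 32;  if (t != 0) { x = t; cnt = 31; }
      t = x >> 16;  if (t != 0) { x = t; cnt -= 16; }
      t = x >> 8;   if (t != 0) { x = t; cnt -= 8; }
      t = x >> 4;   if (t != 0) { x = t; cnt -= 4; }
      t = x >> 2;   if (t != 0) { x = t; cnt -= 2; }
      return cnt - (int) (x >> 1);   /* bit 1 of the two-bit survivor */
    }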
+ +ASM_START() +PROLOGUE(MPN(count_leading_zeros)) + bis r31,63,r0 C initialize partial result count + + srl r16,32,r1 C shift down 32 steps -> r1 + cmovne r1,r1,r16 C select r1 if non-zero + cmovne r1,31,r0 C if r1 is nonzero choose smaller count + + srl r16,16,r1 C shift down 16 steps -> r1 + subq r0,16,r2 C generate new partial result count + cmovne r1,r1,r16 C choose new r1 if non-zero + cmovne r1,r2,r0 C choose new count if r1 was non-zero + + srl r16,8,r1 + subq r0,8,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,4,r1 + subq r0,4,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,2,r1 + subq r0,2,r2 + cmovne r1,r1,r16 + cmovne r1,r2,r0 + + srl r16,1,r1 C extract bit 1 + subq r0,r1,r0 C subtract it from partial result + + ret r31,(r26),1 +EPILOGUE(MPN(count_leading_zeros)) +ASM_END() diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4 new file mode 100644 index 0000000000..5f4c48dc73 --- /dev/null +++ b/rts/gmp/mpn/alpha/default.m4 @@ -0,0 +1,77 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + ` + .set noreorder + .set noat') + +define(`X',`0x$1') +define(`FLOAT64', + ` + .align 3 +$1: .t_floating $2') + +define(`PROLOGUE', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + .frame r30,0,r26 + .prologue 0') + +define(`PROLOGUE_GP', + ` + .text + .align 3 + .globl $1 + .ent $1 +$1: + ldgp r29,0(r27) + .frame r30,0,r26 + .prologue 1') + +define(`EPILOGUE', + ` + .end $1') + +dnl Map register names r0, r1, etc, to `$0', `$1', etc. +dnl This is needed on all systems but Unicos +forloop(i,0,31, +`define(`r'i,``$''i)' +) +forloop(i,0,31, +`define(`f'i,``$f''i)' +) + +define(`DATASTART', + `dnl + DATA +$1:') +define(`DATAEND',`dnl') + +define(`ASM_END',`dnl') + +divert diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm new file mode 100644 index 0000000000..716d6404ae --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/add_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_add_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + addq r0,r4,r20 C 1st main add + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r20,r0,r25 C compute cy from last add + ldq r7,-8(r17) + addq r1,r5,r28 C 2nd main add + addq r18,32,r18 C update s2_ptr + addq r28,r25,r21 C 2nd carry add + cmpult r28,r5,r8 C compute cy from last add + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r21,r28,r25 C compute cy from last add + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two adds + ldq r1,8(r18) + addq r2,r6,r28 C 3rd main add + ldq r4,0(r17) + addq r28,r25,r22 C 3rd carry add + ldq r5,8(r17) + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + addq r0,r4,r28 C 1st main add + ldq r2,16(r18) + addq r25,r28,r20 C 1st carry add + ldq r3,24(r18) + cmpult r28,r4,r8 C compute cy from last add + ldq r6,-16(r17) + cmpult r20,r28,r25 C compute cy from last add + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two adds + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + addq r1,r5,r28 C 2nd main add + stq r23,-8(r16) + addq r25,r28,r21 C 2nd carry add + addq r18,32,r18 C update s2_ptr + cmpult r28,r5,r8 C compute cy from last add + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r21,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r2,r6,r28 C 3rd main add + addq r28,r25,r22 C 3rd carry add + cmpult r28,r6,r8 C compute cy from last add + cmpult r22,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + stq r21,8(r16) + addq r3,r7,r28 C 4th main add + addq r28,r25,r23 C 4th carry add + cmpult r28,r7,r8 C compute cy from last add + cmpult r23,r28,r25 C compute cy from last add + bis r8,r25,r25 C combine cy from the two adds + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: addq r0,r4,r28 C main add + ldq r0,8(r18) + cmpult r28,r4,r8 C compute cy 
from last add + ldq r4,8(r17) + addq r28,r25,r20 C carry add + addq r18,8,r18 + addq r17,8,r17 + stq r20,0(r16) + cmpult r20,r28,r25 C compute cy from last add + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two adds + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: addq r0,r4,r28 C main add + addq r28,r25,r20 C carry add + cmpult r28,r4,r8 C compute cy from last add + cmpult r20,r28,r25 C compute cy from last add + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two adds + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_add_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm new file mode 100644 index 0000000000..cb181dda66 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/lshift.asm @@ -0,0 +1,169 @@ +dnl Alpha EV5 __gmpn_lshift -- Shift a number left. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. 
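The parameter list above gives mpn_lshift's interface: shift the size-limb operand at s1_ptr left by cnt bits, store the result at res_ptr, and return the bits shifted out of the most significant limb. A minimal C sketch of that operation, working from the high end as the pipelined code below does (64-bit limbs, 0 < cnt < 64, and the names lshift_ref/limb_t are assumptions for illustration):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t limb_t;

    static limb_t lshift_ref (limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
      unsigned tnc = 64 - cnt;
      limb_t high = up[n - 1];
      limb_t ret = high >> tnc;          /* bits shifted out at the top */
      for (size_t i = n - 1; i > 0; i--)
        {
          limb_t low = up[i - 1];
          rp[i] = (high << cnt) | (low >> tnc);
          high = low;
        }
      rp[0] = high << cnt;
      return ret;
    }

Working from the high end means each source limb is read before the corresponding destination limb is written, so an in-place shift (res_ptr equal to s1_ptr) is safe; that is why the assembly advances both pointers from the end of the vectors.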
+ +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r31,r19,r20 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + srl r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,-16(r17) + subq r16,8,r16 + sll r4,r19,r5 + subq r17,8,r17 + subq r28,1,r28 + srl r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r28,$Loop0 + +$L0: sll r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,-16(r17) + subq r18,4,r18 + ldq r2,-24(r17) + ldq r3,-32(r17) + ldq r4,-40(r17) + beq r18,$Lend1 +C warm up phase 2 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + ldq r1,-48(r17) + sll r2,r19,r22 + ldq r2,-56(r17) + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + ldq r3,-64(r17) + sll r4,r19,r24 + ldq r4,-72(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + + srl r1,r20,r7 + subq r18,4,r18 + sll r1,r19,r21 + unop C ldq r31,-96(r17) + + srl r2,r20,r8 + ldq r1,-80(r17) + sll r2,r19,r22 + ldq r2,-88(r17) + + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + + srl r3,r20,r5 + unop C ldq r31,-96(r17) + sll r3,r19,r23 + subq r16,32,r16 + + srl r4,r20,r6 + ldq r3,-96(r17) + sll r4,r19,r24 + ldq r4,-104(r17) + + subq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + stq r5,-24(r16) + bis r7,r24,r7 + stq r6,-32(r16) + bis r8,r21,r8 + srl r3,r20,r5 + sll r3,r19,r23 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 2/2 + stq r7,-40(r16) + bis r5,r22,r5 + stq r8,-48(r16) + bis r6,r23,r6 + stq r5,-56(r16) + stq r6,-64(r16) +C cool down phase 2/3 + stq r24,-72(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: srl r1,r20,r7 + sll r1,r19,r21 + srl r2,r20,r8 + sll r2,r19,r22 + srl r3,r20,r5 + bis r7,r24,r7 + sll r3,r19,r23 + bis r8,r21,r8 + srl r4,r20,r6 + sll r4,r19,r24 +C cool down phase 1/2 + stq r7,-8(r16) + bis r5,r22,r5 + stq r8,-16(r16) + bis r6,r23,r6 + stq r5,-24(r16) + stq r6,-32(r16) + stq r24,-40(r16) + ret r31,(r26),1 + +$Lend: stq r24,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm new file mode 100644 index 0000000000..9940d83fad --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/rshift.asm @@ -0,0 +1,167 @@ +dnl Alpha EV5 __gmpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 3.25 cycles/limb on the EV5. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + subq r31,r19,r20 + subq r18,1,r18 + and r18,4-1,r28 C number of limbs in first loop + sll r4,r20,r0 C compute function result + + beq r28,$L0 + subq r18,r28,r18 + + ALIGN(8) +$Loop0: ldq r3,8(r17) + addq r16,8,r16 + srl r4,r19,r5 + addq r17,8,r17 + subq r28,1,r28 + sll r3,r20,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r28,$Loop0 + +$L0: srl r4,r19,r24 + beq r18,$Lend +C warm up phase 1 + ldq r1,8(r17) + subq r18,4,r18 + ldq r2,16(r17) + ldq r3,24(r17) + ldq r4,32(r17) + beq r18,$Lend1 +C warm up phase 2 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + ldq r1,40(r17) + srl r2,r19,r22 + ldq r2,48(r17) + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + ldq r3,56(r17) + srl r4,r19,r24 + ldq r4,64(r17) + subq r18,4,r18 + beq r18,$Lend2 + ALIGN(16) +C main loop +$Loop: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + + sll r1,r20,r7 + subq r18,4,r18 + srl r1,r19,r21 + unop C ldq r31,-96(r17) + + sll r2,r20,r8 + ldq r1,72(r17) + srl r2,r19,r22 + ldq r2,80(r17) + + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + + sll r3,r20,r5 + unop C ldq r31,-96(r17) + srl r3,r19,r23 + addq r16,32,r16 + + sll r4,r20,r6 + ldq r3,88(r17) + srl r4,r19,r24 + ldq r4,96(r17) + + addq r17,32,r17 + bne r18,$Loop +C cool down phase 2/1 +$Lend2: stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + stq r5,16(r16) + bis r7,r24,r7 + stq r6,24(r16) + bis r8,r21,r8 + sll r3,r20,r5 + srl r3,r19,r23 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 2/2 + stq r7,32(r16) + bis r5,r22,r5 + stq r8,40(r16) + bis r6,r23,r6 + stq r5,48(r16) + stq r6,56(r16) +C cool down phase 2/3 + stq r24,64(r16) + ret r31,(r26),1 + +C cool down phase 1/1 +$Lend1: sll r1,r20,r7 + srl r1,r19,r21 + sll r2,r20,r8 + srl r2,r19,r22 + sll r3,r20,r5 + bis r7,r24,r7 + srl r3,r19,r23 + bis r8,r21,r8 + sll r4,r20,r6 + srl r4,r19,r24 +C cool down phase 1/2 + stq r7,0(r16) + bis r5,r22,r5 + stq r8,8(r16) + bis r6,r23,r6 + stq r5,16(r16) + stq r6,24(r16) + stq r24,32(r16) + ret r31,(r26),1 + +$Lend: stq r24,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm new file mode 100644 index 0000000000..5248a2aa38 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm @@ -0,0 +1,143 @@ +dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. 
+ +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + bis r31,r31,r25 C clear cy + subq r19,4,r19 C decr loop cnt + blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop +C Start software pipeline for 1st loop + ldq r0,0(r18) + ldq r4,0(r17) + ldq r1,8(r18) + ldq r5,8(r17) + addq r17,32,r17 C update s1_ptr + ldq r2,16(r18) + subq r4,r0,r20 C 1st main subtract + ldq r3,24(r18) + subq r19,4,r19 C decr loop cnt + ldq r6,-16(r17) + cmpult r4,r0,r25 C compute cy from last subtract + ldq r7,-8(r17) + subq r5,r1,r28 C 2nd main subtract + addq r18,32,r18 C update s2_ptr + subq r28,r25,r21 C 2nd carry subtract + cmpult r5,r1,r8 C compute cy from last subtract + blt r19,$Lend1 C if less than 4 limbs remain, jump +C 1st loop handles groups of 4 limbs in a software pipeline + ALIGN(16) +$Loop: cmpult r28,r25,r25 C compute cy from last subtract + ldq r0,0(r18) + bis r8,r25,r25 C combine cy from the two subtracts + ldq r1,8(r18) + subq r6,r2,r28 C 3rd main subtract + ldq r4,0(r17) + subq r28,r25,r22 C 3rd carry subtract + ldq r5,8(r17) + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C 4th main subtract + subq r28,r25,r23 C 4th carry subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + addq r17,32,r17 C update s1_ptr + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + subq r4,r0,r28 C 1st main subtract + ldq r2,16(r18) + subq r28,r25,r20 C 1st carry subtract + ldq r3,24(r18) + cmpult r4,r0,r8 C compute cy from last subtract + ldq r6,-16(r17) + cmpult r28,r25,r25 C compute cy from last subtract + ldq r7,-8(r17) + bis r8,r25,r25 C combine cy from the two subtracts + subq r19,4,r19 C decr loop cnt + stq r22,-16(r16) + subq r5,r1,r28 C 2nd main subtract + stq r23,-8(r16) + subq r28,r25,r21 C 2nd carry subtract + addq r18,32,r18 C update s2_ptr + cmpult r5,r1,r8 C compute cy from last subtract + bge r19,$Loop +C Finish software pipeline for 1st loop +$Lend1: cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + subq r6,r2,r28 C cy add + subq r28,r25,r22 C 3rd main subtract + cmpult r6,r2,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + stq r21,8(r16) + subq r7,r3,r28 C cy add + subq r28,r25,r23 C 4th main subtract + cmpult r7,r3,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,32,r16 C update res_ptr + stq r22,-16(r16) + stq r23,-8(r16) +$Lend2: addq r19,4,r19 C restore loop cnt + beq r19,$Lret +C Start software pipeline for 2nd loop + ldq r0,0(r18) + ldq r4,0(r17) + subq r19,1,r19 + beq r19,$Lend0 +C 2nd loop handles remaining 1-3 limbs + ALIGN(16) +$Loop0: subq r4,r0,r28 C main subtract + cmpult r4,r0,r8 C compute cy from last subtract + ldq r0,8(r18) + ldq r4,8(r17) + subq r28,r25,r20 C carry subtract + addq r18,8,r18 + addq r17,8,r17 + stq 
r20,0(r16) + cmpult r28,r25,r25 C compute cy from last subtract + subq r19,1,r19 C decr loop cnt + bis r8,r25,r25 C combine cy from the two subtracts + addq r16,8,r16 + bne r19,$Loop0 +$Lend0: subq r4,r0,r28 C main subtract + subq r28,r25,r20 C carry subtract + cmpult r4,r0,r8 C compute cy from last subtract + cmpult r28,r25,r25 C compute cy from last subtract + stq r20,0(r16) + bis r8,r25,r25 C combine cy from the two subtracts + +$Lret: bis r25,r31,r0 C return cy + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm new file mode 100644 index 0000000000..2f588626a5 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm @@ -0,0 +1,474 @@ +dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and +dnl exactly 3.625 cycles/limb on EV6... + +dnl This code was written in close cooperation with ev6 pipeline expert +dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. +dnl +dnl Register usages for unrolled loop: +dnl 0-3 mul's +dnl 4-7 acc's +dnl 8-15 mul results +dnl 20,21 carry's +dnl 22,23 save for stores + +dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. + +dnl The stores can issue a cycle late so we have paired no-op's to 'catch' +dnl them, so that further disturbance to the schedule is damped. + +dnl We couldn't pair the loads, because the entangled schedule of the +dnl carry's has to happen on one side {0} of the machine. Note, the total +dnl use of U0, and the total use of L0 (after attending to the stores). +dnl which is part of the reason why.... + +dnl This is a great schedule for the d_cache, a poor schedule for the +dnl b_cache. The lockup on U0 means that any stall can't be recovered +dnl from. Consider a ldq in L1. say that load gets stalled because it +dnl collides with a fill from the b_Cache. On the next cycle, this load +dnl gets priority. If first looks at L0, and goes there. The instruction +dnl we intended for L0 gets to look at L1, which is NOT where we want +dnl it. It either stalls 1, because it can't go in L0, or goes there, and +dnl causes a further instruction to stall. + +dnl So for b_cache, we're likely going to want to put one or more cycles +dnl back into the code! And, of course, put in prefetches. For the +dnl accumulator, lds, intent to modify. 
For the multiplier, you might +dnl want ldq, evict next, if you're not wanting to use it again soon. Use +dnl 256 ahead of present pointer value. At a place where we have an mt +dnl followed by a bookkeeping, put the bookkeeping in upper, and the +dnl prefetch into lower. + +dnl Note, the usage of physical registers per cycle is smoothed off, as +dnl much as possible. + +dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd +dnl like not to have a ldq or stq to preceded a conditional branch in a +dnl quadpack. The conditional branch moves the retire pointer one cycle +dnl later. + +dnl Optimization notes: +dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27? +dnl Reserved regs: r29 r30 r31 +dnl Free caller-saves regs in unrolled code: r24 r25 r28 +dnl We should swap some of the callee-saves regs for some of the free +dnl caller-saves regs, saving some overhead cycles. +dnl Most importantly, we should write fast code for the 0-7 case. +dnl The code we use there are for the 21164, and runs at 7 cycles/limb +dnl on the 21264. Should not be hard, if we write specialized code for +dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just +dnl need a jump table indexed by the low 3 bits of the count argument. + + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpult r18, 8, r1 + beq r1, $Large + + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r18, $Lend0b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r18, 1, r18 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r18, $Lend0a C jump if size was == 2 + + ALIGN(8) +$Loop0: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r18, 1, r18 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r18, $Loop0 +$Lend0a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + ret r31, (r26), 1 +$Lend0b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r0, r5, r0 + ret r31, (r26), 1 + +$Large: + lda $30, -240($30) + stq $9, 8($30) + stq $10, 16($30) + stq $11, 24($30) + stq $12, 32($30) + stq $13, 40($30) + stq $14, 48($30) + stq $15, 56($30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r0 + beq r20, $Lunroll + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + umulh r2, r19, r0 C r0 = prod_high + beq r20, $Lend1b C jump if size was == 1 + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + subq r20, 1, r20 C size-- + addq r5, r3, r3 + cmpult r3, r5, r4 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + beq r20, 
$Lend1a C jump if size was == 2 + + ALIGN(8) +$Loop1: mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + subq r20, 1, r20 C size-- + umulh r2, r19, r4 C r4 = cy_limb + ldq r2, 0(r17) C r2 = s1_limb + addq r17, 8, r17 C s1_ptr++ + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + bne r20, $Loop1 + +$Lend1a: + mulq r2, r19, r3 C r3 = prod_low + ldq r5, 0(r16) C r5 = *res_ptr + addq r4, r0, r0 C cy_limb = cy_limb + 'cy' + umulh r2, r19, r4 C r4 = cy_limb + addq r3, r0, r3 C r3 = cy_limb + prod_low + cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low) + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r5, r0, r0 C combine carries + addq r4, r0, r0 C cy_limb = prod_high + cy + br r31, $Lunroll +$Lend1b: + addq r5, r3, r3 + cmpult r3, r5, r5 + stq r3, 0(r16) + addq r16, 8, r16 C res_ptr++ + addq r0, r5, r0 + +$Lunroll: + lda r17, -16(r17) C L1 bookkeeping + lda r16, -16(r16) C L1 bookkeeping + bis r0, r31, r12 + +C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ + + ldq r2, 16(r17) C L1 + ldq r3, 24(r17) C L1 + lda r18, -1(r18) C L1 bookkeeping + ldq r6, 16(r16) C L1 + ldq r7, 24(r16) C L1 + ldq r0, 32(r17) C L1 + mulq r19, r2, r13 C U1 + ldq r1, 40(r17) C L1 + umulh r19, r2, r14 C U1 + mulq r19, r3, r15 C U1 + lda r17, 64(r17) C L1 bookkeeping + ldq r4, 32(r16) C L1 + ldq r5, 40(r16) C L1 + umulh r19, r3, r8 C U1 + ldq r2, -16(r17) C L1 + mulq r19, r0, r9 C U1 + ldq r3, -8(r17) C L1 + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + mulq r19, r1, r11 C U1 + cmpult r6, r13, r20 C L0 lo add => carry + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 0(r17) C L1 + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + mulq r19, r3, r15 C U1 + addq r8, r21, r8 C U0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + ble r18, $Lend C U1 bookkeeping + +C ____ MAIN UNROLLED LOOP ____ + ALIGN(16) +$Loop: + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 0(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + ldq r2, 16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + ldq r3, 24(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C 
U1 mt + lda r18, -1(r18) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, 16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, 24(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 32(r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 40(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C L0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r17, 64(r17) C L1 bookkeeping + addq r4, r8, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 32(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 40(r16) C L1 + + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + ldq r2, -16(r17) C L1 + + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + ldq r3, -8(r17) C L1 + + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, 0(r16) C L0 + stq r23, 8(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r1, r11 C U1 + bis r31, r31, r31 C L1 st slosh + addq r12, r21, r12 C U0 hi mul + carry + + cmpult r6, r13, r20 C L0 lo add => carry + bis r31, r31, r31 C U1 mt + lda r16, 64(r16) C L1 bookkeeping + addq r6, r12, r22 C U0 hi add => answer + + bis r31, r31, r31 C U1 mt + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + ldq r6, -16(r16) C L1 + + bis r31, r31, r31 C U1 mt + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + ldq r7, -8(r16) C L1 + + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + ldq r0, 0(r17) C L1 + + mulq r19, r2, r13 C U1 + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + ldq r1, 8(r17) C L1 + + umulh r19, r2, r14 C U1 + addq r4, r9, r4 C L0 lo + acc + stq r22, -48(r16) C L0 + stq r23, -40(r16) C L1 + + bis r31, r31, r31 C L0 st slosh + mulq r19, r3, r15 C U1 + bis r31, r31, r31 C L1 st slosh + addq r8, r21, r8 C U0 hi mul + carry + + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + bis r31, r31, r31 C L1 mt + bgt r18, $Loop C U1 bookkeeping + +C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ +$Lend: + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + ldq r4, 0(r16) C L1 + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + ldq r5, 8(r16) C L1 + umulh r19, r3, r8 C U1 + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + mulq r19, r0, r9 C U1 + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + umulh r19, r0, r10 C U1 + addq r6, r13, r6 C L0 lo + acc + stq r22, -32(r16) C L0 + stq r23, -24(r16) C L1 + mulq r19, r1, r11 C U1 + addq r12, r21, r12 C U0 hi mul + carry + cmpult r6, r13, r20 
C L0 lo add => carry + addq r6, r12, r22 C U0 hi add => answer + cmpult r22, r12, r21 C L0 hi add => carry + addq r14, r20, r14 C U0 hi mul + carry + addq r7, r15, r23 C L0 lo + acc + addq r14, r21, r14 C U0 hi mul + carry + umulh r19, r1, r12 C U1 + cmpult r23, r15, r20 C L0 lo add => carry + addq r23, r14, r23 C U0 hi add => answer + cmpult r23, r14, r21 C L0 hi add => carry + addq r8, r20, r8 C U0 hi mul + carry + addq r4, r9, r4 C U0 lo + acc + stq r22, -16(r16) C L0 + stq r23, -8(r16) C L1 + bis r31, r31, r31 C L0 st slosh + addq r8, r21, r8 C L0 hi mul + carry + cmpult r4, r9, r20 C L0 lo add => carry + addq r4, r8, r22 C U0 hi add => answer + cmpult r22, r8, r21 C L0 hi add => carry + addq r10, r20, r10 C U0 hi mul + carry + addq r5, r11, r23 C L0 lo + acc + addq r10, r21, r10 C L0 hi mul + carry + cmpult r23, r11, r20 C L0 lo add => carry + addq r23, r10, r23 C U0 hi add => answer + cmpult r23, r10, r21 C L0 hi add => carry + addq r12, r20, r12 C U0 hi mul + carry + stq r22, 0(r16) C L0 + stq r23, 8(r16) C L1 + addq r12, r21, r0 C U0 hi mul + carry + + ldq $9, 8($30) + ldq $10, 16($30) + ldq $11, 24($30) + ldq $12, 32($30) + ldq $13, 40($30) + ldq $14, 48($30) + ldq $15, 56($30) + lda $30, 240($30) + ret r31, (r26), 1 +EPILOGUE(mpn_addmul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h new file mode 100644 index 0000000000..7ea20577f8 --- /dev/null +++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h @@ -0,0 +1,62 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-08-02. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 47 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 70 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 94 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 101 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 33 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 70 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h new file mode 100644 index 0000000000..054ff2fe5f --- /dev/null +++ b/rts/gmp/mpn/alpha/gmp-mparam.h @@ -0,0 +1,64 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the 21164 family. The 21264 will require + different values, since it has such quick multiplication. */ +/* Generated by tuneup.c, 2000-07-19. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 22 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 53 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 31 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 47 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 64 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 17 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 4 +#endif diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm new file mode 100644 index 0000000000..a921b32b3f --- /dev/null +++ b/rts/gmp/mpn/alpha/invert_limb.asm @@ -0,0 +1,345 @@ +dnl Alpha mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +dnl +dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c. +dnl The ideas are due to Peter L. Montgomery +dnl +dnl The table below uses 4096 bytes. The file mentioned above has an +dnl alternative function that doesn't require the table, but it runs 50% +dnl slower than this. 
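The routine below computes what GMP calls the inverse of a normalized limb: for a divisor d with its high bit set, the value floor((B^2 - 1)/d) - B with B = 2^64, which fits in a single limb. A minimal sketch of that quantity, leaning on the unsigned __int128 extension of GCC/Clang rather than the table-plus-refinement scheme of the assembly (the name invert_limb_ref is illustrative):

    #include <stdint.h>

    /* Requires d >= 2^63 (normalized).  floor((2^128 - 1)/d) - 2^64 equals
       floor((((2^64 - 1 - d) << 64) | (2^64 - 1)) / d), which fits in 64 bits. */
    static uint64_t invert_limb_ref (uint64_t d)
    {
      unsigned __int128 num = ((unsigned __int128)(uint64_t)~d << 64) | ~(uint64_t)0;
      return (uint64_t)(num / d);
    }

The 4096-byte table and the float-assisted refinement above exist precisely to avoid performing such a wide division directly.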
+ +include(`../config.m4') + +ASM_START() + +FLOAT64($C36,9223372036854775808.0) C 2^63 + +PROLOGUE_GP(mpn_invert_limb) + lda r30,-16(r30) + addq r16,r16,r1 + bne r1,$73 + lda r0,-1 + br r31,$Lend +$73: + srl r16,1,r1 + stq r1,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + lda r1,$C36 + ldt f10,0(r1) + divt f10,f1,f10 + lda r2,$invtab-4096 + srl r16,52,r1 + addq r1,r1,r1 + addq r1,r2,r1 + bic r1,6,r2 + ldq r2,0(r2) + bic r1,1,r1 + extwl r2,r1,r2 + sll r2,48,r0 + umulh r16,r0,r1 + addq r16,r1,r3 + stq r3,0(r30) + ldt f11,0(r30) + cvtqt f11,f1 + mult f1,f10,f1 + cvttqc f1,f1 + stt f1,0(r30) + ldq r4,0(r30) + subq r0,r4,r0 + umulh r16,r0,r1 + mulq r16,r0,r2 + addq r16,r1,r3 + bge r3,$Loop2 +$Loop1: addq r2,r16,r2 + cmpult r2,r16,r1 + addq r3,r1,r3 + addq r0,1,r0 + blt r3,$Loop1 +$Loop2: cmpult r2,r16,r1 + subq r0,1,r0 + subq r3,r1,r3 + subq r2,r16,r2 + bge r3,$Loop2 +$Lend: + lda r30,16(r30) + ret r31,(r26),1 +EPILOGUE(mpn_invert_limb) +DATASTART(`$invtab',4) + .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41 + .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46 + .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50 + .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d + .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e + .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483 + .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c + .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8 + .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8 + .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb + .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22 + .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d + .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b + .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad + .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2 + .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a + .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056 + .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95 + .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7 + .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d + .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965 + .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1 + .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600 + .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452 + .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7 + .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100 + .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b + .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9 + .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a + .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e + .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5 + .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f + .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb + .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a + .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c + .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111 + .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89 + .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03 + .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80 + .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff + .word 
0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981 + .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806 + .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d + .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516 + .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2 + .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231 + .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2 + .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55 + .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb + .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83 + .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e + .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb + .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a + .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb + .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f + .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445 + .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed + .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197 + .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044 + .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2 + .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3 + .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56 + .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b + .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2 + .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b + .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736 + .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3 + .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3 + .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374 + .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237 + .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc + .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3 + .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b + .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56 + .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23 + .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1 + .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1 + .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893 + .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767 + .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d + .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514 + .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee + .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9 + .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5 + .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084 + .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64 + .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45 + .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29 + .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e + .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5 + .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd + .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7 + .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2 + .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0 + .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e + .word 
0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e + .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370 + .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264 + .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158 + .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f + .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46 + .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40 + .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a + .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37 + .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34 + .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33 + .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934 + .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836 + .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739 + .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e + .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544 + .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b + .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354 + .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e + .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169 + .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076 + .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84 + .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93 + .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4 + .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6 + .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9 + .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add + .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3 + .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a + .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822 + .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b + .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656 + .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571 + .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e + .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac + .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb + .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec + .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d + .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030 + .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54 + .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79 + .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f + .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6 + .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee + .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18 + .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42 + .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e + .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a + .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8 + .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6 + .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626 + .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557 + .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489 + .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc + .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef + .word 
0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224 + .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a + .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091 + .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9 + .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01 + .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b + .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76 + .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1 + .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee + .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b + .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a + .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9 + .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea + .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b + .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d + .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0 + .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4 + .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539 + .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e + .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5 + .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c + .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255 + .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e + .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8 + .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033 + .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e + .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb + .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18 + .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66 + .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5 + .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05 + .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56 + .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7 + .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9 + .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c + .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0 + .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5 + .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a + .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0 + .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7 + .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e + .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7 + .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400 + .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a + .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4 + .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210 + .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c + .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8 + .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026 + .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84 + .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3 + .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42 + .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3 + .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04 + .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65 + .word 
0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8 + .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b + .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f + .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3 + .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958 + .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be + .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824 + .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b + .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3 + .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b + .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4 + .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e + .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498 + .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403 + .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f + .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db + .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247 + .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5 + .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123 + .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091 + .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001 + .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70 + .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1 + .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52 + .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3 + .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35 + .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8 + .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c + .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f + .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04 + .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79 + .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee + .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965 + .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db + .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853 + .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca + .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743 + .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc + .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635 + .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af + .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a + .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5 + .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420 + .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c + .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319 + .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296 + .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214 + .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192 + .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111 + .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090 + .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010 +DATAEND() +ASM_END() diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm new file mode 100644 index 0000000000..87c46f6fe7 --- /dev/null +++ b/rts/gmp/mpn/alpha/lshift.asm @@ -0,0 +1,104 @@ +dnl Alpha mpn_lshift -- Shift a number left. 
+ +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. +dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. + +ASM_START() +PROLOGUE(mpn_lshift) + s8addq r18,r17,r17 C make r17 point at end of s1 + ldq r4,-8(r17) C load first limb + subq r17,8,r17 + subq r31,r19,r7 + s8addq r18,r16,r16 C make r16 point at end of RES + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + srl r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,-8(r17) + subq r16,8,r16 + subq r17,8,r17 + subq r20,1,r20 + sll r4,r19,r5 + srl r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,0(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,-8(r17) + subq r16,32,r16 + subq r18,4,r18 + sll r4,r19,r5 + srl r3,r7,r6 + + ldq r4,-16(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,24(r16) + srl r4,r7,r2 + + ldq r3,-24(r17) + sll r4,r19,r5 + bis r1,r2,r8 + stq r8,16(r16) + srl r3,r7,r6 + + ldq r4,-32(r17) + sll r3,r19,r1 + bis r5,r6,r8 + stq r8,8(r16) + srl r4,r7,r2 + + subq r17,32,r17 + bis r1,r2,r8 + stq r8,0(r16) + + bgt r18,$Loop + +$Lend: sll r4,r19,r8 + stq r8,-8(r16) + ret r31,(r26),1 +EPILOGUE(mpn_lshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm new file mode 100644 index 0000000000..46b8df34f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/mul_1.asm @@ -0,0 +1,71 @@ +dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. + +ASM_START() +PROLOGUE(mpn_mul_1) + ldq r2,0(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + subq r18,1,r18 C size-- + stq r3,0(r16) + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + addq r16,8,r16 C res_ptr++ + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: stq r3,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_mul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm new file mode 100644 index 0000000000..aa25eda54e --- /dev/null +++ b/rts/gmp/mpn/alpha/rshift.asm @@ -0,0 +1,102 @@ +dnl Alpha mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl cnt r19 + +dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, +dnl it would take 4 cycles/limb. It should be possible to get down to 3 +dnl cycles/limb since both ldq and stq can be paired with the other used +dnl instructions. But there are many restrictions in the 21064 pipeline that +dnl makes it hard, if not impossible, to get down to 3 cycles/limb: + +dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. 
+dnl 2. Only aligned instruction pairs can be paired. +dnl 3. The store buffer or silo might not be able to deal with the bandwidth. + +ASM_START() +PROLOGUE(mpn_rshift) + ldq r4,0(r17) C load first limb + addq r17,8,r17 + subq r31,r19,r7 + subq r18,1,r18 + and r18,4-1,r20 C number of limbs in first loop + sll r4,r7,r0 C compute function result + + beq r20,$L0 + subq r18,r20,r18 + + ALIGN(8) +$Loop0: + ldq r3,0(r17) + addq r16,8,r16 + addq r17,8,r17 + subq r20,1,r20 + srl r4,r19,r5 + sll r3,r7,r6 + bis r3,r3,r4 + bis r5,r6,r8 + stq r8,-8(r16) + bne r20,$Loop0 + +$L0: beq r18,$Lend + + ALIGN(8) +$Loop: ldq r3,0(r17) + addq r16,32,r16 + subq r18,4,r18 + srl r4,r19,r5 + sll r3,r7,r6 + + ldq r4,8(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-32(r16) + sll r4,r7,r2 + + ldq r3,16(r17) + srl r4,r19,r5 + bis r1,r2,r8 + stq r8,-24(r16) + sll r3,r7,r6 + + ldq r4,24(r17) + srl r3,r19,r1 + bis r5,r6,r8 + stq r8,-16(r16) + sll r4,r7,r2 + + addq r17,32,r17 + bis r1,r2,r8 + stq r8,-8(r16) + + bgt r18,$Loop + +$Lend: srl r4,r19,r8 + stq r8,0(r16) + ret r31,(r26),1 +EPILOGUE(mpn_rshift) +ASM_END() diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm new file mode 100644 index 0000000000..718f657141 --- /dev/null +++ b/rts/gmp/mpn/alpha/sub_n.asm @@ -0,0 +1,114 @@ +dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
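(The rshift code just above is the mirror image of lshift: it traverses the operand from the least significant limb upward and returns the bits shifted out at the bottom.) For the subtraction routine defined in this file, a minimal C sketch of the borrow propagation follows; it is an illustration only, with an invented name ref_sub_n and the 64-bit Alpha limb assumed for mp_limb_t.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* {rp,n} = {up,n} - {vp,n}; return the final borrow, 0 or 1.  The two
   wrap tests per limb play the role of the two cmpult instructions in
   the assembly below. */
mp_limb_t
ref_sub_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
{
  mp_limb_t brw = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      mp_limb_t v = vp[i] + brw;          /* fold the borrow into the subtrahend */
      mp_limb_t b1 = v < brw;             /* wrapped only if vp[i] was all ones */
      mp_limb_t d = up[i] - v;
      mp_limb_t b2 = up[i] < v;           /* the subtraction itself wrapped */
      rp[i] = d;
      brw = b1 | b2;
    }
  return brw;
}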
+ +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl s2_ptr r18 +dnl size r19 + +ASM_START() +PROLOGUE(mpn_sub_n) + ldq r3,0(r17) + ldq r4,0(r18) + + subq r19,1,r19 + and r19,4-1,r2 C number of limbs in first loop + bis r31,r31,r0 + beq r2,$L0 C if multiple of 4 limbs, skip first loop + + subq r19,r2,r19 + +$Loop0: subq r2,1,r2 + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + addq r17,8,r17 + addq r18,8,r18 + bis r5,r5,r3 + bis r6,r6,r4 + addq r16,8,r16 + bne r2,$Loop0 + +$L0: beq r19,$Lend + + ALIGN(8) +$Loop: subq r19,4,r19 + + ldq r5,8(r17) + addq r4,r0,r4 + ldq r6,8(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + + ldq r3,16(r17) + addq r6,r0,r6 + ldq r4,16(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,8(r16) + bis r0,r1,r0 + + ldq r5,24(r17) + addq r4,r0,r4 + ldq r6,24(r18) + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,16(r16) + bis r0,r1,r0 + + ldq r3,32(r17) + addq r6,r0,r6 + ldq r4,32(r18) + cmpult r6,r0,r1 + subq r5,r6,r6 + cmpult r5,r6,r0 + stq r6,24(r16) + bis r0,r1,r0 + + addq r17,32,r17 + addq r18,32,r18 + addq r16,32,r16 + bne r19,$Loop + +$Lend: addq r4,r0,r4 + cmpult r4,r0,r1 + subq r3,r4,r4 + cmpult r3,r4,r0 + stq r4,0(r16) + bis r0,r1,r0 + ret r31,(r26),1 +EPILOGUE(mpn_sub_n) +ASM_END() diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm new file mode 100644 index 0000000000..caec1a720b --- /dev/null +++ b/rts/gmp/mpn/alpha/submul_1.asm @@ -0,0 +1,87 @@ +dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +dnl INPUT PARAMETERS +dnl res_ptr r16 +dnl s1_ptr r17 +dnl size r18 +dnl s2_limb r19 + +dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7 +dnl cycles/limb on EV6. 
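Before the scheduled loop, here is a minimal C sketch of the submul_1 operation described above: multiply each source limb by v and subtract the product from the corresponding destination limb, carrying the borrow and the product's high limb along. Illustration only; ref_submul_1 is an invented name, mp_limb_t is assumed 64-bit, and unsigned __int128 (a compiler extension) stands in for mulq/umulh.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* {rp,n} -= {up,n} * v; return the borrow out of the top limb. */
mp_limb_t
ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
      mp_limb_t lo = (mp_limb_t) t;
      mp_limb_t hi = (mp_limb_t) (t >> 64);
      mp_limb_t r = rp[i];

      rp[i] = r - lo;
      cy = hi + (r < lo);                 /* product high limb plus this limb's borrow */
    }
  return cy;
}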
+ +ASM_START() +PROLOGUE(mpn_submul_1) + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Lend1 C jump if size was == 1 + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + subq r18,1,r18 C size-- + subq r5,r3,r3 + cmpult r5,r3,r4 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + beq r18,$Lend2 C jump if size was == 2 + + ALIGN(8) +$Loop: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + subq r18,1,r18 C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,0(r17) C r2 = s1_limb + addq r17,8,r17 C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r16,8,r16 C res_ptr++ + addq r5,r0,r0 C combine carries + bne r18,$Loop + +$Lend2: mulq r2,r19,r3 C r3 = prod_low + ldq r5,0(r16) C r5 = *res_ptr + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r5,r0,r0 C combine carries + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Lend1: subq r5,r3,r3 + cmpult r5,r3,r5 + stq r3,0(r16) + addq r0,r5,r0 + ret r31,(r26),1 +EPILOGUE(mpn_submul_1) +ASM_END() diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S new file mode 100644 index 0000000000..53814bbcb0 --- /dev/null +++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S @@ -0,0 +1,151 @@ + # Alpha 21064 __udiv_qrnnd + + # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
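The divide helper in this file is a classic one-bit-at-a-time shift-and-subtract loop. As a reading aid, here is a C sketch of its simple path, taken when the divisor's top bit is clear. Illustration only: the name ref_udiv_qrnnd is invented, mp_limb_t is assumed 64-bit, and the usual udiv_qrnnd precondition n1 < d is assumed so the quotient fits in one limb.

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

/* Divide the two-limb value n1:n0 by d; store the remainder through rem
   and return the quotient.  Quotient bits are shifted into n0 from the
   bottom while the partial remainder is kept in n1. */
mp_limb_t
ref_udiv_qrnnd (mp_limb_t *rem, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
{
  int i;

  for (i = 0; i < 64; i++)                /* 16 unrolled groups of 4 in the asm */
    {
      mp_limb_t topbit = n0 >> 63;        /* cmplt n0,0,tmp */
      n1 = (n1 << 1) | topbit;
      n0 = n0 << 1;
      if (n1 >= d)                        /* cmpule d,n1,qb and the cmovne */
        {
          n1 -= d;
          n0 |= 1;                        /* record a quotient bit */
        }
    }
  *rem = n1;
  return n0;
}

The .Largedivisor path deals with a divisor whose top bit is set by halving the operands, running the same loop, and then correcting the quotient and remainder.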
+ + + .set noreorder + .set noat +.text + .align 3 + .globl __gmpn_udiv_qrnnd + .ent __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd: + .frame $30,0,$26,0 + .prologue 0 +#define cnt $2 +#define tmp $3 +#define rem_ptr $16 +#define n1 $17 +#define n0 $18 +#define d $19 +#define qb $20 + + ldiq cnt,16 + blt d,.Largedivisor + +.Loop1: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule d,n1,qb + subq n1,d,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop1 + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.Largedivisor: + and n0,1,$4 + + srl n0,1,n0 + sll n1,63,tmp + or tmp,n0,n0 + srl n1,1,n1 + + and d,1,$6 + srl d,1,$5 + addq $5,$6,$5 + +.Loop2: cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + cmplt n0,0,tmp + addq n1,n1,n1 + bis n1,tmp,n1 + addq n0,n0,n0 + cmpule $5,n1,qb + subq n1,$5,tmp + cmovne qb,tmp,n1 + bis n0,qb,n0 + subq cnt,1,cnt + bgt cnt,.Loop2 + + addq n1,n1,n1 + addq $4,n1,n1 + bne $6,.LOdd + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + +.LOdd: + /* q' in n0. r' in n1 */ + addq n1,n0,n1 + cmpult n1,n0,tmp # tmp := carry from addq + beq tmp,.LLp6 + addq n0,1,n0 + subq n1,d,n1 +.LLp6: cmpult n1,d,tmp + bne tmp,.LLp7 + addq n0,1,n0 + subq n1,d,n1 +.LLp7: + stq n1,0(rem_ptr) + bis $31,n0,$0 + ret $31,($26),1 + + .end __gmpn_udiv_qrnnd diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm new file mode 100644 index 0000000000..44428ed5f5 --- /dev/null +++ b/rts/gmp/mpn/alpha/umul.asm @@ -0,0 +1,39 @@ +dnl Currently unused. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
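Despite the "currently unused" note, this file supplies __umul_ppmm, a full 64x64 to 128 bit multiply: the code below stores the low half (mulq) through the first argument and returns the high half (umulh). A short C sketch of the same operation, illustration only, with mp_limb_t assumed 64-bit and the unsigned __int128 compiler extension standing in for the instruction pair:

typedef unsigned long mp_limb_t;          /* 64-bit limb assumed on Alpha */

mp_limb_t
ref_umul_ppmm (mp_limb_t *lo, mp_limb_t u, mp_limb_t v)
{
  unsigned __int128 p = (unsigned __int128) u * v;

  *lo = (mp_limb_t) p;                    /* the mulq result */
  return (mp_limb_t) (p >> 64);           /* the umulh result */
}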
+ + .set noreorder + .set volatile + .set noat + +.text + .align 3 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: +__umul_ppmm..ng: + .frame $30,0,$26,0 + .prologue 0 + mulq $17,$18,$1 + umulh $17,$18,$0 + stq $1,0($16) + ret $31,($26),1 + .end __umul_ppmm diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4 new file mode 100644 index 0000000000..7ff26c090c --- /dev/null +++ b/rts/gmp/mpn/alpha/unicos.m4 @@ -0,0 +1,63 @@ +divert(-1) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +define(`ASM_START', + `.ident dummy') + +define(`X',`^X$1') +define(`FLOAT64', + `dnl + .psect $1@crud,data +$1: .t_floating $2 + .endp') + +define(`PROLOGUE', + `dnl + .stack 192 ; What does this mean? Only Cray knows. + .psect $1@code,code,cache +$1::') +define(`PROLOGUE_GP', `PROLOGUE($1)') + +define(`EPILOGUE', + `dnl + .endp') + +define(`DATASTART', + `dnl + .psect $1@crud,data +$1:') +define(`DATAEND', + `dnl + .endp') + +define(`ASM_END', + `dnl + .end') + +define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop +define(`cvttqc',`cvttq/c') + +define(`ALIGN',`') ; Unicos assembler seems to align using garbage + +divert + diff --git a/rts/gmp/mpn/arm/add_n.S b/rts/gmp/mpn/arm/add_n.S new file mode 100644 index 0000000000..fb3f8f703b --- /dev/null +++ b/rts/gmp/mpn/arm/add_n.S @@ -0,0 +1,77 @@ +@ ARM mpn_add -- Add two limb vectors of the same length > 0 and store sum in +@ a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. 
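The ARM port uses 32-bit limbs, and the add_n code that follows keeps the running carry in the processor's C flag across its ldm/adcs/stm groups. A minimal C sketch of the operation, illustration only, with an invented name ref_add_n and mp_limb_t assumed to be this port's 32-bit limb:

typedef unsigned int mp_limb_t;           /* 32-bit limb assumed on ARM */

/* {rp,n} = {up,n} + {vp,n}; return the final carry, 0 or 1. */
mp_limb_t
ref_add_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, long n)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      mp_limb_t s = up[i] + vp[i];
      mp_limb_t c1 = s < up[i];           /* carry from the limb add */
      mp_limb_t r = s + cy;
      mp_limb_t c2 = r < s;               /* carry from adding the old carry */
      rp[i] = r;
      cy = c1 | c2;
    }
  return cy;
}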
+ +#define s r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_add_n + .type __gmpn_add_n,%function +__gmpn_add_n: + stmfd sp!, { r8, r9, lr } + movs n, n, lsr #1 + bcc skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + adds ip, ip, lr + str ip, [s], #4 +skip1: + tst n, #1 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + adcs r8, r8, ip + adcs r9, r9, lr + stmia s!, { r8, r9 } +skip2: + bics n, n, #1 + beq return + stmfd sp!, { r4, r5, r6, r7 } +add_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + adcs r4, r4, r8 + ldr r8, [s] /* Bring stuff into cache. */ + adcs r5, r5, r9 + adcs r6, r6, ip + adcs r7, r7, lr + stmia s!, { r4, r5, r6, r7 } + sub n, n, #2 + teq n, #0 + bne add_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + adc r0, n, #0 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_add_n, end - __gmpn_add_n diff --git a/rts/gmp/mpn/arm/addmul_1.S b/rts/gmp/mpn/arm/addmul_1.S new file mode 100644 index 0000000000..396fff77a3 --- /dev/null +++ b/rts/gmp/mpn/arm/addmul_1.S @@ -0,0 +1,89 @@ +@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and add the result to a +@ second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. 
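The file that follows defines __gmpn_addmul_1: each source limb is multiplied by v and the product is added into the corresponding destination limb, with the carry propagated; the umlal instruction supplies the 32x32 multiply with 64-bit accumulate. A minimal C sketch, illustration only, with an invented name ref_addmul_1 and 32-bit limbs assumed:

typedef unsigned int mp_limb_t;           /* 32-bit limb assumed on ARM */

/* {rp,n} += {up,n} * v; return the carry out of the top limb. */
mp_limb_t
ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v)
{
  mp_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      unsigned long long t = (unsigned long long) up[i] * v + rp[i] + cy;
      rp[i] = (mp_limb_t) t;              /* low half, as umlal leaves it */
      cy = (mp_limb_t) (t >> 32);         /* high half becomes the next carry */
    }
  return cy;
}

The mul_1 variant further on is the same loop without the accumulation: the products are simply stored, so the rp[i] term drops out of the sum above.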
+ +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define z r11 + +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_addmul_1 + .type __gmpn_addmul_1,%function +__gmpn_addmul_1: + stmfd sp!, { r8-r11, lr } + mov z, #0 + mov ip, #0 + movs n, n, lsr #1 + bcc skip1 + ldr lr, [a], #4 + ldr r9, [p] + umlal r9, ip, w, lr + str r9, [p], #4 +skip1: + movs n, n, lsr #1 + bcc skip2 + ldmia p, { r9, r10 } + adds r8, ip, r9 + adc r9, z, #0 + ldmia a!, { ip, lr } + umlal r8, r9, w, ip + adds r9, r9, r10 + adc ip, z, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + teq n, #0 + beq return + stmfd sp!, { r4-r7 } +addmul_loop: + ldmia p, { r5, r6, r7, r8 } + adds r4, ip, r5 + adc r5, z, #0 + ldmia a!, { r9, r10, ip, lr } + umlal r4, r5, w, r9 + adds r5, r5, r6 + adc r6, z, #0 + umlal r5, r6, w, r10 + adds r6, r6, r7 + adc r7, z, #0 + umlal r6, r7, w, ip + adds r7, r7, r8 + adc ip, z, #0 + umlal r7, ip, w, lr + subs n, n, #1 + stmia p!, { r4, r5, r6, r7 } + bne addmul_loop + ldmfd sp!, { r4-r7 } +return: + mov r0, ip + ldmfd sp!, { r8-r11, pc } +end: + .size __gmpn_addmul_1, end - __gmpn_addmul_1 diff --git a/rts/gmp/mpn/arm/gmp-mparam.h b/rts/gmp/mpn/arm/gmp-mparam.h new file mode 100644 index 0000000000..a35b0c7b66 --- /dev/null +++ b/rts/gmp/mpn/arm/gmp-mparam.h @@ -0,0 +1,34 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 21 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 48 +#endif diff --git a/rts/gmp/mpn/arm/mul_1.S b/rts/gmp/mpn/arm/mul_1.S new file mode 100644 index 0000000000..bae526a0f0 --- /dev/null +++ b/rts/gmp/mpn/arm/mul_1.S @@ -0,0 +1,81 @@ +@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and store the result +@ in a second limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define p r0 +#define a r1 +#define n r2 +#define w r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_mul_1 + .type __gmpn_mul_1,%function +__gmpn_mul_1: + stmfd sp!, { r8, r9, lr } + ands ip, n, #1 + beq skip1 + ldr lr, [a], #4 + umull r9, ip, w, lr + str r9, [p], #4 +skip1: + tst n, #2 + beq skip2 + mov r8, ip + ldmia a!, { ip, lr } + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + stmia p!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r6, r7 } +mul_1_loop: + mov r6, ip + ldmia a!, { r8, r9, ip, lr } + ldr r7, [p] /* Bring stuff into cache. */ + mov r7, #0 + umlal r6, r7, w, r8 + mov r8, #0 + umlal r7, r8, w, r9 + mov r9, #0 + umlal r8, r9, w, ip + mov ip, #0 + umlal r9, ip, w, lr + subs n, n, #4 + stmia p!, { r6, r7, r8, r9 } + bne mul_1_loop + ldmfd sp!, { r6, r7 } +return: + mov r0, ip + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_mul_1, end - __gmpn_mul_1 diff --git a/rts/gmp/mpn/arm/sub_n.S b/rts/gmp/mpn/arm/sub_n.S new file mode 100644 index 0000000000..856505fe21 --- /dev/null +++ b/rts/gmp/mpn/arm/sub_n.S @@ -0,0 +1,79 @@ +@ ARM mpn_sub -- Subtract two limb vectors of the same length > 0 and store +@ difference in a third limb vector. +@ Contributed by Robert Harley. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + +#define d r0 +#define a r1 +#define b r2 +#define n r3 + +#define sl r10 +#define fp r11 +#define ip r12 +#define sp r13 +#define lr r14 +#define pc r15 + +.text + .align 0 + .global __gmpn_sub_n + .type __gmpn_sub_n,%function +__gmpn_sub_n: + stmfd sp!, { r8, r9, lr } + subs ip, ip, ip + tst n, #1 + beq skip1 + ldr ip, [a], #4 + ldr lr, [b], #4 + subs ip, ip, lr + str ip, [d], #4 +skip1: + tst n, #2 + beq skip2 + ldmia a!, { r8, r9 } + ldmia b!, { ip, lr } + sbcs r8, r8, ip + sbcs r9, r9, lr + stmia d!, { r8, r9 } +skip2: + bics n, n, #3 + beq return + stmfd sp!, { r4, r5, r6, r7 } +sub_n_loop: + ldmia a!, { r4, r5, r6, r7 } + ldmia b!, { r8, r9, ip, lr } + sbcs r4, r4, r8 + ldr r8, [d] /* Bring stuff into cache. 
*/ + sbcs r5, r5, r9 + sbcs r6, r6, ip + sbcs r7, r7, lr + stmia d!, { r4, r5, r6, r7 } + sub n, n, #4 + teq n, #0 + bne sub_n_loop + ldmfd sp!, { r4, r5, r6, r7 } +return: + sbc r0, r0, r0 + and r0, r0, #1 + ldmfd sp!, { r8, r9, pc } +end: + .size __gmpn_sub_n, end - __gmpn_sub_n diff --git a/rts/gmp/mpn/asm-defs.m4 b/rts/gmp/mpn/asm-defs.m4 new file mode 100644 index 0000000000..aa2024138b --- /dev/null +++ b/rts/gmp/mpn/asm-defs.m4 @@ -0,0 +1,1182 @@ +divert(-1) +dnl +dnl m4 macros for gmp assembly code, shared by all CPUs. +dnl +dnl These macros are designed for use with any m4 and have been used on +dnl GNU, FreeBSD, OpenBSD and SysV. +dnl +dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error +dnl messages. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Macros: +dnl +dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're +dnl m4 expansions. But new defining things like deflit() and defreg() are +dnl named like the builtin define(), and forloop() is named following the +dnl GNU m4 example on which it's based. +dnl +dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that +dnl option isn't going to be used, so there's no conflict or confusion. +dnl +dnl +dnl Comments in output: +dnl +dnl The m4 comment delimiters are left at # and \n, the normal assembler +dnl commenting for most CPUs. m4 passes comment text through without +dnl expanding macros in it, which is generally a good thing since it stops +dnl unexpected expansions and possible resultant errors. +dnl +dnl But note that when a quoted string is being read, a # isn't special, so +dnl apostrophes in comments in quoted strings must be avoided or they'll be +dnl interpreted as a closing quote mark. But when the quoted text is +dnl re-read # will still act like a normal comment, supressing macro +dnl expansion. +dnl +dnl For example, +dnl +dnl # apostrophes in comments that're outside quotes are ok +dnl # and using macro names like PROLOGUE is ok too +dnl ... +dnl ifdef(`PIC',` +dnl # but apostrophes aren't ok inside quotes +dnl # ^--wrong +dnl ... +dnl # though macro names like PROLOGUE are still ok +dnl ... +dnl ') +dnl +dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a +dnl quoted hash symbol), which will turn into # in the .s but get +dnl expansions done on that line. This can make the .s more readable to +dnl humans, but it won't make a blind bit of difference to the assembler. +dnl +dnl All the above applies, mutatis mutandis, when changecom() is used to +dnl select @ ! ; or whatever other commenting. 
+dnl +dnl +dnl Variations in m4 affecting gmp: +dnl +dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $# +dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though +dnl "foo()" sets $# to 1. This is worked around in various places. +dnl +dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to +dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0. +dnl See m4_length() below which works around this. +dnl +dnl translit() - GNU m4 accepts character ranges like A-Z, and the new +dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't. +dnl +dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and +dnl pops each, but GNU m4 only takes one argument. +dnl +dnl push back - BSD m4 has some limits on the amount of text that can be +dnl pushed back. The limit is reasonably big and so long as macros +dnl don't gratuitously duplicate big arguments it isn't a problem. +dnl Normally an error message is given, but sometimes it just hangs. +dnl +dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available, +dnl but BSD m4 doesn't (contrary to what the man page suggests) and +dnl instead ^ is exponentiation. +dnl +dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not +dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5). +dnl +dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31 +dnl (ie. -2147483648) gives "-(". Using -2147483648 within an +dnl expression is ok, it just can't be a final result. "-(" will of +dnl course upset parsing, with all sorts of strange effects. +dnl +dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on +dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See +dnl m4_lshift() and m4_rshift() below for workarounds. +dnl +dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string, +dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See +dnl m4wrap_prepend() below which brings uniformity to this. +dnl +dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and +dnl they're used here to make error messages more informative. GNU m4 +dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked +dnl around. +dnl +dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the +dnl filename in __file__, so care should be taken that no macro has +dnl the same name as a file, or an unwanted expansion will occur when +dnl printing an error or warning. +dnl +dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9 +dnl in eval(), making it pretty much unusable. This bug is confined +dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7). +dnl +dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features, +dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(), +dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be +dnl available, and "configure" will reject /usr/bin/m4 in favour of +dnl /usr/5bin/m4 (if necessary). +dnl +dnl The sparc code actually has modest m4 requirements currently and +dnl could manage with /usr/bin/m4, but there's no reason to put our +dnl macros through contortions when /usr/5bin/m4 is available or GNU +dnl m4 can be installed. + + +ifdef(`__ASM_DEFS_M4_INCLUDED__', +`m4_error(`asm-defs.m4 already included, dont include it twice +')m4exit(1)') +define(`__ASM_DEFS_M4_INCLUDED__') + + +dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4. 
+ +ifelse(eval(89),89,, +`errprint( +`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable. +This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7, +or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget +to configure with M4=/wherever/m4 if you install one of these in a directory +not in $PATH. +')m4exit(1)') + + +dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4. +dnl +dnl Unfortunately this test doesn't work when m4 is run in the normal way +dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4 +dnl takes "-" in "-D..." to mean read stdin, so it will look like it just +dnl hangs. But running "m4 asm-defs.m4" to try it out will work. +dnl +dnl We'd like to abort immediately on finding a problem, but unfortunately +dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill +dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment +dnl out some closing parentheses and kill it with "m4: arg stack overflow". + +define(m4_dollarhash_works_test,``$#'') +ifelse(m4_dollarhash_works_test(x),1,, +`errprint( +`This m4 doesnt support $# and cant be used for GMP asm processing. +If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that +or can get it, otherwise install GNU m4. Dont forget to configure with +M4=/wherever/m4 if you install in a directory not in $PATH. +')') +undefine(`m4_dollarhash_works_test') + + +dnl -------------------------------------------------------------------------- +dnl Basic error handling things. + + +dnl Usage: m4_dollarhash_1_if_noparen_p +dnl +dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU +dnl and SysV m4 give). + +define(m4_dollarhash_1_if_noparen_test,`$#') +define(m4_dollarhash_1_if_noparen_p, +eval(m4_dollarhash_1_if_noparen_test==1)) +undefine(`m4_dollarhash_1_if_noparen_test') + + +dnl Usage: m4wrap_prepend(string) +dnl +dnl Prepend the given string to what will be exapanded under m4wrap at the +dnl end of input. +dnl +dnl This macro exists to work around variations in m4wrap() behaviour in +dnl the various m4s (notes at the start of this file). Don't use m4wrap() +dnl directly since it will interfere with this scheme. + +define(m4wrap_prepend, +m4_assert_numargs(1) +`define(`m4wrap_string',`$1'defn(`m4wrap_string'))') + +m4wrap(`m4wrap_string') +define(m4wrap_string,`') + + +dnl Usage: m4_file_and_line +dnl +dnl Expand to the current file and line number, if the GNU m4 extensions +dnl __file__ and __line__ are available. +dnl +dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded, +dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to +dnl print. If m4_file_seen() has been called to note the last file seen, +dnl then that file at a big line number is used, otherwise "end of input" +dnl is used (although "end of input" won't parse as an error message). + +define(m4_file_and_line, +`ifdef(`__file__', +`ifelse(__file__`'__line__,`NONE0', +`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')', +`__file__: __line__: ')')') + + +dnl Usage: m4_errprint_commas(arg,...) +dnl +dnl The same as errprint(), but commas are printed between arguments +dnl instead of spaces. + +define(m4_errprint_commas, +`errprint(`$1')dnl +ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')') + + +dnl Usage: m4_error(args...) +dnl m4_warning(args...) 
+dnl +dnl Print an error message, using m4_errprint_commas, prefixed with the +dnl current filename and line number (if available). m4_error sets up to +dnl give an error exit at the end of processing, m4_warning just prints. +dnl These macros are the recommended way to print errors. +dnl +dnl The arguments here should be quoted in the usual way to prevent them +dnl being expanded when the macro call is read. (m4_error takes care not +dnl to do any further expansion.) +dnl +dnl For example, +dnl +dnl m4_error(`some error message +dnl ') +dnl +dnl which prints +dnl +dnl foo.asm:123: some error message +dnl +dnl or if __file__ and __line__ aren't available +dnl +dnl some error message +dnl +dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so +dnl emacs and other editors will recognise it in their normal error message +dnl parsing. + +define(m4_warning, +`m4_errprint_commas(m4_file_and_line`'$@)') + +define(m4_error, +`define(`m4_error_occurred',1)m4_warning($@)') + +define(`m4_error_occurred',0) + +dnl This m4wrap_prepend() is first, so it'll be executed last. +m4wrap_prepend( +`ifelse(m4_error_occurred,1, +`m4_error(`Errors occurred during m4 processing +')m4exit(1)')') + + +dnl Usage: m4_assert_numargs(num) +dnl +dnl Put this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check that num many arguments get passed +dnl to the macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs(2) +dnl `something `$1' and `$2' blah blah') +dnl +dnl Then a call like foo(one,two,three) will provoke an error like +dnl +dnl file:10: foo expected 2 arguments, got 3 arguments +dnl +dnl Here are some calls and how many arguments they're interpreted as passing. +dnl +dnl foo(abc,def) 2 +dnl foo(xyz) 1 +dnl foo() 0 +dnl foo -1 +dnl +dnl The -1 for no parentheses at all means a macro that's meant to be used +dnl that way can be checked with m4_assert_numargs(-1). For example, +dnl +dnl define(SPECIAL_SUFFIX, +dnl m4_assert_numargs(-1) +dnl `ifdef(`FOO',`_foo',`_bar')') +dnl +dnl But as an alternative see also deflit() below where parenthesized +dnl expressions following a macro are passed through to the output. +dnl +dnl Note that in BSD m4 there's no way to differentiate calls "foo" and +dnl "foo()", so in BSD m4 the distinction between the two isn't enforced. +dnl (In GNU and SysV m4 it can be checked, and is.) + + +dnl m4_assert_numargs is able to check its own arguments by calling +dnl assert_numargs_internal directly. +dnl +dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand +dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is +dnl done in other assert macros. +dnl +dnl $`#' leaves $# in the new macro being defined, and stops # being +dnl interpreted as a comment character. +dnl +dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is +dnl used. The space means that if there is a dnl it'll still work. 
+ +dnl Usage: m4_doublequote(x) expands to ``x'' +define(m4_doublequote, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''') + +define(m4_assert_numargs, +`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl +`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1')) +define(m4_assert_numargs_internal, +`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))') + +dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be +dnl distinguished then it's allowed to pass. +dnl +define(m4_assert_numargs_internal_check, +`ifelse(eval($2 == $3 + || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0, +`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3') +)')') + +dnl Called: m4_numargs_count($#,len(`$1')) +dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise +dnl $# args. +define(m4_numargs_count, +`ifelse($1,0, -1, +`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')') + +dnl Usage: m4_Narguments(N) +dnl "$1 argument" or "$1 arguments" with the plural according to $1. +define(m4_Narguments, +`$1 argument`'ifelse(`$1',1,,s)') + + +dnl -------------------------------------------------------------------------- +dnl Additional error checking things. + + +dnl Usage: m4_file_seen() +dnl +dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text. +dnl The basic __file__ macro comes out quoted, like `foo.asm', and +dnl m4_file_seen_last is defined like that too. +dnl +dnl This only needs to be used with something that could generate an error +dnl message in m4wrap text. The x86 PROLOGUE is the only such at the +dnl moment (at end of input its m4wrap checks for missing EPILOGUE). A few +dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE +dnl in the same file as the PROLOGUE. + +define(m4_file_seen, +m4_assert_numargs(0) +`ifelse(__file__,`NONE',, +`define(`m4_file_seen_last',m4_doublequote(__file__))')') + + +dnl Usage: m4_assert_onearg() +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that one argument is passed to the macro, but with that +dnl argument allowed to be empty. For example, +dnl +dnl define(foo, +dnl m4_assert_onearg() +dnl `blah blah $1 blah blah') +dnl +dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails. +dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV +dnl m4 can). + +define(m4_assert_onearg, +m4_assert_numargs(0) +`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ') + +dnl Called: m4_assert_onearg(`macroname',$#) +define(m4_assert_onearg_internal, +`ifelse($2,1,, +`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2') +)')') + + +dnl Usage: m4_assert_numargs_range(low,high) +dnl +dnl Put this, unquoted, at the start of a macro definition to add some code +dnl to check that between low and high many arguments get passed to the +dnl macro. For example, +dnl +dnl define(foo, +dnl m4_assert_numargs_range(3,5) +dnl `mandatory $1 $2 $3 optional $4 $5 end') +dnl +dnl See m4_assert_numargs() for more info. 
+ +define(m4_assert_numargs_range, +m4_assert_numargs(2) +``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '') + +dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1')) +define(m4_assert_numargs_range_internal, +m4_assert_numargs(5) +`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))') + +dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs) +dnl +dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it +dnl should be -1. To ensure a `high' of -1 works, a fudge is applied to +dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished. +dnl +define(m4_assert_numargs_range_check, +m4_assert_numargs(4) +`ifelse(eval($2 <= $4 && + ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0, +`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4') +)')') + + +dnl Usage: m4_assert_defined(symbol) +dnl +dnl Put this unquoted on a line of its own at the start of a macro +dnl definition to add some code to check that the given symbol is defined +dnl when the macro is used. For example, +dnl +dnl define(foo, +dnl m4_assert_defined(`FOO_PREFIX') +dnl `FOO_PREFIX whatever') +dnl +dnl This is a convenient way to check that the user or ./configure or +dnl whatever has defined the things needed by a macro, as opposed to +dnl silently generating garbage. + +define(m4_assert_defined, +m4_assert_numargs(1) +``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '') + +dnl Called: m4_assert_defined_internal(`macroname',`define_required') +define(m4_assert_defined_internal, +m4_assert_numargs(2) +`ifdef(`$2',, +`m4_error(`$1 needs $2 defined +')')') + + +dnl Usage: m4_not_for_expansion(`SYMBOL') +dnl define_not_for_expansion(`SYMBOL') +dnl +dnl m4_not_for_expansion turns SYMBOL, if defined, into something which +dnl will give an error if expanded. For example, +dnl +dnl m4_not_for_expansion(`PIC') +dnl +dnl define_not_for_expansion is the same, but always makes a definition. +dnl +dnl These are for symbols that should be tested with ifdef(`FOO',...) +dnl rather than be expanded as such. They guard against accidentally +dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only +dnl catches this when FOO is defined, so be sure to test code both with and +dnl without each definition. + +define(m4_not_for_expansion, +m4_assert_numargs(1) +`ifdef(`$1',`define_not_for_expansion(`$1')')') + +define(define_not_for_expansion, +m4_assert_numargs(1) +`ifelse(defn(`$1'),,, +`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion() +')')dnl +define(`$1',`m4_not_for_expansion_internal(`$1')')') + +define(m4_not_for_expansion_internal, +`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)' +')') + + +dnl -------------------------------------------------------------------------- +dnl Various generic m4 things. + + +dnl Usage: m4_ifdef_anyof_p(`symbol',...) +dnl +dnl Expand to 1 if any of the symbols in the argument list are defined, or +dnl to 0 if not. + +define(m4_ifdef_anyof_p, +`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0, +`ifdef(`$1', 1, +`m4_ifdef_anyof_p(shift($@))')')') + + +dnl Usage: m4_length(string) +dnl +dnl Determine the length of a string. This is the same as len(), but +dnl always expands to a number, working around the BSD len() which +dnl evaluates to nothing given an empty argument. 
+ +define(m4_length, +m4_assert_onearg() +`eval(len(`$1')-0)') + + +dnl Usage: m4_stringequal_p(x,y) +dnl +dnl Expand to 1 or 0 according as strings x and y are equal or not. + +define(m4_stringequal_p, +`ifelse(`$1',`$2',1,0)') + + +dnl Usage: m4_incr_or_decr(n,last) +dnl +dnl Do an incr(n) or decr(n), whichever is in the direction of "last". +dnl Both n and last must be numbers of course. + +define(m4_incr_or_decr, +m4_assert_numargs(2) +`ifelse(eval($1<$2),1,incr($1),decr($1))') + + +dnl Usage: forloop(i, first, last, statement) +dnl +dnl Based on GNU m4 examples/forloop.m4, but extended. +dnl +dnl statement is expanded repeatedly, with i successively defined as +dnl +dnl first, first+1, ..., last-1, last +dnl +dnl Or if first > last, then it's +dnl +dnl first, first-1, ..., last+1, last +dnl +dnl If first == last, then one expansion is done. +dnl +dnl A pushdef/popdef of i is done to preserve any previous definition (or +dnl lack of definition). first and last are eval()ed and so can be +dnl expressions. +dnl +dnl forloop_first is defined to 1 on the first iteration, 0 on the rest. +dnl forloop_last is defined to 1 on the last iteration, 0 on the others. +dnl Nested forloops are allowed, in which case forloop_first and +dnl forloop_last apply to the innermost loop that's open. +dnl +dnl A simple example, +dnl +dnl forloop(i, 1, 2*2+1, `dnl +dnl iteration number i ... ifelse(forloop_first,1,FIRST) +dnl ') + + +dnl "i" and "statement" are carefully quoted, but "first" and "last" are +dnl just plain numbers once eval()ed. + +define(`forloop', +m4_assert_numargs(4) +`pushdef(`$1',eval(`$2'))dnl +pushdef(`forloop_first',1)dnl +pushdef(`forloop_last',0)dnl +forloop_internal(`$1',eval(`$3'),`$4')`'dnl +popdef(`forloop_first')dnl +popdef(`forloop_last')dnl +popdef(`$1')') + +dnl Called: forloop_internal(`var',last,statement) +define(`forloop_internal', +m4_assert_numargs(3) +`ifelse($1,$2, +`define(`forloop_last',1)$3', +`$3`'dnl +define(`forloop_first',0)dnl +define(`$1',m4_incr_or_decr($1,$2))dnl +forloop_internal(`$1',$2,`$3')')') + + +dnl Usage: m4_toupper(x) +dnl m4_tolower(x) +dnl +dnl Convert the argument string to upper or lower case, respectively. +dnl Only one argument accepted. +dnl +dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet +dnl is written out. + +define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz') +define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +define(m4_toupper, +m4_assert_onearg() +`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)') + +define(m4_tolower, +m4_assert_onearg() +`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)') + + +dnl Usage: m4_empty_if_zero(x) +dnl +dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an +dnl expression. +dnl +dnl This is useful for x86 addressing mode displacements since forms like +dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as +dnl foo(%ebx) could be defined with the following so it'll be empty if the +dnl expression comes out zero. +dnl +dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)') +dnl +dnl Naturally this shouldn't be done if, say, a computed jump depends on +dnl the code being a particular size. + +define(m4_empty_if_zero, +m4_assert_onearg() +`ifelse(eval($1),0,,eval($1))') + + +dnl Usage: m4_log2(x) +dnl +dnl Calculate a logarithm to base 2. +dnl x must be an integral power of 2, between 2**0 and 2**30. +dnl x is eval()ed, so it can be an expression. +dnl An error results if x is invalid. 
+dnl +dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range +dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval() +dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-(' +dnl means tests like eval(1<<31==(x)) would be necessary, but that then +dnl gives an unattractive explosion of eval() error messages if x isn't +dnl numeric. + +define(m4_log2, +m4_assert_numargs(1) +`m4_log2_internal(0,1,eval(`$1'))') + +dnl Called: m4_log2_internal(n,2**n,target) +define(m4_log2_internal, +m4_assert_numargs(3) +`ifelse($2,$3,$1, +`ifelse($1,30, +`m4_error(`m4_log2() argument too big or not a power of two: $3 +')', +`m4_log2_internal(incr($1),eval(2*$2),$3)')')') + + +dnl Usage: m4_div2_towards_zero +dnl +dnl m4 division is probably whatever a C signed division is, and C doesn't +dnl specify what rounding gets used on negatives, so this expression forces +dnl a rounding towards zero. + +define(m4_div2_towards_zero, +m4_assert_numargs(1) +`eval((($1) + ((($1)<0) & ($1))) / 2)') + + +dnl Usage: m4_lshift(n,count) +dnl m4_rshift(n,count) +dnl +dnl Calculate n shifted left or right by count many bits. Both n and count +dnl are eval()ed and so can be expressions. +dnl +dnl Negative counts are allowed and mean a shift in the opposite direction. +dnl Negative n is allowed and right shifts will be arithmetic (meaning +dnl divide by 2**count, rounding towards zero, also meaning the sign bit is +dnl duplicated). +dnl +dnl Use these macros instead of << and >> in eval() since the basic ccs +dnl SysV m4 doesn't have those operators. + +define(m4_rshift, +m4_assert_numargs(2) +`m4_lshift(`$1',-(`$2'))') + +define(m4_lshift, +m4_assert_numargs(2) +`m4_lshift_internal(eval(`$1'),eval(`$2'))') + +define(m4_lshift_internal, +m4_assert_numargs(2) +`ifelse(eval($2-0==0),1,$1, +`ifelse(eval($2>0),1, +`m4_lshift_internal(eval($1*2),decr($2))', +`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')') + + +dnl Usage: deflit(name,value) +dnl +dnl Like define(), but "name" expands like a literal, rather than taking +dnl arguments. For example "name(%eax)" expands to "value(%eax)". +dnl +dnl Limitations: +dnl +dnl $ characters in the value part must have quotes to stop them looking +dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See +dnl defreg() below for handling simple register definitions like $7 etc. +dnl +dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an +dnl error is generated when this happens, but in BSD m4 it will happen +dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or +dnl "name()", so there's no way to differentiate them. Because we want +dnl plain "name" to turn into plain "value", we end up with "name()" +dnl turning into plain "value" too. +dnl +dnl "name(foo)" will lose any whitespace after commas in "foo", for example +dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)". +dnl +dnl These parentheses oddities shouldn't matter in assembler text, but if +dnl they do the suggested workaround is to write "name ()" or "name (foo)" +dnl to stop the parentheses looking like a macro argument list. If a space +dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)". +dnl The `' is stripped when read, but again stops the parentheses looking +dnl like parameters. + +dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The +dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro +dnl created, not in deflit. 
+define(deflit, +m4_assert_numargs(2) +`define(`$1', +`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl +$2`'dnl +ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')') + +dnl Called: deflit_emptyargcheck(macroname,$#,`$1') +define(deflit_emptyargcheck, +`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1, +`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-incl.m4 for more information) +')')') + + +dnl Usage: m4_assert(`expr') +dnl +dnl Test a compile-time requirement with an m4 expression. The expression +dnl should be quoted, and will be eval()ed and expected to be non-zero. +dnl For example, +dnl +dnl m4_assert(`FOO*2+6 < 14') + +define(m4_assert, +m4_assert_numargs(1) +`ifelse(eval($1),1,, +`m4_error(`assertion failed: $1 +')')') + + +dnl -------------------------------------------------------------------------- +dnl Various assembler things, not specific to any particular CPU. +dnl + + +dnl Usage: include_mpn(`filename') +dnl +dnl Like include(), but adds a path to the mpn source directory. For +dnl example, +dnl +dnl include_mpn(`sparc64/addmul_1h.asm') + +define(include_mpn, +m4_assert_numargs(1) +m4_assert_defined(`CONFIG_TOP_SRCDIR') +`include(CONFIG_TOP_SRCDIR`/mpn/$1')') + + +dnl Usage: C comment ... +dnl +dnl "C" works like a FORTRAN-style comment character. This can be used for +dnl comments to the right of assembly instructions, where just dnl would +dnl remove the linefeed, and concatenate adjacent lines. +dnl +dnl "C" and/or "dnl" are useful when an assembler doesn't support comments, +dnl or where different assemblers for a particular CPU have different +dnl comment styles. The intermediate ".s" files will end up with no +dnl comments, just code. +dnl +dnl Using "C" is not intended to cause offence to anyone who doesn't like +dnl FORTRAN; but if that happens it's an unexpected bonus. + +define(C, ` +dnl') + + +dnl Various possible defines passed from the Makefile that are to be tested +dnl with ifdef() rather than be expanded. + +m4_not_for_expansion(`PIC') + +dnl aors_n +m4_not_for_expansion(`OPERATION_add_n') +m4_not_for_expansion(`OPERATION_sub_n') + +dnl aorsmul_n +m4_not_for_expansion(`OPERATION_addmul_1') +m4_not_for_expansion(`OPERATION_submul_1') + +dnl logops_n +m4_not_for_expansion(`OPERATION_and_n') +m4_not_for_expansion(`OPERATION_andn_n') +m4_not_for_expansion(`OPERATION_nand_n') +m4_not_for_expansion(`OPERATION_ior_n') +m4_not_for_expansion(`OPERATION_iorn_n') +m4_not_for_expansion(`OPERATION_nior_n') +m4_not_for_expansion(`OPERATION_xor_n') +m4_not_for_expansion(`OPERATION_xnor_n') + +dnl popham +m4_not_for_expansion(`OPERATION_popcount') +m4_not_for_expansion(`OPERATION_hamdist') + + +dnl Usage: m4_config_gmp_mparam(`symbol') +dnl +dnl Check that `symbol' is defined. If it isn't, issue an error and +dnl terminate immediately. The error message explains that the symbol +dnl should be in config.m4, copied from gmp-mparam.h. +dnl +dnl Processing is terminated immediately since missing something like +dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error +dnl messages. + +define(m4_config_gmp_mparam, +m4_assert_numargs(1) +`ifdef(`$1',, +`m4_error(`$1 is not defined. + "configure" should have extracted this from gmp-mparam.h and put it + in config.m4, but somehow this has failed. +')m4exit(1)')') + + +dnl Usage: defreg(name,reg) +dnl +dnl Give a name to a $ style register. 
For example, +dnl +dnl defreg(foo,$12) +dnl +dnl defreg() inserts an extra pair of quotes after the $ so that it's not +dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4 +dnl strips those quotes when foo is expanded. +dnl +dnl deflit() is used to make the new definition, so it will expand +dnl literally even if followed by parentheses ie. foo(99) will become +dnl $12(99). (But there's nowhere that would be used is there?) +dnl +dnl When making further definitions from existing defreg() macros, remember +dnl to use defreg() again to protect the $ in the new definitions too. For +dnl example, +dnl +dnl defreg(a0,$4) +dnl defreg(a1,$5) +dnl ... +dnl +dnl defreg(PARAM_DST,a0) +dnl +dnl This is only because a0 is expanding at the time the PARAM_DST +dnl definition is made, leaving a literal $4 that must be re-quoted. On +dnl the other hand in something like the following ra is only expanded when +dnl ret is used and its $`'31 protection will have its desired effect at +dnl that time. +dnl +dnl defreg(ra,$31) +dnl ... +dnl define(ret,`j ra') +dnl +dnl Note that only $n forms are meant to be used here, and something like +dnl 128($30) doesn't get protected and will come out wrong. + +define(defreg, +m4_assert_numargs(2) +`deflit(`$1', +substr(`$2',0,1)``''substr(`$2',1))') + + +dnl Usage: m4_instruction_wrapper(num) +dnl +dnl Put this, unquoted, on a line on its own, at the start of a macro +dnl that's a wrapper around an assembler instruction. It adds code to give +dnl a descriptive error message if the macro is invoked without arguments. +dnl +dnl For example, suppose jmp needs to be wrapped, +dnl +dnl define(jmp, +dnl m4_instruction_wrapper() +dnl m4_assert_numargs(1) +dnl `.byte 0x42 +dnl .long $1 +dnl nop') +dnl +dnl The point of m4_instruction_wrapper is to get a better error message +dnl than m4_assert_numargs would give if jmp is accidentally used as plain +dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no +dnl argument also provokes the error message. +dnl +dnl m4_instruction_wrapper should only be used with wrapped instructions +dnl that take arguments, since obviously something meant to be used as +dnl plain "ret", say, doesn't want to give an error when used that way. + +define(m4_instruction_wrapper, +m4_assert_numargs(0) +``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl +m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl +$`#',m4_doublequote($`'1))`dnl'') + +dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1) +define(m4_instruction_wrapper_internal, +`ifelse(eval($3<=1 && m4_length(`$4')==0),1, +`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details +')')') + + +dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES +dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES +dnl +dnl When code supports a variable amount of loop unrolling, the convention +dnl is to define UNROLL_COUNT to the number of limbs processed per loop. +dnl When testing code this can be varied to see how much the loop overhead +dnl is costing. For example, +dnl +dnl deflit(UNROLL_COUNT, 32) +dnl +dnl If the forloop() generating the unrolled loop has a pattern processing +dnl more than one limb, the convention is to express this with CHUNK_COUNT. +dnl For example, +dnl +dnl deflit(CHUNK_COUNT, 2) +dnl +dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT +dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow +dnl the new definition automatically. 
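+dnl
+dnl  (With the example UNROLL_COUNT of 32 above, these come out as
+dnl  UNROLL_LOG2 = 5, UNROLL_MASK = 31 and UNROLL_BYTES =
+dnl  32*BYTES_PER_MP_LIMB.)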
+dnl +dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as +dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes +dnl processed in each unrolled loop. +dnl +dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It +dnl exists only so the BYTES definitions here can be common to all CPUs. +dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be +dnl used because the code is only for a particular CPU, it doesn't need to +dnl be general. +dnl +dnl Note that none of these macros do anything except give conventional +dnl names to commonly used things. You still have to write your own +dnl expressions for a forloop() and the resulting address displacements. +dnl Something like the following would be typical for 4 bytes per limb. +dnl +dnl forloop(`i',0,UNROLL_COUNT-1,` +dnl deflit(`disp',eval(i*4)) +dnl ... +dnl ') +dnl +dnl Or when using CHUNK_COUNT, +dnl +dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,` +dnl deflit(`disp0',eval(i*CHUNK_COUNT*4)) +dnl deflit(`disp1',eval(disp0+4)) +dnl ... +dnl ') +dnl +dnl Clearly `i' can be run starting from 1, or from high to low or whatever +dnl best suits. + +deflit(UNROLL_LOG2, +m4_assert_defined(`UNROLL_COUNT') +`m4_log2(UNROLL_COUNT)') + +deflit(UNROLL_MASK, +m4_assert_defined(`UNROLL_COUNT') +`eval(UNROLL_COUNT-1)') + +deflit(UNROLL_BYTES, +m4_assert_defined(`UNROLL_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)') + +deflit(CHUNK_LOG2, +m4_assert_defined(`CHUNK_COUNT') +`m4_log2(CHUNK_COUNT)') + +deflit(CHUNK_MASK, +m4_assert_defined(`CHUNK_COUNT') +`eval(CHUNK_COUNT-1)') + +deflit(CHUNK_BYTES, +m4_assert_defined(`CHUNK_COUNT') +m4_assert_defined(`BYTES_PER_MP_LIMB') +`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)') + + +dnl Usage: MPN(name) +dnl +dnl Add MPN_PREFIX to a name. +dnl MPN_PREFIX defaults to "__gmpn_" if not defined. + +ifdef(`MPN_PREFIX',, +`define(`MPN_PREFIX',`__gmpn_')') + +define(MPN, +m4_assert_numargs(1) +`MPN_PREFIX`'$1') + + +dnl Usage: mpn_add_n, etc +dnl +dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each +dnl function that might be implemented in assembler is here. 
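+dnl
+dnl  For instance, with the default MPN_PREFIX above, mpn_add_n defined
+dnl  below expands to MPN(`add_n'), i.e. __gmpn_add_n, matching the
+dnl  corresponding #define in gmp.h.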
+ +define(define_mpn, +m4_assert_numargs(1) +`define(`mpn_$1',`MPN(`$1')')') + +define_mpn(add) +define_mpn(add_1) +define_mpn(add_n) +define_mpn(add_nc) +define_mpn(addmul_1) +define_mpn(addmul_1c) +define_mpn(addsub_n) +define_mpn(addsub_nc) +define_mpn(and_n) +define_mpn(andn_n) +define_mpn(bdivmod) +define_mpn(cmp) +define_mpn(com_n) +define_mpn(copyd) +define_mpn(copyi) +define_mpn(divexact_by3c) +define_mpn(divrem) +define_mpn(divrem_1) +define_mpn(divrem_1c) +define_mpn(divrem_2) +define_mpn(divrem_classic) +define_mpn(divrem_newton) +define_mpn(dump) +define_mpn(gcd) +define_mpn(gcd_1) +define_mpn(gcdext) +define_mpn(get_str) +define_mpn(hamdist) +define_mpn(invert_limb) +define_mpn(ior_n) +define_mpn(iorn_n) +define_mpn(kara_mul_n) +define_mpn(kara_sqr_n) +define_mpn(lshift) +define_mpn(lshiftc) +define_mpn(mod_1) +define_mpn(mod_1c) +define_mpn(mul) +define_mpn(mul_1) +define_mpn(mul_1c) +define_mpn(mul_basecase) +define_mpn(mul_n) +define_mpn(perfect_square_p) +define_mpn(popcount) +define_mpn(preinv_mod_1) +define_mpn(nand_n) +define_mpn(nior_n) +define_mpn(random) +define_mpn(random2) +define_mpn(rshift) +define_mpn(rshiftc) +define_mpn(scan0) +define_mpn(scan1) +define_mpn(set_str) +define_mpn(sqr_basecase) +define_mpn(sub_n) +define_mpn(sqrtrem) +define_mpn(sub) +define_mpn(sub_1) +define_mpn(sub_n) +define_mpn(sub_nc) +define_mpn(submul_1) +define_mpn(submul_1c) +define_mpn(toom3_mul_n) +define_mpn(toom3_sqr_n) +define_mpn(umul_ppmm) +define_mpn(udiv_qrnnd) +define_mpn(xnor_n) +define_mpn(xor_n) + +define(`ASM_START', + `') + +define(`PROLOGUE', + ` + TEXT + ALIGN(4) + GLOBL GSYM_PREFIX`$1' + TYPE(GSYM_PREFIX`$1',`function') +GSYM_PREFIX`$1':') + +define(`EPILOGUE', + ` + SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')') + +dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L +dnl will expand as the L macro, an infinite recursion. +define(`L',`defn(`LSYM_PREFIX')$1') + +define(`INT32', + ` + ALIGN(4) +$1: + W32 $2 + ') + +define(`INT64', + ` + ALIGN(8) +$1: + W32 $2 + W32 $3 + ') + + +dnl Usage: ALIGN(bytes) +dnl +dnl Emit a ".align" directive. The alignment is specified in bytes, and +dnl will normally need to be a power of 2. The actual ".align" generated +dnl is either bytes or logarithmic according to what ./configure detects. +dnl +dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should +dnl be appended (this is for x86). + +define(ALIGN, +m4_assert_numargs(1) +m4_assert_defined(`ALIGN_LOGARITHMIC') +`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl +ifelse(ALIGN_FILL_0x90,yes,`, 0x90')') + + +dnl Usage: MULFUNC_PROLOGUE(function function...) +dnl +dnl A dummy macro which is grepped for by ./configure to know what +dnl functions a multi-function file is providing. Use this if there aren't +dnl explicit PROLOGUE()s for each possible function. +dnl +dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function +dnl names separated by spaces. + +define(`MULFUNC_PROLOGUE', +m4_assert_numargs(1) +`') + + +divert`'dnl diff --git a/rts/gmp/mpn/clipper/add_n.s b/rts/gmp/mpn/clipper/add_n.s new file mode 100644 index 0000000000..538a1caed0 --- /dev/null +++ b/rts/gmp/mpn/clipper/add_n.s @@ -0,0 +1,48 @@ +; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_add_n +___gmpn_add_n: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry-save register + +.Loop: loadw (r1),r4 + loadw (r2),r5 + addwc r6,r6 ; restore carry from r6 + addwc r5,r4 + storw r4,(r0) + subwc r6,r6 ; save carry in r6 + addq $4,r0 + addq $4,r1 + addq $4,r2 + subq $1,r3 + brne .Loop + + negw r6,r0 + loadw (sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/clipper/mul_1.s b/rts/gmp/mpn/clipper/mul_1.s new file mode 100644 index 0000000000..c0c756488c --- /dev/null +++ b/rts/gmp/mpn/clipper/mul_1.s @@ -0,0 +1,47 @@ +; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry limb + +.Loop: loadw (r1),r4 + mulwux r3,r4 + addw r6,r4 ; add old carry limb into low product limb + loadq $0,r6 + addwc r5,r6 ; propagate cy into high product limb + storw r4,(r0) + addq $4,r0 + addq $4,r1 + subq $1,r2 + brne .Loop + + movw r6,r0 + loadw 0(sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/clipper/sub_n.s b/rts/gmp/mpn/clipper/sub_n.s new file mode 100644 index 0000000000..44d8797289 --- /dev/null +++ b/rts/gmp/mpn/clipper/sub_n.s @@ -0,0 +1,48 @@ +; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. 
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +.text + .align 16 +.globl ___gmpn_sub_n +___gmpn_sub_n: + subq $8,sp + storw r6,(sp) + loadw 12(sp),r2 + loadw 16(sp),r3 + loadq $0,r6 ; clear carry-save register + +.Loop: loadw (r1),r4 + loadw (r2),r5 + addwc r6,r6 ; restore carry from r6 + subwc r5,r4 + storw r4,(r0) + subwc r6,r6 ; save carry in r6 + addq $4,r0 + addq $4,r1 + addq $4,r2 + subq $1,r3 + brne .Loop + + negw r6,r0 + loadw (sp),r6 + addq $8,sp + ret sp diff --git a/rts/gmp/mpn/cray/README b/rts/gmp/mpn/cray/README new file mode 100644 index 0000000000..8195c67e21 --- /dev/null +++ b/rts/gmp/mpn/cray/README @@ -0,0 +1,14 @@ +The (poorly optimized) code in this directory was originally written for a +j90 system, but finished on a c90. It should work on all Cray vector +computers. For the T3E and T3D systems, the `alpha' subdirectory at the +same level as the directory containing this file, is much better. + +* `+' seems to be faster than `|' when combining carries. + +* It is possible that the best multiply performance would be achived by + storing only 24 bits per element, and using lazy carry propagation. Before + calling i24mult, full carry propagation would be needed. + +* Supply tasking versions of the C loops. + + diff --git a/rts/gmp/mpn/cray/add_n.c b/rts/gmp/mpn/cray/add_n.c new file mode 100644 index 0000000000..1fdb394993 --- /dev/null +++ b/rts/gmp/mpn/cray/add_n.c @@ -0,0 +1,96 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + For Cray vector processors. + + Copyright (C) 1996, 2000 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_add_n (c, a, b, n) + mp_ptr c; + mp_srcptr a, b; + mp_size_t n; +{ + mp_size_t i; + mp_size_t nm1 = n - 1; + int more_carries = 0; + int carry_out; + + /* For small operands the non-vector code is faster. 
*/ + if (n < 16) + goto sequential; + + if (a == c || b == c) + { + TMP_DECL (marker); + TMP_MARK (marker); + if (c == a) + { + /* allocate temp space for a */ + mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (ax, a, n); + a = (mp_srcptr) ax; + } + if (c == b) + { + /* allocate temp space for b */ + mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (bx, b, n); + b = (mp_srcptr) bx; + } + carry_out = mpn_add_n (c, a, b, n); + TMP_FREE (marker); + return carry_out; + } + + carry_out = a[nm1] + b[nm1] < a[nm1]; + +#pragma _CRI ivdep /* Cray PVP systems */ + for (i = nm1; i > 0; i--) + { + int cy_in; + cy_in = a[i - 1] + b[i - 1] < a[i - 1]; + c[i] = a[i] + b[i] + cy_in; + more_carries += c[i] < cy_in; + } + c[0] = a[0] + b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai + t; + cy += ci < ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/rts/gmp/mpn/cray/addmul_1.c b/rts/gmp/mpn/cray/addmul_1.c new file mode 100644 index 0000000000..031b4e8e8d --- /dev/null +++ b/rts/gmp/mpn/cray/addmul_1.c @@ -0,0 +1,46 @@ +/* mpn_addmul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1, tp; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_add_n (tp, rp, p0, n); + rp[0] = tp[0]; + cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/cray/gmp-mparam.h b/rts/gmp/mpn/cray/gmp-mparam.h new file mode 100644 index 0000000000..14f7b8e05b --- /dev/null +++ b/rts/gmp/mpn/cray/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 64 +#define BITS_PER_SHORTINT 32 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/cray/mul_1.c b/rts/gmp/mpn/cray/mul_1.c new file mode 100644 index 0000000000..0c8750b4ac --- /dev/null +++ b/rts/gmp/mpn/cray/mul_1.c @@ -0,0 +1,44 @@ +/* mpn_mul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + rp[0] = p0[0]; + cy_limb = mpn_add_n (rp + 1, p0 + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/cray/mulww.f b/rts/gmp/mpn/cray/mulww.f new file mode 100644 index 0000000000..99507c1e44 --- /dev/null +++ b/rts/gmp/mpn/cray/mulww.f @@ -0,0 +1,54 @@ +c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +c Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +c This file is part of the GNU MP Library. + +c The GNU MP Library is free software; you can redistribute it and/or +c modify it under the terms of the GNU Lesser General Public License as +c published by the Free Software Foundation; either version 2.1 of the +c License, or (at your option) any later version. + +c The GNU MP Library is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +c Lesser General Public License for more details. + +c You should have received a copy of the GNU Lesser General Public +c License along with the GNU MP Library; see the file COPYING.LIB. If +c not, write to the Free Software Foundation, Inc., 59 Temple Place - +c Suite 330, Boston, MA 02111-1307, USA. 
+ +c p1[] = hi(a[]*s); the upper limbs of each product +c p0[] = low(a[]*s); the corresponding lower limbs +c n is number of limbs in the vectors + + subroutine gmpn_mulww(p1,p0,a,n,s) + integer*8 p1(0:*),p0(0:*),a(0:*),s + integer n + + integer*8 a0,a1,a2,s0,s1,s2,c + integer*8 ai,t0,t1,t2,t3,t4 + + s0 = shiftl(and(s,4194303),24) + s1 = shiftl(and(shiftr(s,22),4194303),24) + s2 = shiftl(and(shiftr(s,44),4194303),24) + + do i = 0,n-1 + ai = a(i) + a0 = shiftl(and(ai,4194303),24) + a1 = shiftl(and(shiftr(ai,22),4194303),24) + a2 = shiftl(and(shiftr(ai,44),4194303),24) + + t0 = i24mult(a0,s0) + t1 = i24mult(a0,s1)+i24mult(a1,s0) + t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0) + t3 = i24mult(a1,s2)+i24mult(a2,s1) + t4 = i24mult(a2,s2) + + p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0 + c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+ + $ shiftl(and(t2,1048575),22),42) + p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c + end do + end diff --git a/rts/gmp/mpn/cray/mulww.s b/rts/gmp/mpn/cray/mulww.s new file mode 100644 index 0000000000..890cdcf94d --- /dev/null +++ b/rts/gmp/mpn/cray/mulww.s @@ -0,0 +1,245 @@ +* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP. + +* Copyright (C) 1996, 2000 Free Software Foundation, Inc. +* This file is generated from mulww.f in this same directory. + +* This file is part of the GNU MP Library. + +* The GNU MP Library is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public License as +* published by the Free Software Foundation; either version 2.1 of the +* License, or (at your option) any later version. + +* The GNU MP Library is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. + +* You should have received a copy of the GNU Lesser General Public +* License along with the GNU MP Library; see the file COPYING.LIB. If +* not, write to the Free Software Foundation, Inc., 59 Temple Place - +* Suite 330, Boston, MA 02111-1307, USA. 
+ + IDENT GMPN_MULWW +********************************************** +* Assemble with Cal Version 2.0 * +* * +* Generated by CFT77 6.0.4.19 * +* on 06/27/00 at 04:34:13 * +* * +********************************************** +* ALLOW UNDERSCORES IN IDENTIFIERS + EDIT OFF + FORMAT NEW +@DATA SECTION DATA,CM +@DATA = W.* + CON O'0000000000040000000000 + CON O'0435152404713723252514 ;GMPN_MUL 1 + CON O'0535270000000000000000 ;WW 1 + CON O'0000000000000001200012 ;trbk tbl 1 + VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1 + CON O'0014003000000000001416 ;trbk tbl 1 + CON O'0000000000000000000011 ;trbk tbl 1 + CON O'0000000000000000000215 ;trbk tbl 1 + BSSZ 1 ;trbk tbl 1 +@CODE SECTION CODE +@CODE = P.* +L3 = P.* ; 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;stk top 1 + B01 A6 ;arg base 1 + A7 P.L4 ;ofrn rtn 1 + B00 A7 ;return 1 + A6 @DATA ; 1 + J $STKOFEN ;$STKOFEN 1 +GMPN_MULWW = P.* ; 1 + A0 @DATA+3 ;(trbk) 1 + B77 A0 ;(trbk) 1 + A1 13 ;num Bs 1 + A0 B66 ;stk top 1 + A2 B66 ;stk tmp 1 + A4 B67 ;stk limt 1 + 0,A0 B77,A1 ; 1 + A7 782 ;stk size 1 + A3 A2+A7 ; 1 + A0 A4-A3 ; 1 + JAM L3 ;overflow 1 + A0 A6 ;arg base 1 + A5 6 ;num Darg 1 + B03,A5 0,A0 ;load DAs 1 + A0 A1+A2 ; 1 + A5 1 ;num Ts 1 + 0,A0 T00,A5 ; 1 + B02 A2 ;new base 1 + B66 A3 ;new top 1 + B01 A6 ;arg base 1 +L4 = P.* ;ofrn rtn 1 + A7 B07 ;regs 14 + S7 0,A7 ; 14 + A6 B10 ;regs 9 + S6 0,A6 ; 9 + S5 1 ; 14 + S4 <22 ; 9 + S7 S7-S5 ; 14 + S5 #S7 ; 14 + T00 S6 ;regs 10 + S6 S6>22 ; 10 + S7 T00 ;regs 11 + S7 S7>44 ; 11 + S3 T00 ;regs 9 + S3 S3&S4 ; 9 + S6 S6&S4 ; 10 + S7 S7&S4 ; 11 + S3 S3<24 ; 9 + S6 S6<24 ; 10 + S7 S7<24 ; 11 + S0 S5 ;regs 14 + S4 S5 ;regs 14 + S1 S6 ;regs 14 + S2 S3 ;regs 14 + S3 S7 ;regs 14 + JSP L5 ; 14 +L6 = P.* ; 14 + S7 -S4 ; 14 + A2 S7 ;regs 14 + VL A2 ;regs 14 + A3 B06 ;s_bt_sp 14 + A5 B05 ;s_bt_sp 14 + A4 B04 ;s_bt_sp 14 + A1 VL ; 14 + A2 S4 ;regs 14 +L7 = P.* ; 14 + A0 A3 ;regs 15 + VL A1 ;regs 15 + V7 ,A0,1 ; 15 + B11 A5 ;s_bt_sp 15 + A7 22 ; 17 + B12 A4 ;s_bt_sp 17 + V6 V7>A7 ; 17 + B13 A3 ;s_bt_sp 17 + S7 <22 ; 17 + A3 B02 ;s_bt_sp 17 + V5 S7&V6 ; 17 + A6 24 ; 17 + V4 V5<A6 ; 17 + V3 S1*FV4 ; 22 + V2 S7&V7 ; 16 + V1 V2<A6 ; 16 + V0 S3*FV1 ; 22 + V6 V0+V3 ; 22 + A5 44 ; 18 + V5 V7>A5 ; 18 + V2 S1*FV1 ; 21 + V3 S7&V5 ; 18 + A0 14 ; 34 + B77 A0 ;regs 34 + A4 B77 ;regs 34 + A0 A4+A3 ; 34 + ,A0,1 V2 ;v_ld_str 34 + V0 V3<A6 ; 18 + V7 S2*FV1 ; 20 + A4 142 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V7 ;v_ld_str 34 + V5 V7>A7 ; 28 + V2 S2*FV0 ; 22 + V3 V6+V2 ; 22 + S7 <20 ; 28 + V1 S7&V3 ; 28 + A4 270 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + A4 14 ; 34 + A0 A4+A3 ; 34 + V7 ,A0,1 ;v_ld_str 34 + V6 V1<A7 ; 28 + V2 S2*FV4 ; 21 + V0 V7+V2 ; 21 + S7 <42 ; 28 + V1 S7&V0 ; 28 + A4 398 ; 34 + A0 A4+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + V7 S3*FV4 ; 23 + V2 V5+V1 ; 28 + V0 V3<A5 ; 26 + A5 526 ; 34 + A0 A5+A3 ; 34 + ,A0,1 V0 ;v_ld_str 34 + A5 270 ; 34 + A0 A5+A3 ; 34 + V4 ,A0,1 ;v_ld_str 34 + V5 V2+V6 ; 28 + A5 20 ; 32 + V1 V3>A5 ; 32 + V0 S1*FV4 ; 23 + A5 654 ; 34 + A0 A5+A3 ; 34 + ,A0,1 V1 ;v_ld_str 34 + V6 V7+V0 ; 23 + A5 2 ; 32 + V2 V6<A5 ; 32 + V3 S3*FV4 ; 24 + A5 142 ; 34 + A0 A5+A3 ; 34 + V1 ,A0,1 ;v_ld_str 34 + A5 526 ; 34 + A0 A5+A3 ; 34 + V7 ,A0,1 ;v_ld_str 34 + V0 V1+V7 ; 26 + V6 V3<A6 ; 32 + V4 V6+V2 ; 32 + A6 42 ; 28 + V7 V5>A6 ; 28 + A5 654 ; 34 + CPW ;cmr_vrsp 34 + A0 A5+A3 ; 34 + V1 ,A0,1 ;v_ld_str 34 + A5 398 ; 34 + A0 A5+A3 ; 34 + V3 ,A0,1 ;v_ld_str 34 + V6 V4+V1 ; 32 + V2 V3>A6 ; 32 + V5 V6+V2 ; 32 + A6 B12 ;s_bt_sp 32 + V4 V3<A7 
; 26 + A7 B13 ;regs 34 + A3 A7+A1 ; 34 + A7 B11 ;regs 34 + A5 A7+A1 ; 34 + A4 A6+A1 ; 34 + A7 A2+A1 ; 34 + A0 A2+A1 ; 34 + A2 128 ; 34 + B13 A0 ;s_bt_sp 34 + V1 V0+V4 ; 26 + A0 B11 ;regs 31 + ,A0,1 V1 ; 31 + V6 V5+V7 ; 33 + A0 A6 ;regs 33 + ,A0,1 V6 ; 33 + A0 B13 ;regs 34 + A1 A2 ;regs 34 + A2 A7 ;regs 34 + JAN L7 ; 34 +L8 = P.* ; 34 +L5 = P.* ; 34 + S1 0 ; 35 + A0 B02 ; 35 + A2 B02 ; 35 + A1 13 ;num Bs 35 + B66 A0 ; 35 + B77,A1 0,A0 ; 35 + A0 A2+A1 ; 35 + A1 1 ;num Ts 35 + T00,A1 0,A0 ; 35 + J B00 ; 35 + EXT $STKOFEN:p + ENTRY GMPN_MULWW + END diff --git a/rts/gmp/mpn/cray/sub_n.c b/rts/gmp/mpn/cray/sub_n.c new file mode 100644 index 0000000000..902e07a727 --- /dev/null +++ b/rts/gmp/mpn/cray/sub_n.c @@ -0,0 +1,97 @@ +/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + For Cray vector processors. + + Copyright (C) 1996, 2000 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_sub_n (c, a, b, n) + mp_ptr c; + mp_srcptr a, b; + mp_size_t n; +{ + mp_size_t i; + mp_size_t nm1 = n - 1; + int more_carries = 0; + int carry_out; + + /* For small operands the non-vector code is faster. */ + if (n < 16) + goto sequential; + + if (a == c || b == c) + { + TMP_DECL (marker); + TMP_MARK (marker); + if (c == a) + { + /* allocate temp space for a */ + mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (ax, a, n); + a = (mp_srcptr) ax; + } + if (c == b) + { + /* allocate temp space for b */ + mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + MPN_COPY (bx, b, n); + b = (mp_srcptr) bx; + } + carry_out = mpn_sub_n (c, a, b, n); + TMP_FREE (marker); + return carry_out; + } + + carry_out = a[nm1] < b[nm1]; + +#pragma _CRI ivdep /* Cray PVP systems */ + for (i = nm1; i > 0; i--) + { + int cy_in; mp_limb_t t; + cy_in = a[i - 1] < b[i - 1]; + t = a[i] - b[i]; + more_carries += t < cy_in; + c[i] = t - cy_in; + } + c[0] = a[0] - b[0]; + + if (more_carries) + { + /* This won't vectorize, but we should come here rarely. */ + int cy; + sequential: + cy = 0; + for (i = 0; i < n; i++) + { + mp_limb_t ai, ci, t; + ai = a[i]; + t = b[i] + cy; + cy = t < cy; + ci = ai - t; + cy += ci > ai; + c[i] = ci; + } + carry_out = cy; + } + + return carry_out; +} diff --git a/rts/gmp/mpn/cray/submul_1.c b/rts/gmp/mpn/cray/submul_1.c new file mode 100644 index 0000000000..4d2fb13c62 --- /dev/null +++ b/rts/gmp/mpn/cray/submul_1.c @@ -0,0 +1,46 @@ +/* mpn_submul_1 for Cray PVP. + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb) +{ + mp_ptr p0, p1, tp; + mp_limb_t cy_limb; + TMP_DECL (marker); + TMP_MARK (marker); + + p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB); + + GMPN_MULWW (p1, p0, up, &n, &limb); + cy_limb = mpn_sub_n (tp, rp, p0, n); + rp[0] = tp[0]; + cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1); + cy_limb += p1[n - 1]; + + TMP_FREE (marker); + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c new file mode 100644 index 0000000000..5fcb7e4835 --- /dev/null +++ b/rts/gmp/mpn/generic/add_n.c @@ -0,0 +1,62 @@ +/* mpn_add_n -- Add two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_add_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. 
*/ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to one addend */ + cy = (y < cy); /* get out carry from that addition */ + y = x + y; /* add other addend */ + cy = (y < x) + cy; /* get out carry from that add, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c new file mode 100644 index 0000000000..746ae31307 --- /dev/null +++ b/rts/gmp/mpn/generic/addmul_1.c @@ -0,0 +1,65 @@ +/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, add the S1_SIZE least significant limbs of the product to the + limb vector pointed to by RES_PTR. Return the most significant limb of + the product, adjusted for carry-out from the addition. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x + prod_low; + cy_limb += (prod_low < x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c new file mode 100644 index 0000000000..c9bab3ef60 --- /dev/null +++ b/rts/gmp/mpn/generic/addsub_n.c @@ -0,0 +1,167 @@ +/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. 
+ +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#ifndef L1_CACHE_SIZE +#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */ +#endif + +#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6) + + +/* mpn_addsub_n. + r1[] = s1[] + s2[] + r2[] = s1[] - s2[] + All operands have n limbs. + In-place operations allowed. */ +mp_limb_t +#if __STDC__ +mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n) +#else +mpn_addsub_n (r1p, r2p, s1p, s2p, n) + mp_ptr r1p, r2p; + mp_srcptr s1p, s2p; + mp_size_t n; +#endif +{ + mp_limb_t acyn, acyo; /* carry for add */ + mp_limb_t scyn, scyo; /* carry for subtract */ + mp_size_t off; /* offset in operands */ + mp_size_t this_n; /* size of current chunk */ + + /* We alternatingly add and subtract in chunks that fit into the (L1) + cache. Since the chunks are several hundred limbs, the function call + overhead is insignificant, but we get much better locality. */ + + /* We have three variant of the inner loop, the proper loop is chosen + depending on whether r1 or r2 are the same operand as s1 or s2. */ + + if (r1p != s1p && r1p != s2p) + { + /* r1 is not identical to either input operand. We can therefore write + to r1 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + } + } + else if (r2p != s1p && r2p != s2p) + { + /* r2 is not identical to either input operand. We can therefore write + to r2 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif + } + } + else + { + /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa) + Need temporary storage. 
*/ + mp_limb_t tp[PART_SIZE]; + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n + acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + MPN_COPY (r1p + off, tp, this_n); + } + } + + return 2 * acyo + scyo; +} + +#ifdef MAIN +#include <stdlib.h> +#include <stdio.h> +#include "timing.h" + +long cputime (); + +int +main (int argc, char **argv) +{ + mp_ptr r1p, r2p, s1p, s2p; + double t; + mp_size_t n; + + n = strtol (argv[1], 0, 0); + + r1p = malloc (n * BYTES_PER_MP_LIMB); + r2p = malloc (n * BYTES_PER_MP_LIMB); + s1p = malloc (n * BYTES_PER_MP_LIMB); + s2p = malloc (n * BYTES_PER_MP_LIMB); + TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); + printf (" separate add and sub: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n)); + printf ("combined addsub separate variables: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r1 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r2 overlap: %.3f\n", t); + TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n)); + printf (" combined addsub in-place: %.3f\n", t); + + return 0; +} +#endif diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c new file mode 100644 index 0000000000..c4bcb414e6 --- /dev/null +++ b/rts/gmp/mpn/generic/bdivmod.c @@ -0,0 +1,120 @@ +/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d). + + Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and + returns the high d%BITS_PER_MP_LIMB bits of Q as the result. + + Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the + low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows + the limb vectors at qp to overwrite the low limbs at up, provided qp <= up. + + Preconditions: + 1. V is odd. + 2. usize * BITS_PER_MP_LIMB >= d. + 3. If Q and U overlap, qp <= up. 
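+
+   Sketch of the idea, as implemented below: for d == BITS_PER_MP_LIMB the
+   single quotient limb is q = up[0] * v_inv mod 2^BITS_PER_MP_LIMB, with
+   v_inv the inverse of vp[0] obtained from modlimb_invert; for larger d
+   the main loop repeats this limb by limb, subtracting q*V from U with
+   mpn_submul_1 as it goes.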
+ + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + References: + T. Jebelean, An algorithm for exact division, Journal of Symbolic + Computation, v. 15, 1993, pp. 169-180. + + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize, unsigned long int d) +#else +mpn_bdivmod (qp, up, usize, vp, vsize, d) + mp_ptr qp; + mp_ptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; + unsigned long int d; +#endif +{ + mp_limb_t v_inv; + + /* 1/V mod 2^BITS_PER_MP_LIMB. */ + modlimb_invert (v_inv, vp[0]); + + /* Fast code for two cases previously used by the accel part of mpn_gcd. + (Could probably remove this now it's inlined there.) */ + if (usize == 2 && vsize == 2 && + (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB)) + { + mp_limb_t hi, lo; + mp_limb_t q = up[0] * v_inv; + umul_ppmm (hi, lo, q, vp[0]); + up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q; + if (d == 2*BITS_PER_MP_LIMB) + q = up[1] * v_inv, up[1] = 0, qp[1] = q; + return 0; + } + + /* Main loop. */ + while (d >= BITS_PER_MP_LIMB) + { + mp_limb_t q = up[0] * v_inv; + mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + d -= BITS_PER_MP_LIMB; + up += 1, usize -= 1; + *qp++ = q; + } + + if (d) + { + mp_limb_t b; + mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1); + if (q <= 1) + { + if (q == 0) + return 0; + else + b = mpn_sub_n (up, up, vp, MIN (usize, vsize)); + } + else + b = mpn_submul_1 (up, vp, MIN (usize, vsize), q); + + if (usize > vsize) + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + return q; + } + + return 0; +} diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c new file mode 100644 index 0000000000..d234b22af5 --- /dev/null +++ b/rts/gmp/mpn/generic/bz_divrem_n.c @@ -0,0 +1,153 @@ +/* mpn_bz_divrem_n and auxilliary routines. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 2000 Free Software Foundation, Inc. +Contributed by Paul Zimmermann. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +/* +[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler, + Technical report MPI-I-98-1-022, october 1998. + http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz +*/ + +static mp_limb_t mpn_bz_div_3_halves_by_2 + _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)); + + +/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than + div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way, + i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */ +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +#if 0 +static +unused_mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +{ + /* This might be useful: */ + if (qxn != 0) + { + mp_limb_t c; + mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_COPY (tp + qxn - nn, np, nn); + MPN_ZERO (tp, qxn); + c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn); + /* Maybe copy proper part of tp to np? Documentation is unclear about + the returned np value when qxn != 0 */ + return c; + } +} +#endif + + +/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n) + by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n). + Returns most significant limb of the quotient, which is 0 or 1. + Requires that the most significant bit of the divisor is set. */ + +mp_limb_t +#if __STDC__ +mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_divrem_n (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_limb_t qhl, cc; + + if (n % 2 != 0) + { + qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1); + cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]); + cc = mpn_sub_1 (np + n, np + n, 1, cc); + if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]); + while (cc) + { + qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1); + cc -= mpn_add_n (np + 1, np + 1, dp, n); + } + qhl += mpn_add_1 (qp + 1, qp + 1, n - 1, + mpn_sb_divrem_mn (qp, np, n + 1, dp, n)); + } + else + { + mp_size_t n2 = n/2; + qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2); + qhl += mpn_add_1 (qp + n2, qp + n2, n2, + mpn_bz_div_3_halves_by_2 (qp, np, dp, n2)); + } + return qhl; +} + + +/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n), + the remainder in (np, 2n) */ + +static mp_limb_t +#if __STDC__ +mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n) +#else +mpn_bz_div_3_halves_by_2 (qp, np, dp, n) + mp_ptr qp; + mp_ptr np; + mp_srcptr dp; + mp_size_t n; +#endif +{ + mp_size_t twon = n + n; + mp_limb_t qhl, cc; + mp_ptr tmp; + TMP_DECL (marker); + + TMP_MARK (marker); + if (n < BZ_THRESHOLD) + qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n); + else + qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n); + tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB); + mpn_mul_n (tmp, qp, dp, n); + cc = mpn_sub_n (np, np, tmp, twon); + TMP_FREE (marker); + if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n); + while (cc) + { + qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1); + cc -= mpn_add_n (np, np, dp, twon); + } + return qhl; +} diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c new file mode 100644 index 0000000000..8e9792f54e --- /dev/null +++ b/rts/gmp/mpn/generic/cmp.c @@ -0,0 +1,56 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE. + There are no restrictions on the relative sizes of + the two arguments. + Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */ + +int +#if __STDC__ +mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size) +#else +mpn_cmp (op1_ptr, op2_ptr, size) + mp_srcptr op1_ptr; + mp_srcptr op2_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + mp_limb_t op1_word, op2_word; + + for (i = size - 1; i >= 0; i--) + { + op1_word = op1_ptr[i]; + op2_word = op2_ptr[i]; + if (op1_word != op2_word) + goto diff; + } + return 0; + diff: + /* This can *not* be simplified to + op2_word - op2_word + since that expression might give signed overflow. */ + return (op1_word > op2_word) ? 1 : -1; +} diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c new file mode 100644 index 0000000000..a2fb552bfa --- /dev/null +++ b/rts/gmp/mpn/generic/diveby3.c @@ -0,0 +1,77 @@ +/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */ + +/* +Copyright (C) 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#include "gmp.h" +#include "gmp-impl.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + + +/* The "c += ..."s are adding the high limb of 3*l to c. That high limb + will be 0, 1 or 2. Doing two separate "+="s seems to turn out better + code on gcc (as of 2.95.2 at least). + + When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a + limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3 + gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of + only 0 or 1 respectively. Hence the carry out of each stage and for the + return value is always only 0, 1 or 2. 
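+
+   As a quick sanity check of INVERSE_3: 3 * 0xAAAAAAAB = 0x200000001, which
+   is 1 mod 2^32, so for an exact multiple of 3 such as 21 the product
+   21 * 0xAAAAAAAB is 7 mod 2^32: the quotient appears directly, with no
+   division instruction needed.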
*/ + +mp_limb_t +#if __STDC__ +mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c) +#else +mpn_divexact_by3c (dst, src, size, c) + mp_ptr dst; + mp_srcptr src; + mp_size_t size; + mp_limb_t c; +#endif +{ + mp_size_t i; + + ASSERT (size >= 1); + + i = 0; + do + { + mp_limb_t l, s; + + s = src[i]; + l = s - c; + c = (l > s); + + l *= INVERSE_3; + dst[i] = l; + + c += (l > MP_LIMB_T_MAX/3); + c += (l > (MP_LIMB_T_MAX/3)*2); + } + while (++i < size); + + return c; +} diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c new file mode 100644 index 0000000000..30673e76d9 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem.c @@ -0,0 +1,101 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. This is now just a middle layer for calling the new + internal mpn_tdiv_qr. + +Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +#if __STDC__ +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +#else +mpn_divrem (qp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + if (dn == 1) + { + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; + + TMP_FREE (marker); + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr rp, q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL (marker); + + TMP_MARK (marker); + if (qxn != 0) + { + mp_ptr n2p; + n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn); + MPN_COPY (np, rp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB); + rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn); + MPN_COPY (np, rp, dn); /* overwrite np area with remainder */ + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE (marker); + return qhl; + } +} diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c new file mode 100644 index 0000000000..e93f241c9d --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_1.c @@ -0,0 +1,248 @@ +/* 
mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. + +Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + + +/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb) + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + + QUOT_PTR and DIVIDEND_PTR might point to the same limb. */ + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +static mp_limb_t +#if __STDC__ +__gmpn_divmod_1_internal (mp_ptr quot_ptr, + mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb) + mp_ptr quot_ptr; + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* ??? Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... 
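+
+	     (Presumably: when the bits shifted out of the top limb are
+	     zero and the first shifted dividend limb is already smaller
+	     than the shifted divisor, the top quotient limb is known to
+	     be zero, so the first call to udiv_qrnnd_preinv could be
+	     skipped.)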
*/ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (quot_ptr[i], r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. */ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + { + quot_ptr[i] = 0; + i--; + } + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb); + } + return r; + } +} + + + +mp_limb_t +#if __STDC__ +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, + mp_limb_t d) +#else +mpn_divrem_1 (qp, qxn, np, nn, d) + mp_ptr qp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_limb_t d; +#endif +{ + mp_limb_t rlimb; + mp_size_t i; + + /* Develop integer part of quotient. */ + rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d); + + /* Develop fraction part of quotient. This is not as fast as it should; + the preinvert stuff from __gmpn_divmod_1_internal ought to be used here + too. */ + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, d); + if (normalization_steps != 0) + { + d <<= normalization_steps; + rlimb <<= normalization_steps; + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb >> normalization_steps; + } + else + /* fall out */ + ; + } + + for (i = qxn - 1; i >= 0; i--) + udiv_qrnnd (qp[i], rlimb, rlimb, 0, d); + + return rlimb; +} diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c new file mode 100644 index 0000000000..0bc31ae2e7 --- /dev/null +++ b/rts/gmp/mpn/generic/divrem_2.c @@ -0,0 +1,151 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. 
+ + +Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/2) and write + the NSIZE-2 least significant quotient limbs at QP + and the 2 long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= 2. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + 2 >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */ + +mp_limb_t +#if __STDC__ +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp) +#else +mpn_divrem_2 (qp, qxn, np, nsize, dp) + mp_ptr qp; + mp_size_t qxn; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t n1, n0, n2; + mp_limb_t d1, d0; + mp_limb_t d1inv; + int have_preinv; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if (n1 >= d1 && (n1 > d1 || n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + /* If multiplication is much faster than division, preinvert the most + significant divisor limb before entering the loop. */ + if (UDIV_TIME > 2 * UMUL_TIME + 6) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME) + { + invert_limb (d1inv, d1); + have_preinv = 1; + } + } + + for (i = qxn + nsize - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t r; + + if (i >= qxn) + np--; + else + np[0] = 0; + + if (n1 == d1) + { + /* Q should be either 111..111 or 111..110. Need special treatment + of this rare case as normal division would give overflow. */ + q = ~(mp_limb_t) 0; + + r = n0 + d1; + if (r < d1) /* Carry in the addition? */ + { + add_ssaaaa (n1, n0, r - d0, np[0], 0, d0); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0); + n0 = -d0; + } + else + { + if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv) + udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv); + else + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + + q_test: + if (n1 > r || (n1 == r && n0 > n2)) + { + /* The estimated Q was too large. */ + q--; + + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if (r >= d1) /* If not carry, test Q again. 
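+	     With the divisor normalized (precondition 1 above), the
+	     estimate q exceeds the true quotient limb by at most 2, so
+	     this correction executes at most twice.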
*/ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c new file mode 100644 index 0000000000..66f375c74b --- /dev/null +++ b/rts/gmp/mpn/generic/dump.c @@ -0,0 +1,76 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + +void +#if __STDC__ +mpn_dump (mp_srcptr ptr, mp_size_t size) +#else +mpn_dump (ptr, size) + mp_srcptr ptr; + mp_size_t size; +#endif +{ + MPN_NORMALIZE (ptr, size); + + if (size == 0) + printf ("0\n"); + else + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0) + { + printf ("%lX", + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%lX", (unsigned long) ptr[size]); + } + else + printf ("%lX", ptr[size]); + + while (size) + { + size--; + if (BYTES_PER_MP_LIMB > sizeof (long)) + { + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2)); + printf ("%0*lX", (int) (BYTES_PER_MP_LIMB), + (unsigned long) ptr[size]); + } + else + printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]); + } + printf ("\n"); + } +} diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c new file mode 100644 index 0000000000..059e219a06 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd.c @@ -0,0 +1,414 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +/* Integer greatest common divisor of two unsigned integers, using + the accelerated algorithm (see reference below). + + mp_size_t mpn_gcd (up, usize, vp, vsize). + + Preconditions [U = (up, usize) and V = (vp, vsize)]: + + 1. V is odd. + 2. numbits(U) >= numbits(V). + + Both U and V are destroyed by the operation. The result is left at vp, + and its size is returned. + + Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu) + + Funding for this work has been partially provided by Conselho Nacional + de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant + 301314194-2, and was done while I was a visiting reseacher in the Instituto + de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS). + + Refer to + K. Weber, The accelerated integer GCD algorithm, ACM Transactions on + Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated + algorithm is used, otherwise the binary algorithm is used. This may be + adjusted for different architectures. */ +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif + +/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated + algorithm reduces using the bmod operation. Otherwise, the k-ary reduction + is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */ +enum + { + BMOD_THRESHOLD = BITS_PER_MP_LIMB/2 + }; + + +/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2. + Both U and V must be odd. */ +static __gmp_inline mp_size_t +#if __STDC__ +gcd_2 (mp_ptr vp, mp_srcptr up) +#else +gcd_2 (vp, up) + mp_ptr vp; + mp_srcptr up; +#endif +{ + mp_limb_t u0, u1, v0, v1; + mp_size_t vsize; + + u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1]; + + while (u1 != v1 && u0 != v0) + { + unsigned long int r; + if (u1 > v1) + { + u1 -= v1 + (u0 < v0), u0 -= v0; + count_trailing_zeros (r, u0); + u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r; + u1 >>= r; + } + else /* u1 < v1. */ + { + v1 -= u1 + (v0 < u0), v0 -= u0; + count_trailing_zeros (r, v0); + v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r; + v1 >>= r; + } + } + + vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0); + + /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */ + if (u1 == v1 && u0 == v0) + return vsize; + + v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0; + vp[0] = mpn_gcd_1 (vp, vsize, v0); + + return 1; +} + +/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists + 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB). + In the reference article, D was computed along with N, but it is better to + compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating + the result as a twos' complement signed integer. + + Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference + article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use + 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double + precision. If N2 > N1 initially, the first iteration of the while loop + will swap them. In all other situations, N1 >= N2 is maintained. */ + +static +#if ! defined (__i386__) +__gmp_inline /* don't inline this for the x86 */ +#endif +mp_limb_t +#if __STDC__ +find_a (mp_srcptr cp) +#else +find_a (cp) + mp_srcptr cp; +#endif +{ + unsigned long int leading_zero_bits = 0; + + mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. 
*/ + mp_limb_t n1_h = cp[1]; + + mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */ + mp_limb_t n2_h = ~n1_h; + + /* Main loop. */ + while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */ + { + /* N1 <-- N1 % N2. */ + if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0) + { + unsigned long int i; + count_leading_zeros (i, n2_h); + i -= leading_zero_bits, leading_zero_bits += i; + n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i; + do + { + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1; + i -= 1; + } + while (i); + } + if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l)) + n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l; + + MP_LIMB_T_SWAP (n1_h, n2_h); + MP_LIMB_T_SWAP (n1_l, n2_l); + } + + return n2_l; +} + +mp_size_t +#if __STDC__ +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, usize, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t usize; + mp_ptr vp; + mp_size_t vsize; +#endif +{ + mp_ptr orig_vp = vp; + mp_size_t orig_vsize = vsize; + int binary_gcd_ctr; /* Number of times binary gcd will execute. */ + TMP_DECL (marker); + + TMP_MARK (marker); + + /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD. + Two EXTRA limbs for U and V are required for kary reduction. */ + if (vsize >= GCD_ACCEL_THRESHOLD) + { + unsigned long int vbitsize, d; + mp_ptr orig_up = up; + mp_size_t orig_usize = usize; + mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB); + + MPN_COPY (anchor_up, orig_up, usize); + up = anchor_up; + + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + /* Use bmod reduction to quickly discover whether V divides U. */ + up[usize++] = 0; /* Insert leading zero. */ + mpn_bdivmod (up, up, usize, vp, vsize, d); + + /* Now skip U/V mod 2^d and any low zero limbs. */ + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + while (usize != 0 && up[0] == 0) + up++, usize--; + + if (usize == 0) /* GCD == ORIG_V. */ + goto done; + + vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB); + MPN_COPY (vp, orig_vp, vsize); + + do /* Main loop. */ + { + /* mpn_com_n can't be used here because anchor_up and up may + partially overlap */ + if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */ + { + mp_size_t i; + anchor_up[0] = -up[0]; + for (i = 1; i < usize; i++) + anchor_up[i] = ~up[i]; + up = anchor_up; + } + + MPN_NORMALIZE_NOT_ZERO (up, usize); + + if ((up[0] & 1) == 0) /* Result even; remove twos. */ + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (anchor_up, up, usize, r); + usize -= (anchor_up[usize-1] == 0); + } + else if (anchor_up != up) + MPN_COPY_INCR (anchor_up, up, usize); + + MPN_PTR_SWAP (anchor_up,usize, vp,vsize); + up = anchor_up; + + if (vsize <= 2) /* Kary can't handle < 2 limbs and */ + break; /* isn't efficient for == 2 limbs. */ + + d = vbitsize; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize + 1; + + if (d > BMOD_THRESHOLD) /* Bmod reduction. */ + { + up[usize++] = 0; + mpn_bdivmod (up, up, usize, vp, vsize, d); + d /= BITS_PER_MP_LIMB, up += d, usize -= d; + } + else /* Kary reduction. */ + { + mp_limb_t bp[2], cp[2]; + + /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). 
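+		   (This is a two-limb Hensel division: u_inv is the
+		   inverse of up[0] mod 2^BITS_PER_MP_LIMB, cp[0] =
+		   vp[0] * u_inv is the low limb of the quotient, and
+		   cp[1] corrects the high limb using the high half of
+		   cp[0] * up[0] and the cross product cp[0] * up[1].)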
*/ + { + mp_limb_t u_inv, hi, lo; + modlimb_invert (u_inv, up[0]); + cp[0] = vp[0] * u_inv; + umul_ppmm (hi, lo, cp[0], up[0]); + cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv; + } + + /* U <-- find_a (C) * U. */ + up[usize] = mpn_mul_1 (up, up, usize, find_a (cp)); + usize++; + + /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1). + bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and + bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2 + + Like V/U above, but simplified because only the low bit of + bp[1] is wanted. */ + { + mp_limb_t v_inv, hi, lo; + modlimb_invert (v_inv, vp[0]); + bp[0] = up[0] * v_inv; + umul_ppmm (hi, lo, bp[0], vp[0]); + bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1; + } + + up[usize++] = 0; + if (bp[1]) /* B < 0: U <-- U + (-B) * V. */ + { + mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]); + mpn_add_1 (up + vsize, up + vsize, usize - vsize, c); + } + else /* B >= 0: U <-- U - B * V. */ + { + mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]); + mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b); + } + + up += 2, usize -= 2; /* At least two low limbs are zero. */ + } + + /* Must remove low zero limbs before complementing. */ + while (usize != 0 && up[0] == 0) + up++, usize--; + } + while (usize); + + /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */ + up = orig_up, usize = orig_usize; + binary_gcd_ctr = 2; + } + else + binary_gcd_ctr = 1; + + /* Finish up with the binary algorithm. Executes once or twice. */ + for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize) + { + if (usize > 2) /* First make U close to V in size. */ + { + unsigned long int vbitsize, d; + count_leading_zeros (d, up[usize-1]); + d = usize * BITS_PER_MP_LIMB - d; + count_leading_zeros (vbitsize, vp[vsize-1]); + vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize; + d = d - vbitsize - 1; + if (d != -(unsigned long int)1 && d > 2) + { + mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */ + d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d; + } + } + + /* Start binary GCD. */ + do + { + mp_size_t zeros; + + /* Make sure U is odd. */ + MPN_NORMALIZE (up, usize); + while (up[0] == 0) + up += 1, usize -= 1; + if ((up[0] & 1) == 0) + { + unsigned int r; + count_trailing_zeros (r, up[0]); + mpn_rshift (up, up, usize, r); + usize -= (up[usize-1] == 0); + } + + /* Keep usize >= vsize. */ + if (usize < vsize) + MPN_PTR_SWAP (up, usize, vp, vsize); + + if (usize <= 2) /* Double precision. */ + { + if (vsize == 1) + vp[0] = mpn_gcd_1 (up, usize, vp[0]); + else + vsize = gcd_2 (vp, up); + break; /* Binary GCD done. */ + } + + /* Count number of low zero limbs of U - V. */ + for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; ) + continue; + + /* If U < V, swap U and V; in any case, subtract V from U. */ + if (zeros == vsize) /* Subtract done. */ + up += zeros, usize -= zeros; + else if (usize == vsize) + { + mp_size_t size = vsize; + do + size--; + while (up[size] == vp[size]); + if (up[size] < vp[size]) /* usize == vsize. */ + MP_PTR_SWAP (up, vp); + up += zeros, usize = size + 1 - zeros; + mpn_sub_n (up, up, vp + zeros, usize); + } + else + { + mp_size_t size = vsize - zeros; + up += zeros, usize -= zeros; + if (mpn_sub_n (up, up, vp + zeros, size)) + { + while (up[size] == 0) /* Propagate borrow. */ + up[size++] = -(mp_limb_t)1; + up[size] -= 1; + } + } + } + while (usize); /* End binary GCD. 
*/ + } + +done: + if (vp != gp) + MPN_COPY (gp, vp, vsize); + TMP_FREE (marker); + return vsize; +} diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c new file mode 100644 index 0000000000..1832636636 --- /dev/null +++ b/rts/gmp/mpn/generic/gcd_1.c @@ -0,0 +1,77 @@ +/* mpn_gcd_1 -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Does not work for U == 0 or V == 0. It would be tough to make it work for + V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */ + +mp_limb_t +#if __STDC__ +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +#else +mpn_gcd_1 (up, size, vlimb) + mp_srcptr up; + mp_size_t size; + mp_limb_t vlimb; +#endif +{ + mp_limb_t ulimb; + unsigned long int u_low_zero_bits, v_low_zero_bits; + + if (size > 1) + { + ulimb = mpn_mod_1 (up, size, vlimb); + if (ulimb == 0) + return vlimb; + } + else + ulimb = up[0]; + + /* Need to eliminate low zero bits. */ + count_trailing_zeros (u_low_zero_bits, ulimb); + ulimb >>= u_low_zero_bits; + + count_trailing_zeros (v_low_zero_bits, vlimb); + vlimb >>= v_low_zero_bits; + + while (ulimb != vlimb) + { + if (ulimb > vlimb) + { + ulimb -= vlimb; + do + ulimb >>= 1; + while ((ulimb & 1) == 0); + } + else /* vlimb > ulimb. */ + { + vlimb -= ulimb; + do + vlimb >>= 1; + while ((vlimb & 1) == 0); + } + } + + return ulimb << MIN (u_low_zero_bits, v_low_zero_bits); +} diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c new file mode 100644 index 0000000000..fe22d779a6 --- /dev/null +++ b/rts/gmp/mpn/generic/gcdext.c @@ -0,0 +1,700 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 17 +#endif + +#ifndef EXTEND +#define EXTEND 1 +#endif + +#if STAT +int arr[BITS_PER_MP_LIMB]; +#endif + + +/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) + + Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the + greatest common divisor at GP (unless it is 0), and the first cofactor at + SP. Write the size of the cofactor through the pointer SSIZE. Return the + size of the value at GP. Note that SP might be a negative number; this is + denoted by storing the negative of the size through SSIZE. + + {UP,USIZE} and {VP,VSIZE} are both clobbered. + + The space allocation for all four areas needs to be USIZE+1. + + Preconditions: 1) U >= V. + 2) V > 0. */ + +/* We use Lehmer's algorithm. The idea is to extract the most significant + bits of the operands, and compute the continued fraction for them. We then + apply the gathered cofactors to the full operands. + + Idea 1: After we have performed a full division, don't shift operands back, + but instead account for the extra factors-of-2 thus introduced. + Idea 2: Simple generalization to use divide-and-conquer would give us an + algorithm that runs faster than O(n^2). + Idea 3: The input numbers need less space as the computation progresses, + while the s0 and s1 variables need more space. To save memory, we + could make them share space, and have the latter variables grow + into the former. + Idea 4: We should not do double-limb arithmetic from the start. Instead, + do things in single-limb arithmetic until the quotients differ, + and then switch to double-limb arithmetic. */ + + +/* Division optimized for small quotients. If the quotient is more than one limb, + store 1 in *qh and return 0. 
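+   The division is done by plain binary shift-and-subtract; in Lehmer's
+   loop below the quotients are almost always very small, so this is
+   cheaper than a general two-limb division.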
*/ +static mp_limb_t +#if __STDC__ +div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0) +#else +div2 (qh, n1, n0, d1, d0) + mp_limb_t *qh; + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d1; + mp_limb_t d0; +#endif +{ + if (d1 == 0) + { + *qh = 1; + return 0; + } + + if ((mp_limb_signed_t) n1 < 0) + { + mp_limb_t q; + int cnt; + for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + cnt--; + } + + *qh = 0; + return q; + } + else + { + mp_limb_t q; + int cnt; + for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) + { + d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1)); + d0 = d0 << 1; + } + + q = 0; + while (cnt) + { + d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1); + d1 = d1 >> 1; + q <<= 1; + if (n1 > d1 || (n1 == d1 && n0 >= d0)) + { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + q |= 1; + } + cnt--; + } + + *qh = 0; + return q; + } +} + +mp_size_t +#if EXTEND +#if __STDC__ +mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize) + mp_ptr gp; + mp_ptr s0p; + mp_size_t *s0size; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#else +#if __STDC__ +mpn_gcd (mp_ptr gp, + mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) +#else +mpn_gcd (gp, up, size, vp, vsize) + mp_ptr gp; + mp_ptr up; + mp_size_t size; + mp_ptr vp; + mp_size_t vsize; +#endif +#endif +{ + mp_limb_t A, B, C, D; + int cnt; + mp_ptr tp, wp; +#if RECORD + mp_limb_t max = 0; +#endif +#if EXTEND + mp_ptr s1p; + mp_ptr orig_s0p = s0p; + mp_size_t ssize; + int sign = 1; +#endif + int use_double_flag; + TMP_DECL (mark); + + TMP_MARK (mark); + + use_double_flag = (size >= GCDEXT_THRESHOLD); + + tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); +#if EXTEND + s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); + + MPN_ZERO (s0p, size); + MPN_ZERO (s1p, size); + + s0p[0] = 1; + s1p[0] = 0; + ssize = 1; +#endif + + if (size > vsize) + { + /* Normalize V (and shift up U the same amount). */ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + /* This is really what it boils down to in this case... */ + s0p[0] = 0; + s1p[0] = 1; + sign = -sign; +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + + for (;;) + { + mp_limb_t asign; + /* Figure out exact size of V. */ + vsize = size; + MPN_NORMALIZE (vp, vsize); + if (vsize <= 1) + break; + + if (use_double_flag) + { + mp_limb_t uh, vh, ul, vl; + /* Let UH,UL be the most significant limbs of U, and let VH,VL be + the corresponding bits from V. 
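+	     Working with two leading limbs of each operand (rather than
+	     one, as in the single-limb branch below) gives more accurate
+	     partial quotients, so the inner loop can run longer before
+	     the cofactors A, B, C, D risk overflowing a limb.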
*/ + uh = up[size - 1]; + vh = vp[size - 1]; + ul = up[size - 2]; + vl = vp[size - 2]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt)); + vl <<= cnt; + ul <<= cnt; + if (size >= 3) + { + ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt)); + } + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t T; + mp_limb_t qh, q1, q2; + mp_limb_t nh, nl, dh, dl; + mp_limb_t t1, t0; + mp_limb_t Th, Tl; + + sub_ddmmss (dh, dl, vh, vl, 0, C); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + add_ssaaaa (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + sub_ddmmss (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + + add_ssaaaa (dh, dl, vh, vl, 0, C); + sub_ddmmss (nh, nl, uh, ul, 0, A); + q1 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + sub_ddmmss (dh, dl, vh, vl, 0, D); + if ((dl | dh) == 0) + break; + add_ssaaaa (nh, nl, uh, ul, 0, B); + q2 = div2 (&qh, nh, nl, dh, dl); + if (qh != 0) + break; /* could handle this */ + + if (q1 != q2) + break; + + asign = ~asign; + + T = A + q1 * C; + A = C; + C = T; + T = B + q1 * D; + B = D; + D = T; + umul_ppmm (t1, t0, q1, vl); + t1 += q1 * vh; + sub_ddmmss (Th, Tl, uh, ul, t1, t0); + uh = vh, ul = vl; + vh = Th, vl = Tl; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + else /* Same, but using single-limb calculations. */ + { + mp_limb_t uh, vh; + /* Make UH be the most significant limb of U, and make VH be + corresponding bits from V. */ + uh = up[size - 1]; + vh = vp[size - 1]; + count_leading_zeros (cnt, uh); + if (cnt != 0) + { + uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt)); + } + + A = 1; + B = 0; + C = 0; + D = 1; + + asign = 0; + for (;;) + { + mp_limb_t q, T; + if (vh - C == 0 || vh + D == 0) + break; + + q = (uh + A) / (vh - C); + if (q != (uh - B) / (vh + D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + + if (vh - D == 0) + break; + + q = (uh - A) / (vh + C); + if (q != (uh + B) / (vh - D)) + break; + + asign = ~asign; + + T = A + q * C; + A = C; + C = T; + T = B + q * D; + B = D; + D = T; + T = uh - q * vh; + uh = vh; + vh = T; + } +#if EXTEND + if (asign) + sign = -sign; +#endif + } + +#if RECORD + max = MAX (A, max); max = MAX (B, max); + max = MAX (C, max); max = MAX (D, max); +#endif + + if (B == 0) + { + mp_limb_t qh; + mp_size_t i; + /* This is quite rare. I.e., optimize something else! */ + + /* Normalize V (and shift up U the same amount). 
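+	     Shifting both operands by the same amount leaves the
+	     quotient unchanged while giving the divisor a set high bit;
+	     the shift is undone again right after the division.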
*/ + count_leading_zeros (cnt, vp[vsize - 1]); + if (cnt != 0) + { + mp_limb_t cy; + mpn_lshift (vp, vp, vsize, cnt); + cy = mpn_lshift (up, up, size, cnt); + up[size] = cy; + size += cy != 0; + } + + qh = mpn_divmod (up + vsize, up, size, vp, vsize); +#if EXTEND + MPN_COPY (tp, s0p, ssize); + { + mp_size_t qsize; + + qsize = size - vsize; /* size of stored quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]); + tp[qsize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + abort (); + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]); + tp[ssize + i] = cy; + } + if (qh != 0) + { + mp_limb_t cy; + cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize); + if (cy != 0) + { + tp[qsize + ssize] = cy; + s1p[qsize + ssize] = 0; + ssize++; + } + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + } + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#endif + size = vsize; + if (cnt != 0) + { + mpn_rshift (up, up, size, cnt); + mpn_rshift (vp, vp, size, cnt); + } + MP_PTR_SWAP (up, vp); + } + else + { +#if EXTEND + mp_size_t tsize, wsize; +#endif + /* T = U*A + V*B + W = U*C + V*D + U = T + V = W */ + +#if STAT + { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); + arr[BITS_PER_MP_LIMB - cnt]++; } +#endif + if (A == 0) + { + /* B == 1 and C == 1 (D is arbitrary) */ + mp_limb_t cy; + MPN_COPY (tp, vp, size); + MPN_COPY (wp, up, size); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + MPN_COPY (tp, s1p, ssize); + tsize = ssize; + tp[ssize] = 0; /* must zero since wp might spill below */ + MPN_COPY (wp, s0p, ssize); + cy = mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + if (asign) + { + mp_limb_t cy; + mpn_mul_1 (tp, vp, size, B); + mpn_submul_1 (tp, up, size, A); + mpn_mul_1 (wp, up, size, C); + mpn_submul_1 (wp, vp, size, D); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s1p, ssize, B); + cy += mpn_addmul_1 (tp, s0p, ssize, A); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s0p, ssize, C); + cy += mpn_addmul_1 (wp, s1p, ssize, D); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + else + { + mp_limb_t cy; + mpn_mul_1 (tp, up, size, A); + mpn_submul_1 (tp, vp, size, B); + mpn_mul_1 (wp, vp, size, D); + mpn_submul_1 (wp, up, size, C); + MP_PTR_SWAP (tp, up); + MP_PTR_SWAP (wp, vp); +#if EXTEND + cy = mpn_mul_1 (tp, s0p, ssize, A); + cy += mpn_addmul_1 (tp, s1p, ssize, B); + tp[ssize] = cy; + tsize = ssize + (cy != 0); + cy = mpn_mul_1 (wp, s1p, ssize, D); + cy += mpn_addmul_1 (wp, s0p, ssize, C); + wp[ssize] = cy; + wsize = ssize + (cy != 0); + MP_PTR_SWAP (tp, s0p); + MP_PTR_SWAP (wp, s1p); + ssize = MAX (wsize, tsize); +#endif + } + } + + size -= up[size - 1] == 0; + } + } + +#if RECORD + printf ("max: %lx\n", max); +#endif + +#if STAT + {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);} +#endif + + if (vsize == 0) + { + if (gp != up 
&& gp != 0) + MPN_COPY (gp, up, size); +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return size; + } + else + { + mp_limb_t vl, ul, t; +#if EXTEND + mp_size_t qsize, i; +#endif + vl = vp[0]; +#if EXTEND + t = mpn_divmod_1 (wp, up, size, vl); + + MPN_COPY (tp, s0p, ssize); + + qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ + if (ssize < qsize) + { + MPN_ZERO (tp + ssize, qsize - ssize); + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < ssize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); + tp[qsize + i] = cy; + } + } + else + { + MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ + for (i = 0; i < qsize; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); + tp[ssize + i] = cy; + } + } + ssize += qsize; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = mpn_mod_1 (up, size, vl); +#endif + ul = vl; + vl = t; + while (vl != 0) + { + mp_limb_t t; +#if EXTEND + mp_limb_t q; + q = ul / vl; + t = ul - q * vl; + + MPN_COPY (tp, s0p, ssize); + + MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ + + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp, s1p, ssize, q); + tp[ssize] = cy; + } + + ssize += 1; + ssize -= tp[ssize - 1] == 0; + + sign = -sign; + MP_PTR_SWAP (s0p, s1p); + MP_PTR_SWAP (s1p, tp); +#else + t = ul % vl; +#endif + ul = vl; + vl = t; + } + if (gp != 0) + gp[0] = ul; +#if EXTEND + MPN_NORMALIZE (s0p, ssize); + if (orig_s0p != s0p) + MPN_COPY (orig_s0p, s0p, ssize); + *s0size = sign >= 0 ? ssize : -ssize; +#endif + TMP_FREE (mark); + return 1; + } +} diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c new file mode 100644 index 0000000000..a713b61825 --- /dev/null +++ b/rts/gmp/mpn/generic/get_str.c @@ -0,0 +1,216 @@ +/* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR + to a printable string in STR in base BASE. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Convert the limb vector pointed to by MPTR and MSIZE long to a + char array, using base BASE for the result array. Store the + result in the character array STR. STR must point to an array with + space for the largest possible number represented by a MSIZE long + limb vector + 1 extra character. + + The result is NOT in Ascii, to convert it to printable format, add + '0' or 'A' depending on the base and range. + + Return the number of digits in the result string. + This may include some leading zeros. 
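+
+   For instance (assuming 32-bit limbs and base 10), big_base below is
+   10^9 and chars_per_limb is 9, so the general-base loop peels off nine
+   decimal digits of the result per division of the operand by big_base.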
+ + The limb vector pointed to by MPTR is clobbered. */ + +size_t +#if __STDC__ +mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize) +#else +mpn_get_str (str, base, mptr, msize) + unsigned char *str; + int base; + mp_ptr mptr; + mp_size_t msize; +#endif +{ + mp_limb_t big_base; +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + int normalization_steps; +#endif +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb_t big_base_inverted; +#endif + unsigned int dig_per_u; + mp_size_t out_len; + register unsigned char *s; + + big_base = __mp_bases[base].big_base; + + s = str; + + /* Special case zero, as the code below doesn't handle it. */ + if (msize == 0) + { + s[0] = 0; + return 1; + } + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Make conversion from most + significant side. */ + mp_limb_t n1, n0; + register int bits_per_digit = big_base; + register int x; + register int bit_pos; + register int i; + + n1 = mptr[msize - 1]; + count_leading_zeros (x, n1); + + /* BIT_POS should be R when input ends in least sign. nibble, + R + bits_per_digit * n when input ends in n:th least significant + nibble. */ + + { + int bits; + + bits = BITS_PER_MP_LIMB * msize - x; + x = bits % bits_per_digit; + if (x != 0) + bits += bits_per_digit - x; + bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB; + } + + /* Fast loop for bit output. */ + i = msize - 1; + for (;;) + { + bit_pos -= bits_per_digit; + while (bit_pos >= 0) + { + *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1); + bit_pos -= bits_per_digit; + } + i--; + if (i < 0) + break; + n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1); + n1 = mptr[i]; + bit_pos += BITS_PER_MP_LIMB; + *s++ = n0 | (n1 >> bit_pos); + } + + *s = 0; + + return s - str; + } + else + { + /* General case. The base is not a power of 2. Make conversion + from least significant end. */ + + /* If udiv_qrnnd only handles divisors with the most significant bit + set, prepare BIG_BASE for being a divisor by shifting it to the + left exactly enough to set the most significant bit. */ +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + count_leading_zeros (normalization_steps, big_base); + big_base <<= normalization_steps; +#if UDIV_TIME > 2 * UMUL_TIME + /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */ + big_base_inverted = __mp_bases[base].big_base_inverted; +#endif +#endif + + dig_per_u = __mp_bases[base].chars_per_limb; + out_len = ((size_t) msize * BITS_PER_MP_LIMB + * __mp_bases[base].chars_per_bit_exactly) + 1; + s += out_len; + + while (msize != 0) + { + int i; + mp_limb_t n0, n1; + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted BIG_BASE above, shift the dividend too, to get + the right quotient. We need to do this every loop, + since the intermediate quotients are OK, but the quotient from + one turn in the loop is going to be the dividend in the + next turn, and the dividend needs to be up-shifted. */ + if (normalization_steps != 0) + { + n0 = mpn_lshift (mptr, mptr, msize, normalization_steps); + + /* If the shifting gave a carry out limb, store it and + increase the length. */ + if (n0 != 0) + { + mptr[msize] = n0; + msize++; + } + } +#endif + + /* Divide the number at TP with BIG_BASE to get a quotient and a + remainder. The remainder is our new digit in base BIG_BASE. 
*/ + i = msize - 1; + n1 = mptr[i]; + + if (n1 >= big_base) + n1 = 0; + else + { + msize--; + i--; + } + + for (; i >= 0; i--) + { + n0 = mptr[i]; +#if UDIV_TIME > 2 * UMUL_TIME + udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted); +#else + udiv_qrnnd (mptr[i], n1, n1, n0, big_base); +#endif + } + +#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME + /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests) + the remainder will be up-shifted here. Compensate. */ + n1 >>= normalization_steps; +#endif + + /* Convert N1 from BIG_BASE to a string of digits in BASE + using single precision operations. */ + for (i = dig_per_u - 1; i >= 0; i--) + { + *--s = n1 % base; + n1 /= base; + if (n1 == 0 && msize == 0) + break; + } + } + + while (s != str) + *--s = 0; + return out_len; + } +} diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h new file mode 100644 index 0000000000..14bcaece83 --- /dev/null +++ b/rts/gmp/mpn/generic/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c new file mode 100644 index 0000000000..35c10e8450 --- /dev/null +++ b/rts/gmp/mpn/generic/hamdist.c @@ -0,0 +1,94 @@ +/* mpn_hamdist -- + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. 
*/ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of a mp_limb_t. + You have to figure out how this works, I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go into some trouble to define these constants. + (For mp_limb_t being `long long'.) */ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size) +#else +mpn_hamdist (up, vp, size) + register mp_srcptr up; + register mp_srcptr vp; + register mp_size_t size; +#endif +{ + unsigned long int hamdist; + mp_size_t i; + + hamdist = 0; + for (i = 0; i < size; i++) + hamdist += popc_limb (up[i] ^ vp[i]); + + return hamdist; +} diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c new file mode 100644 index 0000000000..9487e58cf2 --- /dev/null +++ b/rts/gmp/mpn/generic/inlines.c @@ -0,0 +1,24 @@ +/* +Copyright (C) 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#define _FORCE_INLINES +#define _EXTERN_INLINE /* empty */ +#include "gmp.h" diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c new file mode 100644 index 0000000000..dd437f1ac1 --- /dev/null +++ b/rts/gmp/mpn/generic/jacbase.c @@ -0,0 +1,136 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +#if COUNT_TRAILING_ZEROS_TIME <= 7 +/* If count_trailing_zeros is fast, use it. + K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42 + are not. The default 15 in longlong.h is meant to mean not good here. */ + +#define PROCESS_TWOS_ANY \ + { \ + mp_limb_t twos; \ + count_trailing_zeros (twos, a); \ + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \ + a >>= twos; \ + } + +#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY + +#else +/* Use a loop instead. With "a" uniformly distributed there will usually be + only a few trailing zeros. + + Unfortunately the branch for the while loop here will be on a 50/50 + chance of a 1 or 0, which is bad for branch prediction. */ + +#define PROCESS_TWOS_EVEN \ + { \ + int two; \ + two = JACOBI_TWO_U_BIT1 (b); \ + do \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + while ((a & 1) == 0); \ + } + +#define PROCESS_TWOS_ANY \ + if ((a & 1) == 0) \ + PROCESS_TWOS_EVEN; + +#endif + + +/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but + with a restricted range of inputs accepted, namely b>1, b odd, and a<=b. + + The initial result_bit1 is taken as a parameter for the convenience of + mpz_kronecker_zi_ui() et al. The sign changes both here and in those + routines accumulate nicely in bit 1, see the JACOBI macros. + + The return value here is the normal +1, 0, or -1. Note that +1 and -1 + have bit 1 in the "BIT1" sense, which could be useful if the caller is + accumulating it into some extended calculation. + + Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be + possible, but a couple of tests suggest it's not a significant speedup, + and may even be a slowdown, so what's here is good enough for now. + + Future: The code doesn't demand a<=b actually, so maybe this could be + relaxed. All the places this is used currently call with a<=b though. */ + +int +#if __STDC__ +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1) +#else +mpn_jacobi_base (a, b, result_bit1) + mp_limb_t a; + mp_limb_t b; + int result_bit1; +#endif +{ + ASSERT (b & 1); /* b odd */ + ASSERT (b != 1); + ASSERT (a <= b); + + if (a == 0) + return 0; + + PROCESS_TWOS_ANY; + if (a == 1) + goto done; + + for (;;) + { + result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b); + MP_LIMB_T_SWAP (a, b); + + do + { + /* working on (a/b), a,b odd, a>=b */ + ASSERT (a & 1); + ASSERT (b & 1); + ASSERT (a >= b); + + if ((a -= b) == 0) + return 0; + + PROCESS_TWOS_EVEN; + if (a == 1) + goto done; + } + while (a >= b); + } + + done: + return JACOBI_BIT1_TO_PN (result_bit1); +} diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c new file mode 100644 index 0000000000..0b58389658 --- /dev/null +++ b/rts/gmp/mpn/generic/lshift.c @@ -0,0 +1,87 @@ +/* mpn_lshift -- Shift left low level. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left + and store the USIZE least significant digits of the result at WP. + Return the bits shifted out from the most significant digit. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be >= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_lshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_lshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from high end to low end, to allow specified input/output + overlapping. */ + for (i = usize - 1; i >= 0; i--) + wp[i] = up[i]; + } + return 0; + } +#endif + + wp += 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + i = usize - 1; + low_limb = up[i]; + retval = low_limb >> sh_2; + high_limb = low_limb; + while (--i >= 0) + { + low_limb = up[i]; + wp[i] = (high_limb << sh_1) | (low_limb >> sh_2); + high_limb = low_limb; + } + wp[i] = high_limb << sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c new file mode 100644 index 0000000000..168ec9df49 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1.c @@ -0,0 +1,175 @@ +/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + +Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb) +#else +mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; +#endif +{ + mp_size_t i; + mp_limb_t n1, n0, r; + int dummy; + + /* Botch: Should this be handled at all? Rely on callers? */ + if (dividend_size == 0) + return 0; + + /* If multiplication is much faster than division, and the + dividend is large, pre-invert the divisor, and use + only multiplications in the inner loop. */ + + /* This test should be read: + Does it ever help to use udiv_qrnnd_preinv? + && Does what we save compensate for the inversion overhead? */ + if (UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + mp_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + invert_limb (divisor_limb_inverted, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + udiv_qrnnd_preinv (dummy, r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else + { + mp_limb_t divisor_limb_inverted; + + invert_limb (divisor_limb_inverted, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else + { + if (UDIV_NEEDS_NORMALIZATION) + { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if (normalization_steps != 0) + { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); + + /* Possible optimization: + if (r == 0 + && divisor_limb > ((n1 << normalization_steps) + | (dividend_ptr[dividend_size - 2] >> ...))) + ...one division less... */ + + for (i = dividend_size - 2; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (dummy, r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + it, or because DIVISOR_LIMB is already normalized. 
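+
+      As an illustration (a sketch with made-up numbers): with
+      B = 2^BITS_PER_MP_LIMB the loop below computes
+
+        r = ((...(x[size-1]*B + x[size-2])*B + ...)*B + x[0]) mod d
+
+      one limb at a time, replacing r by (r*B + n0) mod d with a single
+      udiv_qrnnd per limb n0.  For example with B = 10 (decimal "limbs"),
+      {x,3} = {7,3,2} (value 237) and d = 5: r starts at 2, then
+      r = (2*10+3) mod 5 = 3, then r = (3*10+7) mod 5 = 2, which is
+      indeed 237 mod 5.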
*/ + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, n0, divisor_limb); + } + return r; + } +} diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c new file mode 100644 index 0000000000..62aaa94b92 --- /dev/null +++ b/rts/gmp/mpn/generic/mod_1_rs.c @@ -0,0 +1,111 @@ +/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift. + + THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE + INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. + IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP + RELEASE. */ + +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be + changed to 1 temporarily to test the code under that case too. */ +#if 0 +#undef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 1 +#endif + + +/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size + is unchanged, the shift is only for its effect on the remainder. + The shift doesn't even need to be considered until the last limb. + + This function has the normal size!=0 restriction, unlike the basic + mpn_mod_1. 
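+
+   For illustration (hypothetical values): with BITS_PER_MP_LIMB = 32,
+   size = 2, shift = 4 and ptr = {0x12345678, 0x9}, the operand is
+   0x912345678 >> 4 = 0x91234567, so the call returns 0x91234567 % divisor
+   while {ptr,size} itself is left untouched.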
*/ + +mp_limb_t +#if __STDC__ +mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift, + mp_limb_t divisor) +#else +mpn_mod_1_rshift (ptr, size, shift, divisor) + mp_srcptr ptr; + mp_size_t size; + unsigned shift; + mp_limb_t divisor; +#endif +{ + mp_limb_t quot, rem; + + ASSERT (shift >= 1); + ASSERT (shift < BITS_PER_MP_LIMB); + ASSERT (size >= 1); + + if (size == 1) + return (ptr[0] >> shift) % divisor; + +#if UDIV_NEEDS_NORMALIZATION + { + int norm; + int delta; + + count_leading_zeros (norm, divisor); + divisor <<= norm; + + delta = shift - norm; + if (delta == 0) + return mpn_mod_1 (ptr, size, divisor) >> norm; + + if (delta > 0) + { + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> delta, + (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta), + divisor); + return rem >> norm; + } + else + { + rem = mpn_mod_1 (ptr, size, divisor); + udiv_qrnnd (quot, rem, + rem >> (BITS_PER_MP_LIMB+delta), + rem << -delta, + divisor); + return rem >> norm; + } + } + +#else /* !UDIV_NEEDS_NORMALIZATION */ + + rem = mpn_mod_1 (ptr+1, size-1, divisor); + udiv_qrnnd (quot, rem, + rem >> shift, + (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift), + divisor); + return rem; + +#endif +} diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c new file mode 100644 index 0000000000..cecfa19ca1 --- /dev/null +++ b/rts/gmp/mpn/generic/mul.c @@ -0,0 +1,190 @@ +/* mpn_mul -- Multiply two natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. + + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. + + Argument constraints: + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. 
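+
+   A minimal calling sketch (the buffers and sizes are hypothetical; only
+   the mpn_mul interface itself is as defined below):
+
+     mp_limb_t u[4], v[2], prod[6];               (operands filled in elsewhere)
+     mp_limb_t msl = mpn_mul (prod, u, 4, v, 2);  (un = 4 >= vn = 2)
+
+   after which prod[0..5] holds the 6-limb product and msl == prod[5].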
*/ + +void +#if __STDC__ +mpn_sqr_n (mp_ptr prodp, + mp_srcptr up, mp_size_t un) +#else +mpn_sqr_n (prodp, up, un) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; +#endif +{ + if (un < KARATSUBA_SQR_THRESHOLD) + { /* plain schoolbook multiplication */ + if (un == 0) + return; + mpn_sqr_basecase (prodp, up, un); + } + else if (un < TOOM3_SQR_THRESHOLD) + { /* karatsuba multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_kara_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (un < FFT_SQR_THRESHOLD) +#else + else +#endif + { /* toom3 multiplication */ + mp_ptr tspace; + TMP_DECL (marker); + TMP_MARK (marker); + tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB); + mpn_toom3_sqr_n (prodp, up, un, tspace); + TMP_FREE (marker); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + /* schoenhage multiplication */ + mpn_mul_fft_full (prodp, up, un, up, un); + } +#endif +} + +mp_limb_t +#if __STDC__ +mpn_mul (mp_ptr prodp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +#else +mpn_mul (prodp, up, un, vp, vn) + mp_ptr prodp; + mp_srcptr up; + mp_size_t un; + mp_srcptr vp; + mp_size_t vn; +#endif +{ + mp_size_t l; + mp_limb_t c; + + if (up == vp && un == vn) + { + mpn_sqr_n (prodp, up, un); + return prodp[2 * un - 1]; + } + + if (vn < KARATSUBA_MUL_THRESHOLD) + { /* long multiplication */ + mpn_mul_basecase (prodp, up, un, vp, vn); + return prodp[un + vn - 1]; + } + + mpn_mul_n (prodp, up, vp, vn); + if (un != vn) + { mp_limb_t t; + mp_ptr ws; + TMP_DECL (marker); + TMP_MARK (marker); + + prodp += vn; + l = vn; + up += vn; + un -= vn; + + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + + ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn) + * BYTES_PER_MP_LIMB); + + t = 0; + while (vn >= KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_n (ws, up, vp, vn); + if (l <= 2*vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != 2*vn) + { + t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t); + l = 2*vn; + } + } + else + { + c = mpn_add_n (prodp, prodp, ws, 2*vn); + t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c); + } + prodp += vn; + l -= vn; + up += vn; + un -= vn; + if (un < vn) + { + /* Swap u's and v's. */ + MPN_SRCPTR_SWAP (up,un, vp,vn); + } + } + + if (vn) + { + mpn_mul_basecase (ws, up, un, vp, vn); + if (l <= un + vn) + { + t += mpn_add_n (prodp, prodp, ws, l); + if (l != un + vn) + t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t); + } + else + { + c = mpn_add_n (prodp, prodp, ws, un + vn); + t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c); + } + } + + TMP_FREE (marker); + } + return prodp[un + vn - 1]; +} diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c new file mode 100644 index 0000000000..1c36b5fb1f --- /dev/null +++ b/rts/gmp/mpn/generic/mul_1.c @@ -0,0 +1,59 @@ +/* mpn_mul_1 -- Multiply a limb vector with a single limb and + store the product in a second limb vector. + +Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + + /* The loop counter and index J goes from -S1_SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + res_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c new file mode 100644 index 0000000000..00c06aa5c4 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_basecase.c @@ -0,0 +1,87 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Handle simple cases with traditional multiplication. + + This is the most critical code of multiplication. All multiplies rely on + this, both small and huge. Small ones arrive here immediately, huge ones + arrive here as this is the base case for Karatsuba's recursive algorithm. */ + +void +#if __STDC__ +mpn_mul_basecase (mp_ptr prodp, + mp_srcptr up, mp_size_t usize, + mp_srcptr vp, mp_size_t vsize) +#else +mpn_mul_basecase (prodp, up, usize, vp, vsize) + mp_ptr prodp; + mp_srcptr up; + mp_size_t usize; + mp_srcptr vp; + mp_size_t vsize; +#endif +{ + /* We first multiply by the low order one or two limbs, as the result can + be stored, not added, to PROD. We also avoid a loop for zeroing this + way. 
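+
+     Schoolbook illustration (a sketch of what the generic path below
+     does): with B = 2^BITS_PER_MP_LIMB, usize = 2, vsize = 2,
+     u = u1*B + u0 and v = v1*B + v0, the code first stores u*v0 into
+     prod[0..2] with mpn_mul_1 (so no prior zeroing of PROD is needed)
+     and then adds u*v1, shifted up one limb, with mpn_addmul_1, giving
+     u*v = u*v0 + u*v1*B in prod[0..3].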
*/ +#if HAVE_NATIVE_mpn_mul_2 + if (vsize >= 2) + { + prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + else + { + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + return; + } +#else + prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vsize >= 2) + { + prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]); + prodp += 2, vp += 2, vsize -= 2; + } + if (vsize != 0) + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); +#else + /* For each iteration in the loop, multiply U with one limb from V, and + add the result to PROD. */ + while (vsize != 0) + { + prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]); + prodp += 1, vp += 1, vsize -= 1; + } +#endif +} diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c new file mode 100644 index 0000000000..00fd6d72de --- /dev/null +++ b/rts/gmp/mpn/generic/mul_fft.c @@ -0,0 +1,772 @@ +/* An implementation in GMP of Scho"nhage's fast multiplication algorithm + modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE + MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED + INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN + A FUTURE GNU MP RELEASE. + +Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +/* References: + + Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker + Strassen, Computing 7, p. 281-292, 1971. + + Asymptotically fast algorithms for the numerical multiplication + and division of polynomials with complex coefficients, by Arnold Scho"nhage, + Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. + + Tapes versus Pointers, a study in implementing fast algorithms, + by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986. + + See also http://www.loria.fr/~zimmerma/bignum + + + Future: + + K==2 isn't needed in the current uses of this code and the bits specific + for that could be dropped. + + It might be possible to avoid a small number of MPN_COPYs by using a + rotating temporary or two. + + Multiplications of unequal sized operands can be done with this code, but + it needs a tighter test for identifying squaring (same sizes as well as + same pointers). */ + + +#include <stdio.h> +#include "gmp.h" +#include "gmp-impl.h" + + +/* Change this to "#define TRACE(x) x" for some traces. 
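+   With tracing enabled, the TRACE calls further down print the chosen
+   parameters, e.g. the "mpn_mul_fft pl=... nl=... ml=... k=..." line at
+   the top of mpn_mul_fft and the per-level recursion sizes.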
*/ +#define TRACE(x) + + + +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = { + FFT_MUL_TABLE, + FFT_SQR_TABLE +}; + + +static void mpn_mul_fft_internal +_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l, + mp_limb_t *T, int rec)); + + +/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT. + sqr==0 if for a multiply, sqr==1 for a square */ +int +#if __STDC__ +mpn_fft_best_k (mp_size_t n, int sqr) +#else +mpn_fft_best_k (n, sqr) + mp_size_t n; + int sqr; +#endif +{ + mp_size_t t; + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4*mpn_fft_table[sqr][i-1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k. + FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */ + +mp_size_t +#if __STDC__ +mpn_fft_next_size (mp_size_t pl, int k) +#else +mpn_fft_next_size (pl, k) + mp_size_t pl; + int k; +#endif +{ + mp_size_t N, M; + int K; + + /* if (k==0) k = mpn_fft_best_k (pl, sqr); */ + N = pl*BITS_PER_MP_LIMB; + K = 1<<k; + if (N%K) N=(N/K+1)*K; + M = N/K; + if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K; + return (N/BITS_PER_MP_LIMB); +} + + +static void +#if __STDC__ +mpn_fft_initl(int **l, int k) +#else +mpn_fft_initl(l, k) + int **l; + int k; +#endif +{ + int i,j,K; + + l[0][0] = 0; + for (i=1,K=2;i<=k;i++,K*=2) { + for (j=0;j<K/2;j++) { + l[i][j] = 2*l[i-1][j]; + l[i][K/2+j] = 1+l[i][j]; + } + } +} + + +/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n) +#else +mpn_fft_neg_modF(ap, n) + mp_limb_t *ap; + mp_size_t n; +#endif +{ + mp_limb_t c; + + c = ap[n]+2; + mpn_com_n (ap, ap, n); + ap[n]=0; mpn_incr_u(ap, c); +} + + +/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_mul_2exp_modF(ap, e, n, tp) + mp_limb_t *ap; + int e; + mp_size_t n; + mp_limb_t *tp; +#endif +{ + int d, sh, i; mp_limb_t cc; + + d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */ + sh = d % BITS_PER_MP_LIMB; + if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */ + else MPN_COPY(tp, ap, n+1); + d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */ + if (d) { + /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */ + /* mpn_xor would be more efficient here */ + for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i]; + cc = 1-mpn_add_1(ap, ap, d, 1); + if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1); + else MPN_COPY(ap+d, tp, n-d); + if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n])) + ap[n]=mpn_add_1(ap, ap, n, cc); + else ap[n]=0; + } + else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) { + ap[n]=mpn_add_1(ap, ap, n, 1); + } + if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n); +} + + +/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n) +#else +mpn_fft_add_modF (ap, bp, n) + mp_limb_t *ap,*bp; + int n; +#endif +{ + mp_limb_t c; + + c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n); + if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1); + ap[n]=c; +} + + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp) +mp_limb_t **Ap,*tp; +mp_size_t K,omega,n,inc; +int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk = *ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where + N=n*BITS_PER_MP_LIMB + 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +#if __STDC__ +mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp) +#else +mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp) + mp_limb_t **Ap,**Bp,*tp; + mp_size_t K,omega,n,inc; + int **ll; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1); + if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1)) +#endif + Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1); +#ifdef ADDSUB + if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1) +#else + MPN_COPY(tp, Bp[0], n+1); + mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1); + if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1)) +#endif + Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1); + } + else { + int j, inc2=2*inc; + int *lk=*ll; + mp_limb_t *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp); + mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) { + MPN_COPY(tp, Ap[inc], n+1); + mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Ap[inc], Ap[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + MPN_COPY(tp, Bp[inc], n+1); + mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp); + mpn_fft_add_modF(Bp[inc], Bp[0], n); + mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp); + mpn_fft_add_modF(Bp[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */ +static void +#if __STDC__ +mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K) +#else +mpn_fft_mul_modF_K(ap, bp, n, K) + mp_limb_t **ap, **bp; + mp_size_t n; + int K; +#endif +{ + int i; + int sqr = (ap == 
bp); + TMP_DECL(marker); + + TMP_MARK(marker); + + if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2; + int **_fft_l; + mp_limb_t **Ap,**Bp,*A,*B,*T; + + k = mpn_fft_best_k (n, sqr); + K2 = 1<<k; + maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB; + M2 = n*BITS_PER_MP_LIMB/K2; + l = n/K2; + Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2/BITS_PER_MP_LIMB; + Mp2 = Nprime2/K2; + + Ap = TMP_ALLOC_MP_PTRS (K2); + Bp = TMP_ALLOC_MP_PTRS (K2); + A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1)); + T = TMP_ALLOC_LIMBS (nprime2+1); + B = A + K2*(nprime2+1); + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + + TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + + for (i=0;i<K;i++,ap++,bp++) + mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2, + l, Mp2, _fft_l, T, 1); + } + else { + mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n; + tp = TMP_ALLOC_LIMBS (n2); + tpn = tp+n; + TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n)); + for (i=0;i<K;i++) { + a = *ap++; b=*bp++; + if (sqr) + mpn_sqr_n(tp, a, n); + else + mpn_mul_n(tp, b, a, n); + if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0; + if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n]; + if (cc) { + cc = mpn_add_1(tp, tp, n2, cc); + ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc)); + } + a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1); + } + } + TMP_FREE(marker); +} + + +/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] + output: K*A[0] K*A[K-1] ... K*A[1] */ + +static void +#if __STDC__ +mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n, + mp_limb_t *tp) +#else +mpn_fft_fftinv(Ap,K,omega,n,tp) + mp_limb_t **Ap, *tp; + int K; + mp_size_t omega, n; +#endif +{ + if (K==2) { +#ifdef ADDSUB + if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1) +#else + MPN_COPY(tp, Ap[0], n+1); + mpn_add_n(Ap[0], Ap[0], Ap[1], n+1); + if (mpn_sub_n(Ap[1], tp, Ap[1], n+1)) +#endif + Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1); + } + else { + int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp; + TMP_DECL(marker); + + TMP_MARK(marker); + tmp = TMP_ALLOC_LIMBS (n+1); + mpn_fft_fftinv(Ap, K2, 2*omega, n, tp); + mpn_fft_fftinv(Bp, K2, 2*omega, n, tp); + /* A[j] <- A[j] + omega^j A[j+K/2] + A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ + for (j=0;j<K2;j++,Ap++,Bp++) { + MPN_COPY(tp, Bp[0], n+1); + mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp); + mpn_fft_add_modF(Bp[0], Ap[0], n); + mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp); + mpn_fft_add_modF(Ap[0], tp, n); + } + TMP_FREE(marker); + } +} + + +/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */ +static void +#if __STDC__ +mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp) +#else +mpn_fft_div_2exp_modF(ap,k,n,tp) + mp_limb_t *ap,*tp; + int k; + mp_size_t n; +#endif +{ + int i; + + i = 2*n*BITS_PER_MP_LIMB; + i = (i-k) % i; + mpn_fft_mul_2exp_modF(ap,i,n,tp); + /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */ + /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */ + if (ap[n]==1) { + for (i=0;i<n && ap[i]==0;i++); + if (i<n) { + ap[n]=0; + mpn_sub_1(ap, ap, n, 1); + } + } +} + + +/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */ +static void +#if __STDC__ +mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an) +#else +mpn_fft_norm_modF(rp, ap, n, an) + mp_limb_t *rp; + mp_limb_t *ap; + mp_size_t n; 
+ mp_size_t an; +#endif +{ + mp_size_t l; + + if (an>2*n) { + l = n; + rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an, + mpn_add_n(rp,ap,ap+2*n,an-2*n)); + } + else { + l = an-n; + MPN_COPY(rp, ap, n); + rp[n]=0; + } + if (mpn_sub_n(rp,rp,ap+n,l)) { + if (mpn_sub_1(rp+l,rp+l,n+1-l,1)) + rp[n]=mpn_add_1(rp,rp,n,1); + } +} + + +static void +#if __STDC__ +mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl, + int k, int K, + mp_limb_t **Ap, mp_limb_t **Bp, + mp_limb_t *A, mp_limb_t *B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **_fft_l, + mp_limb_t *T, int rec) +#else +mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec) + mp_limb_t *op; + mp_srcptr n, m; + mp_limb_t **Ap,**Bp,*A,*B,*T; + mp_size_t pl,nprime; + int **_fft_l; + int k,K,l,Mp,rec; +#endif +{ + int i, sqr, pla, lo, sh, j; + mp_limb_t *p; + + sqr = (n==m); + + TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n", + pl,k,K,nprime,l,Mp,rec,sqr)); + + /* decomposition of inputs into arrays Ap[i] and Bp[i] */ + if (rec) for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l); + /* set most significant bits of n and m (important in recursive calls) */ + if (i==K-1) Ap[i][l]=n[0]; + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + if (!sqr) { + MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l); + if (i==K-1) Bp[i][l]=m[0]; + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + } + + /* direct fft's */ + if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T); + else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T); + + /* term to term multiplications */ + mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K); + + /* inverse fft's */ + mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T); + + /* division of terms after inverse fft */ + for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T); + + /* addition of terms in result p */ + MPN_ZERO(T,nprime+1); + pla = l*(K-1)+nprime+1; /* number of required limbs for p */ + p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */ + MPN_ZERO(p, pla); + sqr=0; /* will accumulate the (signed) carry at p[pla] */ + for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) { + mp_ptr n = p+sh; + j = (K-i)%K; + if (mpn_add_n(n,n,Ap[j],nprime+1)) + sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1); + T[2*l]=i+1; /* T = (i+1)*2^(2*M) */ + if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */ + sqr -= mpn_sub_1(n,n,pla-sh,1); + sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1); + } + } + if (sqr==-1) { + if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1); + mpn_sub_1(p+pla-1,p+pla-1,1,1); + } + } + else if (sqr==1) { + if (pla>=2*pl) + while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr))); + else { + sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr); + ASSERT (sqr == 0); + } + } + else + ASSERT (sqr == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + mpn_fft_norm_modF(op,p,pl,pla); +} + + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB + n and m have respectively nl and ml limbs + op must have space for pl+1 limbs + One must have pl = mpn_fft_next_size(pl, k). 
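+
+   A rough usage sketch (hypothetical caller; the helpers are the ones
+   defined above in this file):
+
+     int k = mpn_fft_best_k (nl + ml, 0);            (0 = multiply, 1 = square)
+     mp_size_t pl = mpn_fft_next_size (nl + ml, k);
+     ... allocate op with pl+1 limbs ...
+     mpn_mul_fft (op, pl, n, nl, m, ml, k);
+
+   which is essentially what mpn_mul_fft_full below does before copying
+   out the low nl+ml limbs of the padded result.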
+*/ + +void +#if __STDC__ +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +#else +mpn_mul_fft (op, pl, n, nl, m, ml, k) + mp_ptr op; + mp_size_t pl; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; + int k; +#endif +{ + int K,maxLK,i,j; + mp_size_t N,Nprime,nprime,M,Mp,l; + mp_limb_t **Ap,**Bp,*A,*T,*B; + int **_fft_l; + int sqr = (n==m && nl==ml); + TMP_DECL(marker); + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", + pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl); + + TMP_MARK(marker); + N = pl*BITS_PER_MP_LIMB; + _fft_l = TMP_ALLOC_TYPE (k+1, int*); + for (i=0;i<=k;i++) + _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int); + mpn_fft_initl(_fft_l, k); + K = 1<<k; + M = N/K; /* N = 2^k M */ + l = M/BITS_PER_MP_LIMB; + maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB; + + Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime/BITS_PER_MP_LIMB; + TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n", + N, K, M, l, maxLK, Nprime, nprime)); + if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) { + maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB; + if (Nprime % maxLK) { + Nprime=((Nprime/maxLK)+1)*maxLK; + nprime = Nprime/BITS_PER_MP_LIMB; + } + TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime)); + } + + T = TMP_ALLOC_LIMBS (nprime+1); + Mp = Nprime/K; + + TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n", + pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K); + printf(" temp space %ld\n", 2*K*(nprime+1))); + + A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1)); + B = A+K*(nprime+1); + Ap = TMP_ALLOC_MP_PTRS (K); + Bp = TMP_ALLOC_MP_PTRS (K); + /* special decomposition for main call */ + for (i=0;i<K;i++) { + Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1); + /* store the next M bits of n into A[i] */ + /* supposes that M is a multiple of BITS_PER_MP_LIMB */ + if (nl>0) { + j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */ + MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T); + } + else MPN_ZERO(Ap[i], nprime+1); + nl -= l; + if (n!=m) { + if (ml>0) { + j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */ + MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j); + mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T); + } + else MPN_ZERO(Bp[i], nprime+1); + } + ml -= l; + } + mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0); + TMP_FREE(marker); + _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1)); +} + + +#if WANT_ASSERT +static int +#if __STDC__ +mpn_zero_p (mp_ptr p, mp_size_t n) +#else + mpn_zero_p (p, n) + mp_ptr p; + mp_size_t n; +#endif +{ + mp_size_t i; + + for (i = 0; i < n; i++) + { + if (p[i] != 0) + return 0; + } + + return 1; +} +#endif + + +/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}. + + FIXME: Duplicating the result like this is wasteful, do something better + perhaps at the norm_modF stage above. 
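+
+   Since pl = mpn_fft_next_size (nl+ml, k) is at least nl+ml, the true
+   product fits below 2^(pl*BITS_PER_MP_LIMB) and the reduction mod
+   2^(pl*BITS_PER_MP_LIMB)+1 cannot wrap around; this is why the limbs of
+   pad_op above position nl+ml are expected to be zero, as the ASSERT in
+   the function checks.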
*/ + +void +#if __STDC__ +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +#else +mpn_mul_fft_full (op, n, nl, m, ml) + mp_ptr op; + mp_srcptr n; + mp_size_t nl; + mp_srcptr m; + mp_size_t ml; +#endif +{ + mp_ptr pad_op; + mp_size_t pl; + int k; + int sqr = (n==m && nl==ml); + + k = mpn_fft_best_k (nl+ml, sqr); + pl = mpn_fft_next_size (nl+ml, k); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n", + nl, ml, pl, k)); + + pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1); + mpn_mul_fft (pad_op, pl, n, nl, m, ml, k); + + ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml))); + MPN_COPY (op, pad_op, nl+ml); + + _MP_FREE_FUNC_LIMBS (pad_op, pl+1); +} diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c new file mode 100644 index 0000000000..b7563be2d3 --- /dev/null +++ b/rts/gmp/mpn/generic/mul_n.c @@ -0,0 +1,1343 @@ +/* mpn_mul_n and helper function -- Multiply/square natural numbers. + + THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n) + ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH + THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB. + 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */ +#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1) + +#if !defined (__alpha) && !defined (__mips) +/* For all other machines, we want to call mpn functions for the compund + operations instead of open-coding them. */ +#define USE_MORE_MPN +#endif + +/*== Function declarations =================================================*/ + +static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, mp_srcptr, mp_srcptr, + mp_size_t, mp_size_t)); +static void interpolate3 _PROTO ((mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_srcptr, + mp_ptr, mp_ptr, mp_ptr, + mp_size_t, mp_size_t)); +static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t)); + + +/*-- mpn_kara_mul_n ---------------------------------------------------------------*/ + +/* Multiplies using 3 half-sized mults and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. 
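+ *
+ * Sketch of the underlying identity (even-length case): split
+ * a = a1*W + a0 and b = b1*W + b0 with W = 2^(n2*BITS_PER_MP_LIMB).  Then
+ *
+ *   a*b = a1*b1*W^2 + (a1*b1 + a0*b0 - (a1-a0)*(b1-b0))*W + a0*b0
+ *
+ * so only three half-sized products are needed: a1*b1, a0*b0 and
+ * (a1-a0)*(b1-b0).  The "sign" flag in the code records whether the two
+ * differences had opposite signs, i.e. whether that third product must be
+ * added rather than subtracted, since only absolute differences are formed.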
+ */ + +void +#if __STDC__ +mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_kara_mul_n(p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. */ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = b[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, b, b + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n3; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + if (n3 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n3, p + n3, n3); + mpn_mul_basecase (p, a, n3, b, n3); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + } + mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1); + mpn_kara_mul_n (p, a, b, n3, ws + n1); + mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = b[i]; + w1 = b[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = b + n2; + y = b; + sign ^= 1; + } + else + { + x = b; + y = b + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_MUL_THRESHOLD) + { + mpn_mul_basecase (ws, p, n2, p + n2, n2); + mpn_mul_basecase (p, a, n2, b, n2); + mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2); + } + else + { + mpn_kara_mul_n (ws, p, p + n2, n2, ws + n); + mpn_kara_mul_n (p, a, b, n2, ws + n); + mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +void +#if __STDC__ +mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_kara_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t i, sign, w, w0, w1; + mp_size_t n2; + mp_srcptr x, y; + + n2 = n >> 1; + ASSERT (n2 > 0); + + if (n & 1) + { + /* Odd length. 
*/ + mp_size_t n1, n3, nm1; + + n3 = n - n2; + + sign = 0; + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p, x, y, n2); + } + p[n2] = w; + + w = a[n2]; + if (w != 0) + w -= mpn_sub_n (p + n3, a, a + n3, n2); + else + { + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n3+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n3; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n3; + } + mpn_sub_n (p + n3, x, y, n2); + } + p[n] = w; + + n1 = n + 1; + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + if (n3 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n3); + mpn_sqr_basecase (p, a, n3); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + } + mpn_sqr_basecase (p + n1, a + n3, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n3, ws + n1); + mpn_kara_sqr_n (p, a, n3, ws + n1); + mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1); + } + + if (sign) + mpn_add_n (ws, p, ws, n1); + else + mpn_sub_n (ws, p, ws, n1); + + nm1 = n - 1; + if (mpn_add_n (ws, p + n1, ws, nm1)) + { + mp_limb_t x = ws[nm1] + 1; + ws[nm1] = x; + if (x == 0) + ++ws[n]; + } + if (mpn_add_n (p + n3, p + n3, ws, n1)) + { + mp_limb_t x; + i = n1 + n3; + do + { + x = p[i] + 1; + p[i] = x; + ++i; + } while (x == 0); + } + } + else + { + /* Even length. */ + mp_limb_t t; + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + sign = 0; + if (w0 < w1) + { + x = a + n2; + y = a; + sign = 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p, x, y, n2); + + i = n2; + do + { + --i; + w0 = a[i]; + w1 = a[n2+i]; + } + while (w0 == w1 && i != 0); + if (w0 < w1) + { + x = a + n2; + y = a; + sign ^= 1; + } + else + { + x = a; + y = a + n2; + } + mpn_sub_n (p + n2, x, y, n2); + + /* Pointwise products. */ + if (n2 < KARATSUBA_SQR_THRESHOLD) + { + mpn_sqr_basecase (ws, p, n2); + mpn_sqr_basecase (p, a, n2); + mpn_sqr_basecase (p + n, a + n2, n2); + } + else + { + mpn_kara_sqr_n (ws, p, n2, ws + n); + mpn_kara_sqr_n (p, a, n2, ws + n); + mpn_kara_sqr_n (p + n, a + n2, n2, ws + n); + } + + /* Interpolate. */ + if (sign) + w = mpn_add_n (ws, p, ws, n); + else + w = -mpn_sub_n (ws, p, ws, n); + w += mpn_add_n (ws, p + n, ws, n); + w += mpn_add_n (p + n2, p + n2, ws, n); + /* TO DO: could put "if (w) { ... }" here. + * Less work but badly predicted branch. + * No measurable difference in speed on Alpha. + */ + i = n + n2; + t = p[i] + w; + p[i] = t; + if (t < w) + { + do + { + ++i; + w = p[i] + 1; + p[i] = w; + } + while (w == 0); + } + } +} + +/*-- add2Times -------------------------------------------------------------*/ + +/* z[] = x[] + 2 * y[] + Note that z and x might point to the same vectors. 
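+
+   For illustration (made-up values): with BITS_PER_MP_LIMB = 32, n = 1,
+   x = {5} and y = {7} the result is z = {19} and the return value is 0.
+   In general the return value is the carry out of the top limb; it can be
+   0, 1 or 2 since both the doubling of y and the addition can carry.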
*/ +#ifdef USE_MORE_MPN +static inline mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_ptr t; + mp_limb_t c; + TMP_DECL (marker); + TMP_MARK (marker); + t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB); + c = mpn_lshift (t, y, n, 1); + c += mpn_add_n (z, x, t, n); + TMP_FREE (marker); + return c; +} +#else + +static mp_limb_t +#if __STDC__ +add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n) +#else +add2Times (z, x, y, n) + mp_ptr z; + mp_srcptr x; + mp_srcptr y; + mp_size_t n; +#endif +{ + mp_limb_t c, v, w; + + ASSERT (n > 0); + v = *x; w = *y; + c = w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + while (--n) + { + v = *x; + w = *y; + v += c; + c = v < c; + c += w >> (BITS_PER_MP_LIMB - 1); + w <<= 1; + v += w; + c += v < w; + *z = v; + ++x; ++y; ++z; + } + + return c; +} +#endif + +/*-- evaluate3 -------------------------------------------------------------*/ + +/* Evaluates: + * ph := 4*A+2*B+C + * p1 := A+B+C + * p2 := A+2*B+4*C + * where: + * ph[], p1[], p2[], A[] and B[] all have length len, + * C[] has length len2 with len-len2 = 0, 1 or 2. + * Returns top words (overflow) at pth, pt1 and pt2 respectively. + */ +#ifdef USE_MORE_MPN +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, len, len2) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_limb_t c, d, e; + + ASSERT (len - len2 <= 2); + + e = mpn_lshift (p1, B, len, 1); + + c = mpn_lshift (ph, A, len, 2); + c += e + mpn_add_n (ph, ph, p1, len); + d = mpn_add_n (ph, ph, C, len2); + if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d); + ASSERT (c < 7); + *pth = c; + + c = mpn_lshift (p2, C, len2, 2); +#if 1 + if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; } + c += e + mpn_add_n (p2, p2, p1, len); +#else + d = mpn_add_n (p2, p2, p1, len2); + c += d; + if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c); + c += e; +#endif + c += mpn_add_n (p2, p2, A, len); + ASSERT (c < 7); + *pt2 = c; + + c = mpn_add_n (p1, A, B, len); + d = mpn_add_n (p1, p1, C, len2); + if (len2 == len) c += d; + else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d); + ASSERT (c < 3); + *pt1 = c; + +} + +#else + +static void +#if __STDC__ +evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2, + mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls) +#else +evaluate3 (ph, p1, p2, pth, pt1, pt2, + A, B, C, l, ls) + mp_ptr ph; + mp_ptr p1; + mp_ptr p2; + mp_ptr pth; + mp_ptr pt1; + mp_ptr pt2; + mp_srcptr A; + mp_srcptr B; + mp_srcptr C; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2; + + ASSERT (l - ls <= 2); + + th = t1 = t2 = 0; + for (i = 0; i < l; ++i) + { + a = *A; + b = *B; + c = i < ls ? *C : 0; + + /* TO DO: choose one of the following alternatives. 
*/ +#if 0 + t = a << 2; + vh = th + t; + th = vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + vh += c; + th += vh < c; +#else + vh = th + c; + th = vh < c; + t = b << 1; + vh += t; + th += vh < t; + th += b >> (BITS_PER_MP_LIMB - 1); + t = a << 2; + vh += t; + th += vh < t; + th += a >> (BITS_PER_MP_LIMB - 2); +#endif + + v1 = t1 + a; + t1 = v1 < a; + v1 += b; + t1 += v1 < b; + v1 += c; + t1 += v1 < c; + + v2 = t2 + a; + t2 = v2 < a; + t = b << 1; + v2 += t; + t2 += v2 < t; + t2 += b >> (BITS_PER_MP_LIMB - 1); + t = c << 2; + v2 += t; + t2 += v2 < t; + t2 += c >> (BITS_PER_MP_LIMB - 2); + + *ph = vh; + *p1 = v1; + *p2 = v2; + + ++A; ++B; ++C; + ++ph; ++p1; ++p2; + } + + ASSERT (th < 7); + ASSERT (t1 < 3); + ASSERT (t2 < 7); + + *pth = th; + *pt1 = t1; + *pt2 = t2; +} +#endif + + +/*-- interpolate3 ----------------------------------------------------------*/ + +/* Interpolates B, C, D (in-place) from: + * 16*A+8*B+4*C+2*D+E + * A+B+C+D+E + * A+2*B+4*C+8*D+16*E + * where: + * A[], B[], C[] and D[] all have length l, + * E[] has length ls with l-ls = 0, 2 or 4. + * + * Reads top words (from earlier overflow) from ptb, ptc and ptd, + * and returns new top words there. + */ + +#ifdef USE_MORE_MPN +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, len, len2) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t len; + mp_size_t len2; +#endif +{ + mp_ptr ws; + mp_limb_t t, tb,tc,td; + TMP_DECL (marker); + TMP_MARK (marker); + + ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4); + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB); + + tb = *ptb; tc = *ptc; td = *ptd; + + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + + t = mpn_lshift (ws, A, len, 4); + tb -= t + mpn_sub_n (B, B, ws, len); + t = mpn_sub_n (B, B, E, len2); + if (len2 == len) tb -= t; + else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t); + + tc -= mpn_sub_n (C, C, A, len); + t = mpn_sub_n (C, C, E, len2); + if (len2 == len) tc -= t; + else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t); + + t = mpn_lshift (ws, E, len2, 4); + t += mpn_add_n (ws, ws, A, len2); +#if 1 + if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t); + td -= t + mpn_sub_n (D, D, ws, len); +#else + t += mpn_sub_n (D, D, ws, len2); + if (len2 != len) { + t = mpn_sub_1 (D+len2, D+len2, len-len2, t); + t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2); + } /* end if/else */ + td -= t; +#endif + + + /* b, d := b + d, b - d */ + +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... */ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* b := b-8*c */ + t = 8 * tc + mpn_lshift (ws, C, len, 3); + tb -= t + mpn_sub_n (B, B, ws, len); + + /* c := 2*c - b */ + tc = 2 * tc + mpn_lshift (C, C, len, 1); + tc -= tb + mpn_sub_n (C, C, B, len); + + /* d := d/3 */ + td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3; + + /* b, d := b + d, b - d */ +#ifdef HAVE_MPN_ADD_SUB_N + /* #error TO DO ... 
*/ +#else + t = tb + td + mpn_add_n (ws, B, D, len); + td = tb - td - mpn_sub_n (D, B, D, len); + tb = t; + MPN_COPY (B, ws, len); +#endif + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + ASSERT(!(*B & 3)); + mpn_rshift (B, B, len, 2); + B[len-1] |= tb<<(BITS_PER_MP_LIMB-2); + ASSERT((long)tb >= 0); + tb >>= 2; + + ASSERT(!(*C & 1)); + mpn_rshift (C, C, len, 1); + C[len-1] |= tc<<(BITS_PER_MP_LIMB-1); + ASSERT((long)tc >= 0); + tc >>= 1; + + ASSERT(!(*D & 3)); + mpn_rshift (D, D, len, 2); + D[len-1] |= td<<(BITS_PER_MP_LIMB-2); + ASSERT((long)td >= 0); + td >>= 2; + +#if WANT_ASSERT + ASSERT (tb < 2); + if (len == len2) + { + ASSERT (tc < 3); + ASSERT (td < 2); + } + else + { + ASSERT (tc < 2); + ASSERT (!td); + } +#endif + + *ptb = tb; + *ptc = tc; + *ptd = td; + + TMP_FREE (marker); +} + +#else + +static void +#if __STDC__ +interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E, + mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls) +#else +interpolate3 (A, B, C, D, E, + ptb, ptc, ptd, l, ls) + mp_srcptr A; + mp_ptr B; + mp_ptr C; + mp_ptr D; + mp_srcptr E; + mp_ptr ptb; + mp_ptr ptc; + mp_ptr ptd; + mp_size_t l; + mp_size_t ls; +#endif +{ + mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od; + const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1); + +#if WANT_ASSERT + t = l - ls; + ASSERT (t == 0 || t == 2 || t == 4); +#endif + + sb = sc = sd = 0; + for (i = 0; i < l; ++i) + { + mp_limb_t tb, tc, td, tt; + + a = *A; + b = *B; + c = *C; + d = *D; + e = i < ls ? *E : 0; + + /* Let x1, x2, x3 be the values to interpolate. We have: + * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e + * c = a + x1 + x2 + x3 + e + * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e + */ + + /* b := b - 16*a - e + * c := c - a - e + * d := d - a - 16*e + */ + t = a << 4; + tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t); + b -= t; + tb -= b < e; + b -= e; + tc = -(c < a); + c -= a; + tc -= c < e; + c -= e; + td = -(d < a); + d -= a; + t = e << 4; + td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t); + d -= t; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* b := b-8*c */ + t = c << 3; + tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t); + b -= t; + + /* c := 2*c - b */ + t = c << 1; + tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b); + c = t - b; + + /* d := d/3 */ + d *= INVERSE_3; + td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d); + td *= INVERSE_3; + + /* b, d := b + d, b - d */ + t = b + d; + tt = tb + td + (t < b); + td = tb - td - (b < d); + d = b - d; + b = t; + tb = tt; + + /* Now: + * b = 4*x1 + * c = 2*x2 + * d = 4*x3 + */ + + /* sb has period 2. */ + b += sb; + tb += b < sb; + sb &= maskOffHalf; + sb |= sb >> (BITS_PER_MP_LIMB >> 1); + sb += tb; + + /* sc has period 1. */ + c += sc; + tc += c < sc; + /* TO DO: choose one of the following alternatives. */ +#if 1 + sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1)); + sc += tc; +#else + sc = tc - ((long)sc < 0L); +#endif + + /* sd has period 2. */ + d += sd; + td += d < sd; + sd &= maskOffHalf; + sd |= sd >> (BITS_PER_MP_LIMB >> 1); + sd += td; + + if (i != 0) + { + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + } + ob = b >> 2; + oc = c >> 1; + od = d >> 2; + + ++A; ++B; ++C; ++D; ++E; + } + + /* Handle top words. 
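+
+     These are the overflow words accumulated in ptb, ptc and ptd; they go
+     through the same b,d := b+d,b-d / b := b-8*c / c := 2*c-b / d := d/3
+     steps as the in-vector limbs above, before the pending carries sb, sc
+     and sd are folded in.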
*/ + b = *ptb; + c = *ptc; + d = *ptd; + + t = b + d; + d = b - d; + b = t; + b -= c << 3; + c = (c << 1) - b; + d *= INVERSE_3; + t = b + d; + d = b - d; + b = t; + + b += sb; + c += sc; + d += sd; + + B[-1] = ob | b << (BITS_PER_MP_LIMB - 2); + C[-1] = oc | c << (BITS_PER_MP_LIMB - 1); + D[-1] = od | d << (BITS_PER_MP_LIMB - 2); + + b >>= 2; + c >>= 1; + d >>= 2; + +#if WANT_ASSERT + ASSERT (b < 2); + if (l == ls) + { + ASSERT (c < 3); + ASSERT (d < 2); + } + else + { + ASSERT (c < 2); + ASSERT (!d); + } +#endif + + *ptb = b; + *ptc = c; + *ptd = d; +} +#endif + + +/*-- mpn_toom3_mul_n --------------------------------------------------------------*/ + +/* Multiplies using 5 mults of one third size and so on recursively. + * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1]. + * No overlap of p[...] with a[...] or b[...]. + * ws is workspace. + */ + +/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the + * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n() + * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false. + */ + +#define TOOM3_MUL_REC(p, a, b, n, ws) \ + do { \ + if (n < KARATSUBA_MUL_THRESHOLD) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (n < TOOM3_MUL_THRESHOLD) \ + mpn_kara_mul_n (p, a, b, n, ws); \ + else \ + mpn_toom3_mul_n (p, a, b, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_mul_n (p, a, b, n, ws) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_MUL_REC(D, C, C + l, l, W); + tD = cD*dD; + if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD); + if (dD) tD += mpn_addmul_1 (D + l, C, l, dD); + ASSERT (tD < 49); + TOOM3_MUL_REC(C, B, B + l, l, W); + tC = cC*dC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC); + if (dC) tC += mpn_addmul_1 (C + l, B, l, dC); +#else + if (cC) + { + if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l); + else tC += add2Times (C + l, C + l, B + l, l); + } + if (dC) + { + if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l); + else tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_MUL_REC(B, A, A + l, l, W); + tB = cB*dB; + if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB); + if (dB) tB += mpn_addmul_1 (B + l, A, l, dB); + ASSERT (tB < 49); + TOOM3_MUL_REC(A, a, b, l, W); + TOOM3_MUL_REC(E, a + l2, b + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. 
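+
+      Writing W = 2^(l*BITS_PER_MP_LIMB), the product is
+      A + B*W + C*W^2 + D*W^3 + E*W^4.  A, C and E already sit in p at limb
+      offsets 0, 2l and 4l, so only B and D need to be added in, at offsets
+      l and 3l, with the overflow words tB, tC and tD propagated as carries
+      at offsets 3l, 4l and 5l.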
**/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/ + +/* Like previous function but for squaring */ + +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (n < KARATSUBA_SQR_THRESHOLD) \ + mpn_sqr_basecase (p, a, n); \ + else if (n < TOOM3_SQR_THRESHOLD) \ + mpn_kara_sqr_n (p, a, n, ws); \ + else \ + mpn_toom3_sqr_n (p, a, n, ws); \ + } while (0) + +void +#if __STDC__ +mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws) +#else +mpn_toom3_sqr_n (p, a, n, ws) + mp_ptr p; + mp_srcptr a; + mp_size_t n; + mp_ptr ws; +#endif +{ + mp_limb_t cB,cC,cD, tB,tC,tD; + mp_limb_t *A,*B,*C,*D,*E, *W; + mp_size_t l,l2,l3,l4,l5,ls; + + /* Break n words into chunks of size l, l and ls. + * n = 3*k => l = k, ls = k + * n = 3*k+1 => l = k+1, ls = k-1 + * n = 3*k+2 => l = k+1, ls = k + */ + { + mp_limb_t m; + + ASSERT (n >= TOOM3_MUL_THRESHOLD); + l = ls = n / 3; + m = n - l * 3; + if (m != 0) + ++l; + if (m == 1) + --ls; + + l2 = l * 2; + l3 = l * 3; + l4 = l * 4; + l5 = l * 5; + A = p; + B = ws; + C = p + l2; + D = ws + l2; + E = p + l4; + W = ws + l4; + } + + /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/ + evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls); + + /** Second stage: pointwise multiplies. **/ + TOOM3_SQR_REC(D, C, l, W); + tD = cD*cD; + if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD); + ASSERT (tD < 49); + TOOM3_SQR_REC(C, B, l, W); + tC = cC*cC; + /* TO DO: choose one of the following alternatives. */ +#if 0 + if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC); +#else + if (cC >= 1) + { + tC += add2Times (C + l, C + l, B, l); + if (cC == 2) + tC += add2Times (C + l, C + l, B, l); + } +#endif + ASSERT (tC < 9); + TOOM3_SQR_REC(B, A, l, W); + tB = cB*cB; + if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB); + ASSERT (tB < 49); + TOOM3_SQR_REC(A, a, l, W); + TOOM3_SQR_REC(E, a + l2, ls, W); + + /** Third stage: interpolation. **/ + interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1); + + /** Final stage: add up the coefficients. **/ + { + mp_limb_t i, x, y; + tB += mpn_add_n (p + l, p + l, B, l2); + tD += mpn_add_n (p + l3, p + l3, D, l2); + mpn_incr_u (p + l3, tB); + mpn_incr_u (p + l4, tC); + mpn_incr_u (p + l5, tD); + } +} + +void +#if __STDC__ +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) +#else +mpn_mul_n (p, a, b, n) + mp_ptr p; + mp_srcptr a; + mp_srcptr b; + mp_size_t n; +#endif +{ + if (n < KARATSUBA_MUL_THRESHOLD) + mpn_mul_basecase (p, a, n, b, n); + else if (n < TOOM3_MUL_THRESHOLD) + { + /* Allocate workspace of fixed size on stack: fast! */ +#if TUNE_PROGRAM_BUILD + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB]; +#else + mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB]; +#endif + mpn_kara_mul_n (p, a, b, n, ws); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else if (n < FFT_MUL_THRESHOLD) +#else + else +#endif + { + /* Use workspace of unknown size in heap, as stack space may + * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the + * multiplication will take much longer than malloc()/free(). 
*/ + mp_limb_t wsLen, *ws; + wsLen = 2 * n + 3 * BITS_PER_MP_LIMB; + ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t)); + mpn_toom3_mul_n (p, a, b, n, ws); + (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t)); + } +#if WANT_FFT || TUNE_PROGRAM_BUILD + else + { + mpn_mul_fft_full (p, a, n, b, n); + } +#endif +} diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c new file mode 100644 index 0000000000..42ee3405d7 --- /dev/null +++ b/rts/gmp/mpn/generic/perfsqr.c @@ -0,0 +1,123 @@ +/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, + zero otherwise. + +Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + + +/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue + modulo 0x100. */ +static unsigned char const sq_res_0x100[0x100] = +{ + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, +}; + +int +#if __STDC__ +mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) +#else +mpn_perfect_square_p (up, usize) + mp_srcptr up; + mp_size_t usize; +#endif +{ + mp_limb_t rem; + mp_ptr root_ptr; + int res; + TMP_DECL (marker); + + /* The first test excludes 55/64 (85.9%) of the perfect square candidates + in O(1) time. */ + if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0) + return 0; + +#if defined (PP) + /* The second test excludes 30652543/30808063 (99.5%) of the remaining + perfect square candidates in O(n) time. */ + + /* Firstly, compute REM = A mod PP. */ + if (UDIV_TIME > (2 * UMUL_TIME + 6)) + rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED); + else + rem = mpn_mod_1 (up, usize, (mp_limb_t) PP); + + /* Now decide if REM is a quadratic residue modulo the factors in PP. */ + + /* If A is just a few limbs, computing the square root does not take long + time, so things might run faster if we limit this loop according to the + size of A. 
*/ + +#if BITS_PER_MP_LIMB == 64 + if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0) + return 0; + if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0) + return 0; + if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0) + return 0; + if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0) + return 0; + if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0) + return 0; + if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0) + return 0; +#endif + if (((0x13D122F3L >> rem % 29) & 1) == 0) + return 0; + if (((0x5335FL >> rem % 23) & 1) == 0) + return 0; + if (((0x30AF3L >> rem % 19) & 1) == 0) + return 0; + if (((0x1A317L >> rem % 17) & 1) == 0) + return 0; + if (((0x161BL >> rem % 13) & 1) == 0) + return 0; + if (((0x23BL >> rem % 11) & 1) == 0) + return 0; + if (((0x017L >> rem % 7) & 1) == 0) + return 0; + if (((0x13L >> rem % 5) & 1) == 0) + return 0; + if (((0x3L >> rem % 3) & 1) == 0) + return 0; +#endif + + TMP_MARK (marker); + + /* For the third and last test, we finally compute the square root, + to make sure we've really got a perfect square. */ + root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB); + + /* Iff mpn_sqrtrem returns zero, the square is perfect. */ + res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); + TMP_FREE (marker); + return res; +} diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c new file mode 100644 index 0000000000..387be9536d --- /dev/null +++ b/rts/gmp/mpn/generic/popcount.c @@ -0,0 +1,93 @@ +/* popcount.c + +Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined __GNUC__ +/* No processor claiming to be SPARC v9 compliant seem to + implement the POPC instruction. Disable pattern for now. */ +#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64 +#define popc_limb(a) \ + ({ \ + DItype __res; \ + asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \ + __res; \ + }) +#endif +#endif + +#ifndef popc_limb + +/* Cool population count of a mp_limb_t. + You have to figure out how this works, I won't tell you! */ + +static inline unsigned int +#if __STDC__ +popc_limb (mp_limb_t x) +#else +popc_limb (x) + mp_limb_t x; +#endif +{ +#if BITS_PER_MP_LIMB == 64 + /* We have to go into some trouble to define these constants. + (For mp_limb_t being `long long'.) 
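+
+     (Reviewer's note, not in the original source: this is the usual
+     divide-and-conquer bit count.  The first step, x -= (x & 0xaaaa..) >> 1,
+     leaves every 2-bit field holding the number of bits that were set in
+     it; the 0x3333.. step sums adjacent fields into 4-bit counts, the
+     0x0f0f.. step into byte counts, and the remaining shifts add the byte
+     counts together, the final mask keeping only the total.  For an 8-bit
+     value 0b11011010 the stages give 10 01 01 01, then 0011 0010, then 5.)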
*/ + mp_limb_t cnst; + cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2); + x -= (x & cnst) >> 1; + cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2); + x = ((x & ~cnst) >> 2) + (x & cnst); + cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2); + x = ((x >> 4) + x) & cnst; + x = ((x >> 8) + x); + x = ((x >> 16) + x); + x = ((x >> 32) + x) & 0xff; +#endif +#if BITS_PER_MP_LIMB == 32 + x -= (x & 0xaaaaaaaa) >> 1; + x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L); + x = ((x >> 4) + x) & 0x0f0f0f0fL; + x = ((x >> 8) + x); + x = ((x >> 16) + x) & 0xff; +#endif + return x; +} +#endif + +unsigned long int +#if __STDC__ +mpn_popcount (register mp_srcptr p, register mp_size_t size) +#else +mpn_popcount (p, size) + register mp_srcptr p; + register mp_size_t size; +#endif +{ + unsigned long int popcnt; + mp_size_t i; + + popcnt = 0; + for (i = 0; i < size; i++) + popcnt += popc_limb (p[i]); + + return popcnt; +} diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c new file mode 100644 index 0000000000..27179683b3 --- /dev/null +++ b/rts/gmp/mpn/generic/pre_mod_1.c @@ -0,0 +1,69 @@ +/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, + divisor_limb_inverted) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB. + DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB + + - 2^BITS_PER_MP_LIMB. + Return the single-limb remainder. + +Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef UMUL_TIME +#define UMUL_TIME 1 +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME UMUL_TIME +#endif + +mp_limb_t +#if __STDC__ +mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size, + mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted) +#else +mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted) + mp_srcptr dividend_ptr; + mp_size_t dividend_size; + mp_limb_t divisor_limb; + mp_limb_t divisor_limb_inverted; +#endif +{ + mp_size_t i; + mp_limb_t n0, r; + int dummy; + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if (r >= divisor_limb) + r = 0; + else + i--; + + for (; i >= 0; i--) + { + n0 = dividend_ptr[i]; + udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted); + } + return r; +} diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c new file mode 100644 index 0000000000..dea4e20e56 --- /dev/null +++ b/rts/gmp/mpn/generic/random.c @@ -0,0 +1,43 @@ +/* mpn_random -- Generate random numbers. + +Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "urandom.h" + +void +#if __STDC__ +mpn_random (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + mp_size_t i; + + for (i = 0; i < size; i++) + res_ptr[i] = urandom (); + + /* Make sure the most significant limb is non-zero. */ + while (res_ptr[size - 1] == 0) + res_ptr[size - 1] = urandom (); +} diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c new file mode 100644 index 0000000000..86682f81fa --- /dev/null +++ b/rts/gmp/mpn/generic/random2.c @@ -0,0 +1,105 @@ +/* mpn_random2 -- Generate random numbers with relatively long strings + of ones and zeroes. Suitable for border testing. + +Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4) +/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */ +long mrand48 (); +static inline long +random () +{ + return mrand48 (); +} +#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__)) +/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */ +static inline unsigned long +random () +{ + return rand () ^ (rand () << 16) ^ (rand() << 32); +} + +#else +long random (); +#endif + +/* It's a bit tricky to get this right, so please test the code well + if you hack with it. Some early versions of the function produced + random numbers with the leading limb == 0, and some versions never + made the most significant bit set. */ + +void +#if __STDC__ +mpn_random2 (mp_ptr res_ptr, mp_size_t size) +#else +mpn_random2 (res_ptr, size) + mp_ptr res_ptr; + mp_size_t size; +#endif +{ + int n_bits; + int bit_pos; + mp_size_t limb_pos; + unsigned int ran; + mp_limb_t limb; + + limb = 0; + + /* Start off in a random bit position in the most significant limb. 
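+
+   (Reviewer's note, not in the original source: the loop below emits
+   alternating runs of one-bits and zero-bits, each run between 1 and
+   BITS_PER_MP_LIMB bits long; bit 0 of ran selects ones versus zeroes
+   and its remaining bits select the run length, which is what produces
+   the "long strings of ones and zeroes" promised in the file header.)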
*/ + bit_pos = random () & (BITS_PER_MP_LIMB - 1); + + /* Least significant bit of RAN chooses string of ones/string of zeroes. + Make most significant limb be non-zero by setting bit 0 of RAN. */ + ran = random () | 1; + + for (limb_pos = size - 1; limb_pos >= 0; ) + { + n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1; + if ((ran & 1) != 0) + { + /* Generate a string of ones. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1); + bit_pos += BITS_PER_MP_LIMB; + limb = (~(mp_limb_t) 0) << (bit_pos - n_bits); + } + else + { + limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1); + } + } + else + { + /* Generate a string of zeroes. */ + if (n_bits >= bit_pos) + { + res_ptr[limb_pos--] = limb; + limb = 0; + bit_pos += BITS_PER_MP_LIMB; + } + } + bit_pos -= n_bits; + ran = random (); + } +} diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c new file mode 100644 index 0000000000..59caf73529 --- /dev/null +++ b/rts/gmp/mpn/generic/rshift.c @@ -0,0 +1,88 @@ +/* mpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right + and store the USIZE least significant limbs of the result at WP. + The bits shifted out to the right are returned. + + Argument constraints: + 1. 0 < CNT < BITS_PER_MP_LIMB + 2. If the result is to be written over the input, WP must be <= UP. +*/ + +mp_limb_t +#if __STDC__ +mpn_rshift (register mp_ptr wp, + register mp_srcptr up, mp_size_t usize, + register unsigned int cnt) +#else +mpn_rshift (wp, up, usize, cnt) + register mp_ptr wp; + register mp_srcptr up; + mp_size_t usize; + register unsigned int cnt; +#endif +{ + register mp_limb_t high_limb, low_limb; + register unsigned sh_1, sh_2; + register mp_size_t i; + mp_limb_t retval; + +#ifdef DEBUG + if (usize == 0 || cnt == 0) + abort (); +#endif + + sh_1 = cnt; + +#if 0 + if (sh_1 == 0) + { + if (wp != up) + { + /* Copy from low end to high end, to allow specified input/output + overlapping. 
*/ + for (i = 0; i < usize; i++) + wp[i] = up[i]; + } + return usize; + } +#endif + + wp -= 1; + sh_2 = BITS_PER_MP_LIMB - sh_1; + high_limb = up[0]; + retval = high_limb << sh_2; + low_limb = high_limb; + + for (i = 1; i < usize; i++) + { + high_limb = up[i]; + wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); + low_limb = high_limb; + } + wp[i] = low_limb >> sh_1; + + return retval; +} diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c new file mode 100644 index 0000000000..a269e34f5f --- /dev/null +++ b/rts/gmp/mpn/generic/sb_divrem_mn.c @@ -0,0 +1,201 @@ +/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and + quotient. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + + +Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + the NSIZE-DSIZE least significant quotient limbs at QP + and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + non-zero, generate that many fraction bits and append them after the + other quotient limbs. + Return the most significant limb of the quotient, this is always 0 or 1. + + Preconditions: + 0. NSIZE >= DSIZE. + 1. The most significant bit of the divisor must be set. + 2. QP must either not overlap with the input operands at all, or + QP + DSIZE >= NP must hold true. (This means that it's + possible to put the quotient in the high part of NUM, right after the + remainder in NUM. + 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. + 4. DSIZE >= 2. */ + + +#define PREINVERT_VIABLE \ + (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */) + +mp_limb_t +#if __STDC__ +mpn_sb_divrem_mn (mp_ptr qp, + mp_ptr np, mp_size_t nsize, + mp_srcptr dp, mp_size_t dsize) +#else +mpn_sb_divrem_mn (qp, np, nsize, dp, dsize) + mp_ptr qp; + mp_ptr np; + mp_size_t nsize; + mp_srcptr dp; + mp_size_t dsize; +#endif +{ + mp_limb_t most_significant_q_limb = 0; + mp_size_t i; + mp_limb_t dx, d1, n0; + mp_limb_t dxinv; + int have_preinv; + + ASSERT_ALWAYS (dsize > 2); + + np += nsize - dsize; + dx = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if (n0 >= dx) + { + if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0) + { + mpn_sub_n (np, np, dp, dsize); + most_significant_q_limb = 1; + } + } + + /* If multiplication is much faster than division, preinvert the + most significant divisor limb before entering the loop. 
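+
+     (Reviewer's note, not in the original source: the inverse computed by
+     invert_limb is the same quantity described for mpn_preinv_mod_1
+     earlier in this patch, roughly 2^(2*BITS_PER_MP_LIMB)/dx -
+     2^BITS_PER_MP_LIMB for the normalized dx, and udiv_qrnnd_preinv then
+     replaces each per-limb division by a few multiplications.  The test
+     below only bothers when the saving over the nsize-dsize quotient
+     limbs exceeds the cost of computing the inverse, which is itself
+     about one division.)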
*/ + if (PREINVERT_VIABLE) + { + have_preinv = 0; + if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME) + { + invert_limb (dxinv, dx); + have_preinv = 1; + } + } + + for (i = nsize - dsize - 1; i >= 0; i--) + { + mp_limb_t q; + mp_limb_t nx; + mp_limb_t cy_limb; + + nx = np[dsize - 1]; + np--; + + if (nx == dx) + { + /* This might over-estimate q, but it's probably not worth + the extra code here to find out. */ + q = ~(mp_limb_t) 0; + +#if 1 + cy_limb = mpn_submul_1 (np, dp, dsize, q); +#else + /* This should be faster on many machines */ + cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize); + cy = mpn_add_n (np, np, dp, dsize); + np[dsize] += cy; +#endif + + if (nx != cy_limb) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + + qp[i] = q; + } + else + { + mp_limb_t rx, r1, r0, p1, p0; + + /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register + usage when np[dsize-1] is used in an asm statement like + umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due + to registers being clobbered. gcc 2.95 i386 doesn't have the + problem. */ + { + mp_limb_t workaround = np[dsize - 1]; + if (PREINVERT_VIABLE && have_preinv) + udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv); + else + udiv_qrnnd (q, r1, nx, workaround, dx); + } + umul_ppmm (p1, p0, d1, q); + + r0 = np[dsize - 2]; + rx = 0; + if (r1 < p1 || (r1 == p1 && r0 < p0)) + { + p1 -= p0 < d1; + p0 -= d1; + q--; + r1 += dx; + rx = r1 < dx; + } + + p1 += r0 < p0; /* cannot carry! */ + rx -= r1 < p1; /* may become 11..1 if q is still too large */ + r1 -= p1; + r0 -= p0; + + cy_limb = mpn_submul_1 (np, dp, dsize - 2, q); + + { + mp_limb_t cy1, cy2; + cy1 = r0 < cy_limb; + r0 -= cy_limb; + cy2 = r1 < cy1; + r1 -= cy1; + np[dsize - 1] = r1; + np[dsize - 2] = r0; + if (cy2 != rx) + { + mpn_add_n (np, np, dp, dsize); + q--; + } + } + qp[i] = q; + } + } + + /* ______ ______ ______ + |__rx__|__r1__|__r0__| partial remainder + ______ ______ + - |__p1__|__p0__| partial product to subtract + ______ ______ + - |______|cylimb| + + rx is -1, 0 or 1. If rx=1, then q is correct (it should match + carry out). If rx=-1 then q is too large. If rx=0, then q might + be too large, but it is most likely correct. + */ + + return most_significant_q_limb; +} diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c new file mode 100644 index 0000000000..96f05ce854 --- /dev/null +++ b/rts/gmp/mpn/generic/scan0.c @@ -0,0 +1,62 @@ +/* mpn_scan0 -- Scan from a given bit position for the next clear bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. 
Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner ot later have a limb with a clear bit. + */ + +unsigned long int +#if __STDC__ +mpn_scan0 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan0 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = ~*p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = ~*p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c new file mode 100644 index 0000000000..98e2e0dcc0 --- /dev/null +++ b/rts/gmp/mpn/generic/scan1.c @@ -0,0 +1,62 @@ +/* mpn_scan1 -- Scan from a given bit position for the next set bit. + +Copyright (C) 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Design issues: + 1. What if starting_bit is not within U? Caller's problem? + 2. Bit index should be 'unsigned'? + + Argument constraints: + 1. U must sooner ot later have a limb != 0. + */ + +unsigned long int +#if __STDC__ +mpn_scan1 (register mp_srcptr up, + register unsigned long int starting_bit) +#else +mpn_scan1 (up, starting_bit) + register mp_srcptr up; + register unsigned long int starting_bit; +#endif +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / BITS_PER_MP_LIMB; + p = up + starting_word; + alimb = *p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB); + + while (alimb == 0) + alimb = *p++; + + count_leading_zeros (cnt, alimb & -alimb); + return (p - up) * BITS_PER_MP_LIMB - 1 - cnt; +} diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c new file mode 100644 index 0000000000..e6ccc92154 --- /dev/null +++ b/rts/gmp/mpn/generic/set_str.c @@ -0,0 +1,159 @@ +/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) + -- Convert a STR_LEN long base BASE byte string pointed to by STR to a + limb vector pointed to by RES_PTR. Return the number of limbs in + RES_PTR. + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_size_t +#if __STDC__ +mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base) +#else +mpn_set_str (xp, str, str_len, base) + mp_ptr xp; + const unsigned char *str; + size_t str_len; + int base; +#endif +{ + mp_size_t size; + mp_limb_t big_base; + int indigits_per_limb; + mp_limb_t res_digit; + + big_base = __mp_bases[base].big_base; + indigits_per_limb = __mp_bases[base].chars_per_limb; + +/* size = str_len / indigits_per_limb + 1; */ + + size = 0; + + if ((base & (base - 1)) == 0) + { + /* The base is a power of 2. Read the input string from + least to most significant character/digit. */ + + const unsigned char *s; + int next_bitpos; + int bits_per_indigit = big_base; + + res_digit = 0; + next_bitpos = 0; + + for (s = str + str_len - 1; s >= str; s--) + { + int inp_digit = *s; + + res_digit |= (mp_limb_t) inp_digit << next_bitpos; + next_bitpos += bits_per_indigit; + if (next_bitpos >= BITS_PER_MP_LIMB) + { + xp[size++] = res_digit; + next_bitpos -= BITS_PER_MP_LIMB; + res_digit = inp_digit >> (bits_per_indigit - next_bitpos); + } + } + + if (res_digit != 0) + xp[size++] = res_digit; + } + else + { + /* General case. The base is not a power of 2. */ + + size_t i; + int j; + mp_limb_t cy_limb; + + for (i = indigits_per_limb; i < str_len; i += indigits_per_limb) + { + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * 10 + *str++; + } + else + { + for (j = 1; j < indigits_per_limb; j++) + res_digit = res_digit * base + *str++; + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + big_base = base; + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. 
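+
+     (Reviewer's note, worked example not in the original source: in base
+     10 with 32-bit limbs, chars_per_limb is 9, since 10^9 is the largest
+     power of 10 fitting in a limb.  A 13-digit string is thus handled as
+     one full group of 9 digits by the loop above, and this tail code
+     collects the remaining 4 digits into res_digit while big_base grows
+     to 10^4; the accumulated result is then updated below as
+     x := x * 10^4 + res_digit.)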
*/ + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * 10 + *str++; + big_base *= 10; + } + } + else + { + for (j = 1; j < str_len - (i - indigits_per_limb); j++) + { + res_digit = res_digit * base + *str++; + big_base *= base; + } + } + + if (size == 0) + { + if (res_digit != 0) + { + xp[0] = res_digit; + size = 1; + } + } + else + { + cy_limb = mpn_mul_1 (xp, xp, size, big_base); + cy_limb += mpn_add_1 (xp, xp, size, res_digit); + if (cy_limb != 0) + xp[size++] = cy_limb; + } + } + + return size; +} diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000000..760258a3e0 --- /dev/null +++ b/rts/gmp/mpn/generic/sqr_basecase.c @@ -0,0 +1,83 @@ +/* mpn_sqr_basecase -- Internal routine to square two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +void +#if __STDC__ +mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n) +#else +mpn_sqr_basecase (prodp, up, n) + mp_ptr prodp; + mp_srcptr up; + mp_size_t n; +#endif +{ + mp_size_t i; + + { + /* N.B.! We need the superfluous indirection through argh to work around + a reloader bug in GCC 2.7.*. */ + mp_limb_t x; + mp_limb_t argh; + x = up[0]; + umul_ppmm (argh, prodp[0], x, x); + prodp[1] = argh; + } + if (n > 1) + { + mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= KARATSUBA_SQR_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + for (i = 1; i < n; i++) + { + mp_limb_t x; + x = up[i]; + umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x); + } + { + mp_limb_t cy; + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2); + prodp[2 * n - 1] += cy; + } + } +} diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c new file mode 100644 index 0000000000..ee3b5144dd --- /dev/null +++ b/rts/gmp/mpn/generic/sqrtrem.c @@ -0,0 +1,509 @@ +/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + + Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR. + Write the remainder at REM_PTR, if REM_PTR != NULL. + Return the size of the remainder. + (The size of the root is always half of the size of the operand.) + + OP_PTR and ROOT_PTR may not point to the same object. 
+ OP_PTR and REM_PTR may point to the same object. + + If REM_PTR is NULL, only the root is computed and the return value of + the function is 0 if OP is a perfect square, and *any* non-zero number + otherwise. + +Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* This code is just correct if "unsigned char" has at least 8 bits. It + doesn't help to use CHAR_BIT from limits.h, as the real problem is + the static arrays. */ + +#include <stdio.h> /* for NULL */ +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +/* Square root algorithm: + + 1. Shift OP (the input) to the left an even number of bits s.t. there + are an even number of words and either (or both) of the most + significant bits are set. This way, sqrt(OP) has exactly half as + many words as OP, and has its most significant bit set. + + 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables. + This approximation is used for the first single-precision + iterations of Newton's method, yielding a full-word approximation + to sqrt(OP). + + 3. Perform multiple-precision Newton iteration until we have the + exact result. Only about half of the input operand is used in + this calculation, as the square root is perfectly determinable + from just the higher half of a number. */ + +/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */ +#if defined __GNUC__ && ! defined __SOFT_FLOAT + +#if defined (__sparc__) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__HAVE_68881__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (__hppa) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \ + __sqrt_res; \ + }) +#endif + +#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32 +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \ + __sqrt_res; \ + }) +#endif + +#if 0 +#if defined (__i386__) || defined (__i486__) +#define SQRT(a) \ + ({ \ + double __sqrt_res; \ + asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \ + __sqrt_res; \ + }) +#endif +#endif + +#endif + +#ifndef SQRT + +/* Tables for initial approximation of the square root. These are + indexed with bits 1-8 of the operand for which the square root is + calculated, where bit 0 is the most significant non-zero bit. I.e. + the most significant one-bit is not used, since that per definition + is one. Likewise, the tables don't return the highest bit of the + result. 
That bit must be inserted by or:ing the returned value with + 0x100. This way, we get a 9-bit approximation from 8-bit tables! */ + +/* Table to be used for operands with an even total number of bits. + (Exactly as in the decimal system there are similarities between the + square root of numbers with the same initial digits and an even + difference in the total number of digits. Consider the square root + of 1, 10, 100, 1000, ...) */ +static const unsigned char even_approx_tab[256] = +{ + 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e, + 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74, + 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79, + 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f, + 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84, + 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89, + 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f, + 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94, + 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99, + 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e, + 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3, + 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7, + 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac, + 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1, + 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6, + 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba, + 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf, + 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3, + 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8, + 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc, + 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1, + 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5, + 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda, + 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde, + 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2, + 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6, + 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb, + 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef, + 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3, + 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7, + 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb, + 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff, +}; + +/* Table to be used for operands with an odd total number of bits. + (Further comments before previous table.) 
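+
+   (Reviewer's note, not in the original source: concretely, once T has
+   been normalized so that the top bit of its high limb is set, the index
+   is the 8 bits immediately below that top bit, i.e.
+   (t_high0 >> (BITS_PER_MP_LIMB - 9)) & 0xff, looked up in
+   even_approx_tab; the 9-bit value 0x100 | table_entry, shifted back to
+   the top of a limb, is the starting approximation that the Newton steps
+   then refine.  The odd table plays the same role when only the
+   second-highest bit of the high limb is set.)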
*/ +static const unsigned char odd_approx_tab[256] = +{ + 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, + 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, + 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b, + 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f, + 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, + 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, + 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a, + 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d, + 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21, + 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24, + 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28, + 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b, + 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f, + 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32, + 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35, + 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39, + 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c, + 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f, + 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42, + 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45, + 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49, + 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c, + 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f, + 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52, + 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55, + 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58, + 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b, + 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e, + 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61, + 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63, + 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66, + 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69, +}; +#endif + + +mp_size_t +#if __STDC__ +mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size) +#else +mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size) + mp_ptr root_ptr; + mp_ptr rem_ptr; + mp_srcptr op_ptr; + mp_size_t op_size; +#endif +{ + /* R (root result) */ + mp_ptr rp; /* Pointer to least significant word */ + mp_size_t rsize; /* The size in words */ + + /* T (OP shifted to the left a.k.a. normalized) */ + mp_ptr tp; /* Pointer to least significant word */ + mp_size_t tsize; /* The size in words */ + mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */ + mp_limb_t t_high0, t_high1; /* The two most significant words */ + + /* TT (temporary for numerator/remainder) */ + mp_ptr ttp; /* Pointer to least significant word */ + + /* X (temporary for quotient in main loop) */ + mp_ptr xp; /* Pointer to least significant word */ + mp_size_t xsize; /* The size in words */ + + unsigned cnt; + mp_limb_t initial_approx; /* Initially made approximation */ + mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */ + mp_size_t tmp; + mp_size_t i; + + mp_limb_t cy_limb; + TMP_DECL (marker); + + /* If OP is zero, both results are zero. */ + if (op_size == 0) + return 0; + + count_leading_zeros (cnt, op_ptr[op_size - 1]); + tsize = op_size; + if ((tsize & 1) != 0) + { + cnt += BITS_PER_MP_LIMB; + tsize++; + } + + rsize = tsize / 2; + rp = root_ptr; + + TMP_MARK (marker); + + /* Shift OP an even number of bits into T, such that either the most or + the second most significant bit is set, and such that the number of + words in T becomes even. This way, the number of words in R=sqrt(OP) + is exactly half as many as in OP, and the most significant bit of R + is set. + + Also, the initial approximation is simplified by this up-shifted OP. 
+ + Finally, the Newtonian iteration which is the main part of this + program performs division by R. The fast division routine expects + the divisor to be "normalized" in exactly the sense of having the + most significant bit set. */ + + tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + + if ((cnt & ~1) % BITS_PER_MP_LIMB != 0) + t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size, + (cnt & ~1) % BITS_PER_MP_LIMB); + else + MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size); + + if (cnt >= BITS_PER_MP_LIMB) + tp[0] = 0; + + t_high0 = tp[tsize - 1]; + t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */ + +/* Is there a fast sqrt instruction defined for this machine? */ +#ifdef SQRT + { + initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1); + /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have + become incorrect due to overflow in the conversion from double to + mp_limb_t above. It will typically be zero in that case, but might be + a small number on some machines. The most significant bit of + INITIAL_APPROX should be set, so that bit is a good overflow + indication. */ + if ((mp_limb_signed_t) initial_approx >= 0) + initial_approx = ~(mp_limb_t)0; + } +#else + /* Get a 9 bit approximation from the tables. The tables expect to + be indexed with the 8 high bits right below the highest bit. + Also, the highest result bit is not returned by the tables, and + must be or:ed into the result. The scheme gives 9 bits of start + approximation with just 256-entry 8 bit tables. */ + + if ((cnt & 1) == 0) + { + /* The most significant bit of t_high0 is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1); + initial_approx &= 0xff; + initial_approx = even_approx_tab[initial_approx]; + } + else + { + /* The most significant bit of t_high0 is unset, + the second most significant is set. */ + initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2); + initial_approx &= 0xff; + initial_approx = odd_approx_tab[initial_approx]; + } + initial_approx |= 0x100; + initial_approx <<= BITS_PER_MP_LIMB - 8 - 1; + + /* Perform small precision Newtonian iterations to get a full word + approximation. For small operands, these iterations will do the + entire job. */ + if (t_high0 == ~(mp_limb_t)0) + initial_approx = t_high0; + else + { + mp_limb_t quot; + + if (t_high0 >= initial_approx) + initial_approx = t_high0 + 1; + + /* First get about 18 bits with pure C arithmetics. */ + quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2; + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + + /* Now get a full word by one (or for > 36 bit machines) several + iterations. */ + for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1) + { + mp_limb_t ignored_remainder; + + udiv_qrnnd (quot, ignored_remainder, + t_high0, t_high1, initial_approx); + initial_approx = (initial_approx + quot) / 2; + initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } +#endif + + rp[0] = initial_approx; + rsize = 1; + +#ifdef SQRT_DEBUG + printf ("\n\nT = "); + mpn_dump (tp, tsize); +#endif + + if (tsize > 2) + { + /* Determine the successive precisions to use in the iteration. We + minimize the precisions, beginning with the highest (i.e. last + iteration) to the lowest (i.e. first iteration). 
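+
+   (Reviewer's note, worked example not in the original source: for a
+   normalized operand of tsize = 8 limbs the loop below starts from
+   tmp = 4 and records tsizes[0] = 6, tsizes[1] = 3 before breaking, so
+   the multiple-precision Newton iterations run first on a 3-limb slice
+   of T and then on a 6-limb slice, roughly doubling the precision at
+   each step.)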
*/ + + xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB); + + t_end_ptr = tp + tsize; + + tmp = tsize / 2; + for (i = 0;; i++) + { + tsize = (tmp + 1) / 2; + if (tmp == tsize) + break; + tsizes[i] = tsize + tmp; + tmp = tsize; + } + + /* Main Newton iteration loop. For big arguments, most of the + time is spent here. */ + + /* It is possible to do a great optimization here. The successive + divisors in the mpn_divmod call below have more and more leading + words equal to its predecessor. Therefore the beginning of + each division will repeat the same work as did the last + division. If we could guarantee that the leading words of two + consecutive divisors are the same (i.e. in this case, a later + divisor has just more digits at the end) it would be a simple + matter of just using the old remainder of the last division in + a subsequent division, to take care of this optimization. This + idea would surely make a difference even for small arguments. */ + + /* Loop invariants: + + R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1. + X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X. + R <= shiftdown_to_same_size(X). */ + + while (--i >= 0) + { + mp_limb_t cy; +#ifdef SQRT_DEBUG + mp_limb_t old_least_sign_r = rp[0]; + mp_size_t old_rsize = rsize; + + printf ("R = "); + mpn_dump (rp, rsize); +#endif + tsize = tsizes[i]; + + /* Need to copy the numerator into temporary space, as + mpn_divmod overwrites its numerator argument with the + remainder (which we currently ignore). */ + MPN_COPY (ttp, t_end_ptr - tsize, tsize); + cy = mpn_divmod (xp, ttp, tsize, rp, rsize); + xsize = tsize - rsize; + +#ifdef SQRT_DEBUG + printf ("X =%d ", cy); + mpn_dump (xp, xsize); +#endif + + /* Add X and R with the most significant limbs aligned, + temporarily ignoring at least one limb at the low end of X. */ + tmp = xsize - rsize; + cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize); + + /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get + intermediate roots that'd need an extra bit. We don't want to + handle that since it would make the subsequent divisor + non-normalized, so round such roots down to be only ones in the + current precision. */ + if (cy == 2) + { + mp_size_t j; + for (j = xsize; j >= 0; j--) + xp[j] = ~(mp_limb_t)0; + } + + /* Divide X by 2 and put the result in R. This is the new + approximation. Shift in the carry from the addition. */ + mpn_rshift (rp, xp, xsize, 1); + rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)); + rsize = xsize; +#ifdef SQRT_DEBUG + if (old_least_sign_r != rp[rsize - old_rsize]) + printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n", + i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r, + 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]); +#endif + } + } + +#ifdef SQRT_DEBUG + printf ("(final) R = "); + mpn_dump (rp, rsize); +#endif + + /* We computed the square root of OP * 2**(2*floor(cnt/2)). + This has resulted in R being 2**floor(cnt/2) to large. + Shift it down here to fix that. */ + if (cnt / 2 != 0) + { + mpn_rshift (rp, rp, rsize, cnt/2); + rsize -= rp[rsize - 1] == 0; + } + + /* Calculate the remainder. */ + mpn_mul_n (tp, rp, rp, rsize); + tsize = rsize + rsize; + tsize -= tp[tsize - 1] == 0; + if (op_size < tsize + || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0)) + { + /* R is too large. Decrement it. */ + + /* These operations can't overflow. 
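+
+     (Reviewer's note, not in the original source: at this point R is at
+     most one too large, so the square actually wanted is (R-1)^2 =
+     R^2 - 2*R + 1.  The code below rewrites the candidate T = R^2 into
+     exactly that: it subtracts R twice, propagating the borrow into the
+     high limbs with mpn_decr_u, adds back 1, and then decrements R
+     itself.)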
*/ + cy_limb = mpn_sub_n (tp, tp, rp, rsize); + cy_limb += mpn_sub_n (tp, tp, rp, rsize); + mpn_decr_u (tp + rsize, cy_limb); + mpn_incr_u (tp, (mp_limb_t) 1); + + mpn_decr_u (rp, (mp_limb_t) 1); + +#ifdef SQRT_DEBUG + printf ("(adjusted) R = "); + mpn_dump (rp, rsize); +#endif + } + + if (rem_ptr != NULL) + { + cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize); + MPN_NORMALIZE (rem_ptr, op_size); + TMP_FREE (marker); + return op_size; + } + else + { + int res; + res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size); + TMP_FREE (marker); + return res; + } +} diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c new file mode 100644 index 0000000000..4f2f06099c --- /dev/null +++ b/rts/gmp/mpn/generic/sub_n.c @@ -0,0 +1,62 @@ +/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" + +mp_limb_t +#if __STDC__ +mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size) +#else +mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + register mp_srcptr s2_ptr; + mp_size_t size; +#endif +{ + register mp_limb_t x, y, cy; + register mp_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do + { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to subtrahend */ + cy = (y < cy); /* get out carry from that addition */ + y = x - y; /* main subtract */ + cy = (y > x) + cy; /* get out carry from the subtract, combine */ + res_ptr[j] = y; + } + while (++j != 0); + + return cy; +} diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c new file mode 100644 index 0000000000..c7c08ee4af --- /dev/null +++ b/rts/gmp/mpn/generic/submul_1.c @@ -0,0 +1,65 @@ +/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR + by S2_LIMB, subtract the S1_SIZE least significant limbs of the product + from the limb vector pointed to by RES_PTR. Return the most significant + limb of the product, adjusted for carry-out from the subtraction. + +Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb) + register mp_ptr res_ptr; + register mp_srcptr s1_ptr; + mp_size_t s1_size; + register mp_limb_t s2_limb; +{ + register mp_limb_t cy_limb; + register mp_size_t j; + register mp_limb_t prod_high, prod_low; + register mp_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do + { + umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb) + prod_high; + + x = res_ptr[j]; + prod_low = x - prod_low; + cy_limb += (prod_low > x); + res_ptr[j] = prod_low; + } + while (++j != 0); + + return cy_limb; +} diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c new file mode 100644 index 0000000000..b748b5d810 --- /dev/null +++ b/rts/gmp/mpn/generic/tdiv_qr.c @@ -0,0 +1,401 @@ +/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and + write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If + qxn is non-zero, generate that many fraction limbs and append them after the + other quotient limbs, and update the remainder accordningly. The input + operands are unaffected. + + Preconditions: + 1. The most significant limb of of the divisor must be non-zero. + 2. No argument overlap is permitted. (??? relax this ???) + 3. nn >= dn, even if qxn is non-zero. (??? relax this ???) + + The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time + complexity of multiplication. + +Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD) +#endif + +/* Extract the middle limb from ((h,,l) << cnt) */ +#define SHL(h,l,cnt) \ + ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1)))) + +void +#if __STDC__ +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +#else +mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn) + mp_ptr qp; + mp_ptr rp; + mp_size_t qxn; + mp_srcptr np; + mp_size_t nn; + mp_srcptr dp; + mp_size_t dn; +#endif +{ + /* FIXME: + 1. qxn + 2. pass allocated storage in additional parameter? + */ + if (qxn != 0) + abort (); + + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + + case 1: + { + rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]); + return; + } + + case 2: + { + int cnt; + mp_ptr n2p, d2p; + mp_limb_t qhl, cy; + TMP_DECL (marker); + TMP_MARK (marker); + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */ + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + default: + { + int adjust; + TMP_DECL (marker); + TMP_MARK (marker); + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + count_leading_zeros (cnt, dp[dn - 1]); + + qp[nn - dn] = 0; /* zero high quotient limb */ + if (cnt != 0) /* normalize divisor if needed */ + { + d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + mpn_lshift (d2p, dp, dn, cnt); + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + d2p = (mp_ptr) dp; + n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else if (dn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + else + { + /* Perform 2*dn / dn limb divisions as long as the limbs + in np last. */ + mp_ptr q2p = qp + nn - 2 * dn; + n2p += nn - 2 * dn; + mpn_bz_divrem_n (q2p, n2p, d2p, dn); + nn -= dn; + while (nn >= 2 * dn) + { + mp_limb_t c; + q2p -= dn; n2p -= dn; + c = mpn_bz_divrem_n (q2p, n2p, d2p, dn); + ASSERT_ALWAYS (c == 0); + nn -= dn; + } + + if (nn != dn) + { + n2p -= nn - dn; + /* In theory, we could fall out to the cute code below + since we now have exactly the situation that code + is designed to handle. We botch this badly and call + the basic mpn_sb_divrem_mn! */ + if (dn == 2) + mpn_divrem_2 (qp, 0L, n2p, nn, d2p); + else + mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn); + } + } + + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE (marker); + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. 
*/ + + { + /* Problem: + + Divide a numerator N with nn limbs by a denominator D with dn + limbs forming a quotient of nn-dn+1 limbs. When qn is small + compared to dn, conventional division algorithms perform poorly. + We want an algorithm that has an expected running time that is + dependent only on qn. It is assumed that the most significant + limb of the numerator is smaller than the most significant limb + of the denominator. + + Algorithm (very informally stated): + + 1) Divide the 2 x qn most significant limbs from the numerator + by the qn most significant limbs from the denominator. Call + the result qest. This is either the correct quotient, but + might be 1 or 2 too large. Compute the remainder from the + division. (This step is implemented by a mpn_divrem call.) + + 2) Is the most significant limb from the remainder < p, where p + is the product of the most significant limb from the quotient + and the next(d). (Next(d) denotes the next ignored limb from + the denominator.) If it is, decrement qest, and adjust the + remainder accordingly. + + 3) Is the remainder >= qest? If it is, qest is the desired + quotient. The algorithm terminates. + + 4) Subtract qest x next(d) from the remainder. If there is + borrow out, decrement qest, and adjust the remainder + accordingly. + + 5) Skip one word from the denominator (i.e., let next(d) denote + the next less significant limb. */ + + mp_size_t qn; + mp_ptr n2p, d2p; + mp_ptr tp; + mp_limb_t cy; + mp_size_t in, rn; + mp_limb_t quotient_too_large; + int cnt; + + qn = nn - dn; + qp[qn] = 0; /* zero high quotient limb */ + qn += adjust; /* qn cannot become bigger */ + + if (qn == 0) + { + MPN_COPY (rp, np, dn); + TMP_FREE (marker); + return; + } + + in = dn - qn; /* (at least partially) ignored # of limbs in ops */ + /* Normalize denominator by shifting it to the left such that its + most significant bit is set. Then shift the numerator the same + amount, to mathematically preserve quotient. */ + count_leading_zeros (cnt, dp[dn - 1]); + if (cnt != 0) + { + d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB); + + mpn_lshift (d2p, dp + in, qn, cnt); + d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt); + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt); + if (adjust) + { + n2p[2 * qn] = cy; + n2p++; + } + else + { + n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt); + } + } + else + { + d2p = (mp_ptr) dp + in; + + n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB); + MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn); + if (adjust) + { + n2p[2 * qn] = 0; + n2p++; + } + } + + /* Get an approximate quotient using the extracted operands. */ + if (qn == 1) + { + mp_limb_t q0, r0; + mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0; + /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some + temps here. This doesn't hurt code quality on any machines + so we do it unconditionally. */ + gcc272bug_n1 = n2p[1]; + gcc272bug_n0 = n2p[0]; + gcc272bug_d0 = d2p[0]; + udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0); + n2p[0] = r0; + qp[0] = q0; + } + else if (qn == 2) + mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); + else if (qn < BZ_THRESHOLD) + mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn); + else + mpn_bz_divrem_n (qp, n2p, d2p, qn); + + rn = qn; + /* Multiply the first ignored divisor limb by the most significant + quotient limb. If that product is > the partial remainder's + most significant limb, we know the quotient is too large. 
This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. */ + { + mp_limb_t dl, x; + mp_limb_t h, l; + + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + + x = SHL (dp[in - 1], dl, cnt); + umul_ppmm (h, l, x, qp[qn - 1]); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt); + n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt)); + if (qn != rn) + { + if (n2p[qn] < cy2) + abort (); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up. */ + + tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB); + + if (in < qn) + { + if (in == 0) + { + MPN_COPY (rp, n2p, rn); + if (rn != dn) + abort (); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE (marker); + return; + } + } +} diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c new file mode 100644 index 0000000000..061cce86e1 --- /dev/null +++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c @@ -0,0 +1,131 @@ +/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + division. + + Contributed by Peter L. Montgomery. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_udiv_w_sdiv (rp, a1, a0, d) + mp_limb_t *rp, a1, a0, d; +{ + mp_limb_t q, r; + mp_limb_t c0, c1, b1; + + if ((mp_limb_signed_t) d >= 0) + { + if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1))) + { + /* dividend, divisor, and quotient are nonnegative */ + sdiv_qrnnd (q, r, a1, a0, d); + } + else + { + /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */ + sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1)); + /* Divide (c1*2^32 + c0) by d */ + sdiv_qrnnd (q, r, c1, c0, d); + /* Add 2^31 to quotient */ + q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } + else + { + b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */ + c1 = a1 >> 1; /* A/2 */ + c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1); + + if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */ + { + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */ + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */ + { + c1 = (b1 - 1) - c1; + c0 = ~c0; /* logical NOT */ + + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + q = ~q; /* (A/2)/b1 */ + r = (b1 - 1) - r; + + r = 2*r + (a0 & 1); /* A/(2*b1) */ + + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else /* Implies c1 = b1 */ + { /* Hence a1 = d - 1 = 2*b1 - 1 */ + if (a0 >= -d) + { + q = -1; + r = a0 + d; + } + else + { + q = -2; + r = a0 + 2*d; + } + } + } + + *rp = r; + return q; +} diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README new file mode 100644 index 0000000000..97e7abe011 --- /dev/null +++ b/rts/gmp/mpn/hppa/README @@ -0,0 +1,91 @@ +This directory contains mpn functions for various HP PA-RISC chips. Code +that runs faster on the PA7100 and later implementations, is in the pa7100 +directory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On the PA7000 no memory instructions can issue the two cycles after a store. +For the PA7100, this is reduced to one cycle. + +The PA7100 has a lookup-free cache, so it helps to schedule loads and the +dependent instruction really far from each other. + +STATUS + +1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the + instructions below (but some sw pipelining is needed to avoid the + xmpyu-fstds delay): + + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + + addc + stws res_ptr + addc + stws res_ptr + + addib Loop + +2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb + (asymptotically) on the PA7100, using the instructions below. With proper + sw pipelining and the unrolling level below, the speed becomes 8 + cycles/limb. + + fldds s1_ptr + fldds s1_ptr + + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + xmpyu + fstds N(%r30) + + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + ldws N(%r30) + addc + addc + addc + addc + addc %r0,%r0,cy-limb + + ldws res_ptr + ldws res_ptr + ldws res_ptr + ldws res_ptr + add + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + addc + stws res_ptr + + addib + +3. For the PA8000 we have to stick to using 32-bit limbs before compiler + support emerges. 
But we want to use 64-bit operations whenever possible, + in particular for loads and stores. It is possible to handle mpn_add_n + efficiently by rotating (when s1/s2 are aligned), masking+bit field + inserting when (they are not). The speed should double compared to the + code used today. diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s new file mode 100644 index 0000000000..c53b2f71b3 --- /dev/null +++ b/rts/gmp/mpn/hppa/add_n.s @@ -0,0 +1,58 @@ +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h new file mode 100644 index 0000000000..98b6d9ce3c --- /dev/null +++ b/rts/gmp/mpn/hppa/gmp-mparam.h @@ -0,0 +1,63 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the PA7100 using GCC. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 172 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 185 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 96 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 122 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 18 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 33 +#endif diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s new file mode 100644 index 0000000000..c7d218f922 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s @@ -0,0 +1,102 @@ +; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 11 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 10 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + + .code + .export __gmpn_addmul_1 +__gmpn_addmul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s new file mode 100644 index 0000000000..4512fddec9 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s @@ -0,0 +1,98 @@ +; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +; not become faster due to data cache contention after a store. On the +; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since +; only the xmpyu does not need the integer pipeline, so the only dual-issue +; we will get are addc+xmpyu. Unrolling would not help either CPU. + +; We could use fldds to read two limbs at a time from the S1 array, and that +; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +; PA7100, respectively. We don't do that since it does not seem worth the +; (alignment) troubles... + +; At least the PA7100 is rumored to be able to deal with cache-misses +; without stalling instruction issue. If this is true, and the cache is +; actually also lockup-free, we should use a deeper software pipeline, and +; load from S1 very early! (The loads and stores to -12(sp) will surely be +; in the cache.) + + .code + .export __gmpn_mul_1 +__gmpn_mul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s new file mode 100644 index 0000000000..4f4be08b37 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s @@ -0,0 +1,75 @@ +; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addc %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + addc %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S new file mode 100644 index 0000000000..04db06822e --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S @@ -0,0 +1,189 @@ +; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __gmpn_addmul_1 +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + add s0,lo0,s0 + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + add s0,lo0,s0 + fstds %fr6,-8(%r31) + addc s1,lo1,s1 + fstds %fr9,0(%r31) + addc s2,lo2,s2 + fstds %fr10,8(%r31) + addc s3,lo3,s3 + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addc s1,lo1,s1 + stws,ma 
s1,4(res_ptr) + addc s2,lo2,s2 + stws,ma s2,4(res_ptr) + addc s3,lo3,s3 + stws,ma s3,4(res_ptr) + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + add s0,lo0,s0 + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s new file mode 100644 index 0000000000..31669b1a55 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s @@ -0,0 +1,83 @@ +; HP-PA __gmpn_lshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_lshift +__gmpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + vshd %r22,%r29,%r20 + ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r22,%r29,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r22,%r29,%r20 + +L$end1 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$end2 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s new file mode 100644 index 0000000000..d32b10b4b1 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s @@ -0,0 +1,80 @@ +; HP-PA __gmpn_rshift -- +; This is optimized for the PA7100, where is runs at 3.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_rshift +__gmpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,<= -5,%r24,L$rest + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + vshd %r29,%r22,%r20 + ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -4,%r24,L$loop + vshd %r29,%r22,%r20 + +L$rest addib,= 4,%r24,L$end1 + nop +L$eloop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,<= -1,%r24,L$end2 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,> -1,%r24,L$eloop + vshd %r29,%r22,%r20 + +L$end1 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$end2 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s new file mode 100644 index 0000000000..0eec41c4b3 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s @@ -0,0 +1,76 @@ +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. +; This is optimized for the PA7100, where is runs at 4.25 cycles/limb + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
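+; Structure of the code below: the main L$loop handles four limbs per
+; iteration, mirroring pa7100/add_n.s above, and L$rest/L$eloop finish the
+; remaining limbs one at a time.  The closing addc %r0,%r0,%r28 /
+; subi 1,%r28,%r28 pair converts PA-RISC's "carry set means no borrow"
+; flag into the 0-or-1 borrow that mpn_sub_n is defined to return.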
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,<= -5,%r23,L$rest + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + subb %r20,%r19,%r28 + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -4,%r23,L$loop + subb %r20,%r19,%r28 + +L$rest addib,= 4,%r23,L$end + nop +L$eloop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,> -1,%r23,L$eloop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S new file mode 100644 index 0000000000..0fba21dcef --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S @@ -0,0 +1,195 @@ +; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
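+; Note on borrow handling below: the low halves of the products are
+; subtracted from the destination limbs with sub/subb, and on PA-RISC the
+; carry bit after a subtract means "no borrow".  The subb %r0,%r0,lo0 /
+; add lo0,lo0,%r0 pair in the loop (commented "just invert cy") therefore
+; recreates the true borrow so that the following addc chain can fold it
+; into the accumulation of the next group of product limbs.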
+ +; INPUT PARAMETERS +#define res_ptr %r26 +#define s1_ptr %r25 +#define size %r24 +#define s2_limb %r23 + +#define cylimb %r28 +#define s0 %r19 +#define s1 %r20 +#define s2 %r3 +#define s3 %r4 +#define lo0 %r21 +#define lo1 %r5 +#define lo2 %r6 +#define lo3 %r7 +#define hi0 %r22 +#define hi1 %r23 /* safe to reuse */ +#define hi2 %r29 +#define hi3 %r1 + + .code + .export __gmpn_submul_1 +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + + ldo 128(%r30),%r30 + stws s2_limb,-16(%r30) + add %r0,%r0,cylimb ; clear cy and cylimb + addib,< -4,size,L$few_limbs + fldws -16(%r30),%fr31R + + ldo -112(%r30),%r31 + stw %r3,-96(%r30) + stw %r4,-92(%r30) + stw %r5,-88(%r30) + stw %r6,-84(%r30) + stw %r7,-80(%r30) + + bb,>=,n s1_ptr,29,L$0 + + fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r31) + ldws -16(%r31),cylimb + ldws -12(%r31),lo0 + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + addib,< -1,size,L$few_limbs + stws,ma s0,4(res_ptr) + +; start software pipeline ---------------------------------------------------- +L$0 fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + xmpyu %fr4L,%fr31R,%fr5 + xmpyu %fr4R,%fr31R,%fr6 + xmpyu %fr8L,%fr31R,%fr9 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + fstds %fr6,-8(%r31) + fstds %fr9,0(%r31) + fstds %fr10,8(%r31) + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + addc lo1,hi0,lo1 + addc lo2,hi1,lo2 + addc lo3,hi2,lo3 + + addib,< -4,size,L$end + addc %r0,hi3,cylimb ; propagate carry into cylimb +; main loop ------------------------------------------------------------------ +L$loop fldds,ma 8(s1_ptr),%fr4 + fldds,ma 8(s1_ptr),%fr8 + + ldws 0(res_ptr),s0 + xmpyu %fr4L,%fr31R,%fr5 + ldws 4(res_ptr),s1 + xmpyu %fr4R,%fr31R,%fr6 + ldws 8(res_ptr),s2 + xmpyu %fr8L,%fr31R,%fr9 + ldws 12(res_ptr),s3 + xmpyu %fr8R,%fr31R,%fr10 + + fstds %fr5,-16(%r31) + sub s0,lo0,s0 + fstds %fr6,-8(%r31) + subb s1,lo1,s1 + fstds %fr9,0(%r31) + subb s2,lo2,s2 + fstds %fr10,8(%r31) + subb s3,lo3,s3 + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... just invert cy + + ldws -16(%r31),hi0 + ldws -12(%r31),lo0 + ldws -8(%r31),hi1 + ldws -4(%r31),lo1 + ldws 0(%r31),hi2 + ldws 4(%r31),lo2 + ldws 8(%r31),hi3 + ldws 12(%r31),lo3 + + addc lo0,cylimb,lo0 + stws,ma s0,4(res_ptr) + addc lo1,hi0,lo1 + stws,ma s1,4(res_ptr) + addc lo2,hi1,lo2 + stws,ma s2,4(res_ptr) + addc lo3,hi2,lo3 + stws,ma s3,4(res_ptr) + + addib,>= -4,size,L$loop + addc %r0,hi3,cylimb ; propagate carry into cylimb +; finish software pipeline --------------------------------------------------- +L$end ldws 0(res_ptr),s0 + ldws 4(res_ptr),s1 + ldws 8(res_ptr),s2 + ldws 12(res_ptr),s3 + + sub s0,lo0,s0 + stws,ma s0,4(res_ptr) + subb s1,lo1,s1 + stws,ma s1,4(res_ptr) + subb s2,lo2,s2 + stws,ma s2,4(res_ptr) + subb s3,lo3,s3 + stws,ma s3,4(res_ptr) + subb %r0,%r0,lo0 ; these two insns ... + add lo0,lo0,%r0 ; ... 
invert cy + +; restore callee-saves registers --------------------------------------------- + ldw -96(%r30),%r3 + ldw -92(%r30),%r4 + ldw -88(%r30),%r5 + ldw -84(%r30),%r6 + ldw -80(%r30),%r7 + +L$few_limbs + addib,=,n 4,size,L$ret +L$loop2 fldws,ma 4(s1_ptr),%fr4 + ldws 0(res_ptr),s0 + xmpyu %fr4,%fr31R,%fr5 + fstds %fr5,-16(%r30) + ldws -16(%r30),hi0 + ldws -12(%r30),lo0 + addc lo0,cylimb,lo0 + addc %r0,hi0,cylimb + sub s0,lo0,s0 + add s0,lo0,%r0 ; invert cy + stws,ma s0,4(res_ptr) + addib,<> -1,size,L$loop2 + nop + +L$ret addc %r0,cylimb,cylimb + bv 0(%r2) + ldo -128(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s new file mode 100644 index 0000000000..20a5b5ce0a --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s @@ -0,0 +1,111 @@ +; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 12 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 11 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + +; It seems possible to make this run as fast as __gmpn_addmul_1, if we use +; sub,>>= %r29,%r19,%r22 +; addi 1,%r28,%r28 +; but that requires reworking the hairy software pipeline... + + .code + .export __gmpn_submul_1 +__gmpn_submul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... 
into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S new file mode 100644 index 0000000000..b83d6f4dd2 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S @@ -0,0 +1,80 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
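+; For reference, the operation implemented here (and by the other
+; udiv_qrnnd variants in this tree) can be sketched in C roughly as
+; follows, assuming 32-bit limbs and an available 64-bit integer type;
+; the name ref_udiv_qrnnd is illustrative only:
+;
+;   mp_limb_t
+;   ref_udiv_qrnnd (mp_limb_t *rem_ptr, mp_limb_t n1, mp_limb_t n0, mp_limb_t d)
+;   {
+;     /* assumes n1 < d, so that the quotient fits in one limb */
+;     unsigned long long n = ((unsigned long long) n1 << 32) | n0;
+;     *rem_ptr = (mp_limb_t) (n % d);
+;     return (mp_limb_t) (n / d);
+;   }
+;
+; The code below obtains the same result with double-precision floating-point
+; division (fcnvxf/fdiv/fcnvfx) followed by an integer adjustment of the
+; quotient and remainder.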
+ + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 ; 2^64 + .word 0x0 + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo +#ifdef PIC + addil LT%L$0000,%r19 + ldo RT%L$0000(%r1),%r19 +#else + ldil L%L$0000,%r19 + ldo R%L$0000(%r19),%r19 +#endif + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s new file mode 100644 index 0000000000..1f1300ac9b --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s @@ -0,0 +1,42 @@ +; Copyright (C) 1999 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + .code + .export __umul_ppmm + .align 4 +__umul_ppmm + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + stw %r25,-16(0,%r30) + fldws -16(0,%r30),%fr22R + stw %r24,-16(0,%r30) + fldws -16(0,%r30),%fr22L + xmpyu %fr22R,%fr22L,%fr22 + fstds %fr22,-16(0,%r30) + ldw -16(0,%r30),%r28 + ldw -12(0,%r30),%r29 + stw %r29,0(0,%r26) + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s new file mode 100644 index 0000000000..6e97278a39 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0 +; and store sum in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. 
+ +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .code + .export __gmpn_add_n +__gmpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + addc %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + addc %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + bv (%r2) + .exit + addc %r0,%r0,%r28 + .procend diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s new file mode 100644 index 0000000000..7d9b50fc27 --- /dev/null +++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s @@ -0,0 +1,88 @@ +; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same +; length > 0 and store difference in a third limb vector. + +; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
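+; As in __gmpn_add_n above, arbitrary sizes are handled by branching into
+; the middle of an eight-way unrolled loop: the code computes (-size) mod 8,
+; backs the three pointers up by that many limbs, and uses blr to enter the
+; loop at the matching label, so the first pass through the body processes
+; size mod 8 limbs (a full eight when size is a multiple of eight) and every
+; later pass processes eight.  This is the assembly analogue of the C
+; "Duff's device" switch-into-a-loop idiom.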
+ + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + sub %r0,%r23,%r22 + zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldw 0(%r25),%r20 + ldw 0(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,0(%r26) +L$7 ldw 4(%r25),%r21 + ldw 4(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,4(%r26) +L$6 ldw 8(%r25),%r20 + ldw 8(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,8(%r26) +L$5 ldw 12(%r25),%r21 + ldw 12(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,12(%r26) +L$4 ldw 16(%r25),%r20 + ldw 16(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,16(%r26) +L$3 ldw 20(%r25),%r21 + ldw 20(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,20(%r26) +L$2 ldw 24(%r25),%r20 + ldw 24(%r24),%r31 + subb %r20,%r31,%r20 + stw %r20,24(%r26) +L$1 ldw 28(%r25),%r21 + ldo 32(%r25),%r25 + ldw 28(%r24),%r19 + subb %r21,%r19,%r21 + stw %r21,28(%r26) + ldo 32(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 32(%r26),%r26 + + addc %r0,%r0,%r28 + bv (%r2) + .exit + subi 1,%r28,%r28 + .procend diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s new file mode 100644 index 0000000000..f5a2daad60 --- /dev/null +++ b/rts/gmp/mpn/hppa/lshift.s @@ -0,0 +1,66 @@ +; HP-PA __gmpn_lshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_lshift +__gmpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r22,%r29,%r20 + +L$0002 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$0003 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s new file mode 100644 index 0000000000..e05e2f10b5 --- /dev/null +++ b/rts/gmp/mpn/hppa/rshift.s @@ -0,0 +1,63 @@ +; HP-PA __gmpn_rshift -- + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __gmpn_rshift +__gmpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r29,%r22,%r20 + +L$0002 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$0003 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s new file mode 100644 index 0000000000..8f770ad1ad --- /dev/null +++ b/rts/gmp/mpn/hppa/sub_n.s @@ -0,0 +1,59 @@ +; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. 
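
; Editor's note, not part of the original source: on PA-RISC a subtract
; sets the carry bit when *no* borrow occurs, so the exit sequence below
; (addc %r0,%r0,%r28 followed by subi 1,%r28,%r28) captures that bit and
; returns 1 - carry, i.e. the conventional mpn borrow result: 1 if the
; subtraction underflowed, 0 otherwise.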
+ + .code + .export __gmpn_sub_n +__gmpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s new file mode 100644 index 0000000000..9aa3b8a830 --- /dev/null +++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s @@ -0,0 +1,286 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on pre-PA7000 CPUs. + +; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + +; The code size is a bit excessive. We could merge the last two ds;addc +; sequences by simply moving the "bb,< Odd" instruction down. The only +; trouble is the FFFFFFFF code that would need some hacking. 
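
; Editor's note (illustrative sketch only, not part of the original GMP
; source): the routine below is assumed to follow the usual udiv_qrnnd
; contract from longlong.h, dividing the two-limb value n1*2^32 + n0 by
; d, with n1 < d assumed so the quotient fits in one limb:
;
;     q        = (n1*2^32 + n0) / d      returned in gr28
;     *rem_ptr = (n1*2^32 + n0) % d      stored through gr26
;
; The long runs of "ds" (divide step) below produce one quotient bit per
; instruction, since integer division here is built from PA-RISC
; divide-step primitives rather than a single divide instruction.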
+ + .code + .export __gmpn_udiv_qrnnd +__gmpn_udiv_qrnnd + .proc + .callinfo frame=0,no_calls + .entry + + comb,< %r23,0,L$largedivisor + sub %r0,%r23,%r1 ; clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + +L$largedivisor + extru %r24,31,1,%r19 ; r19 = n0 & 1 + bb,< %r23,31,L$odd + extru %r23,30,31,%r22 ; r22 = d >> 1 + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + +L$odd addib,sv,n 1,%r22,L$FF.. 
; r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +; This is just a special case of the code above. +; We come here when d == 0xFFFFFFFF +L$FF.. add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 + + .exit + .procend diff --git a/rts/gmp/mpn/i960/README b/rts/gmp/mpn/i960/README new file mode 100644 index 0000000000..d68a0a83eb --- /dev/null +++ b/rts/gmp/mpn/i960/README @@ -0,0 +1,9 @@ +This directory contains mpn functions for Intel i960 processors. + +RELEVANT OPTIMIZATION ISSUES + +The code in this directory is not well optimized. + +STATUS + +The code in this directory has not been tested. diff --git a/rts/gmp/mpn/i960/add_n.s b/rts/gmp/mpn/i960/add_n.s new file mode 100644 index 0000000000..387317a397 --- /dev/null +++ b/rts/gmp/mpn/i960/add_n.s @@ -0,0 +1,43 @@ +# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_add_n +___gmpn_add_n: + mov 0,g6 # clear carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + addc g4,g5,g4 # main add + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, clears C bit + + mov g6,g0 + ret diff --git a/rts/gmp/mpn/i960/addmul_1.s b/rts/gmp/mpn/i960/addmul_1.s new file mode 100644 index 0000000000..7df1418356 --- /dev/null +++ b/rts/gmp/mpn/i960/addmul_1.s @@ -0,0 +1,48 @@ +# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_mul_1 +___gmpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + ld (g13)[g2*4],g5 + + addc g0,g6,g6 # relies on that C bit is clear + addc 0,g7,g7 + addc g5,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/rts/gmp/mpn/i960/mul_1.s b/rts/gmp/mpn/i960/mul_1.s new file mode 100644 index 0000000000..5c0c985aa5 --- /dev/null +++ b/rts/gmp/mpn/i960/mul_1.s @@ -0,0 +1,45 @@ +# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
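
# Editor's note (illustrative sketch only, not part of the original GMP
# source): in C terms the routine below is assumed to compute
#
#     mp_limb_t carry = 0;
#     for (i = 0; i < size; i++)
#       {
#         /* 32x32 -> 64 bit product of one source limb and s2_limb */
#         unsigned long long p = (unsigned long long) s1_ptr[i] * s2_limb
#                                + carry;
#         res_ptr[i] = (mp_limb_t) p;           /* low half stored   */
#         carry      = (mp_limb_t) (p >> 32);   /* high half carried */
#       }
#     return carry;
#
# The emul instruction supplies the 64-bit product directly, low half in
# g6 and high half in g7.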
+ +.text + .align 4 + .globl ___gmpn_mul_1 +___gmpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + + addc g0,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/rts/gmp/mpn/i960/sub_n.s b/rts/gmp/mpn/i960/sub_n.s new file mode 100644 index 0000000000..2db2d46aad --- /dev/null +++ b/rts/gmp/mpn/i960/sub_n.s @@ -0,0 +1,43 @@ +# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 4 + .globl ___gmpn_sub_n +___gmpn_sub_n: + mov 1,g6 # set carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + subc g4,g5,g4 # main subtract + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, cy will be 0 + + mov g6,g0 + ret diff --git a/rts/gmp/mpn/lisp/gmpasm-mode.el b/rts/gmp/mpn/lisp/gmpasm-mode.el new file mode 100644 index 0000000000..5d9da7fa1f --- /dev/null +++ b/rts/gmp/mpn/lisp/gmpasm-mode.el @@ -0,0 +1,351 @@ +;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode. + + +;; Copyright (C) 1999, 2000 Free Software Foundation, Inc. +;; +;; This file is part of the GNU MP Library. +;; +;; The GNU MP Library is free software; you can redistribute it and/or modify +;; it under the terms of the GNU Lesser General Public License as published by +;; the Free Software Foundation; either version 2.1 of the License, or (at your +;; option) any later version. +;; +;; The GNU MP Library is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +;; License for more details. +;; +;; You should have received a copy of the GNU Lesser General Public License +;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +;; MA 02111-1307, USA. + + +;;; Commentary: +;; +;; gmpasm-mode is an editing mode for m4 processed assembler code and m4 +;; macro files in GMP. It's similar to m4-mode, but has a number of +;; settings better suited to GMP. 
+;; +;; +;; Install +;; ------- +;; +;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the +;; load-path and the following in .emacs +;; +;; (autoload 'gmpasm-mode "gmpasm-mode" nil t) +;; +;; To use gmpasm-mode automatically on all .asm and .m4 files, put the +;; following in .emacs +;; +;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode)) +;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode)) +;; +;; To have gmpasm-mode only on gmp files, try instead something like the +;; following, which uses it only in a directory starting with "gmp", or a +;; sub-directory of such. +;; +;; (add-to-list 'auto-mode-alist +;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode)) +;; +;; Byte compiling will slightly speed up loading. If you want a docstring +;; in the autoload you can use M-x update-file-autoloads if you set it up +;; right. +;; +;; +;; Emacsen +;; ------- +;; +;; FSF Emacs 20.x - gmpasm-mode is designed for this. +;; XEmacs 20.x - seems to work. +;; +;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are +;; available. comment-region with "C" won't really do the right thing +;; though. + + +;;; Code: + +(defgroup gmpasm nil + "GNU MP m4 and asm editing." + :prefix "gmpasm-" + :group 'languages) + +(defcustom gmpasm-mode-hook nil + "*Hook called by `gmpasm-mode'." + :type 'hook + :group 'gmpasm) + +(defcustom gmpasm-comment-start-regexp "[#;!@C]" + "*Regexp matching possible comment styles. +See `gmpasm-mode' docstring for how this is used." + :type 'regexp + :group 'gmpasm) + + +(defun gmpasm-add-to-list-second (list-var element) + "(gmpasm-add-to-list-second LIST-VAR ELEMENT) + +Add ELEMENT to LIST-VAR as the second element in the list, if it isn't +already in the list. If LIST-VAR is nil, then ELEMENT is just added as the +sole element in the list. + +This is like `add-to-list', but it puts the new value second in the list. + +The first cons cell is copied rather than changed in-place, so references to +the list elsewhere won't be affected." + + (if (member element (symbol-value list-var)) + (symbol-value list-var) + (set list-var + (if (symbol-value list-var) + (cons (car (symbol-value list-var)) + (cons element + (cdr (symbol-value list-var)))) + (list element))))) + + +(defun gmpasm-delete-from-list (list-var element) + "(gmpasm-delete-from-list LIST-VAR ELEMENT) + +Delete ELEMENT from LIST-VAR, using `delete'. +This is like `add-to-list', but the element is deleted from the list. +The list is copied rather than changed in-place, so references to it elsewhere +won't be affected." + + (set list-var (delete element (copy-sequence (symbol-value list-var))))) + + +(defvar gmpasm-mode-map + (let ((map (make-sparse-keymap))) + + ;; assembler and dnl commenting + (define-key map "\C-c\C-c" 'comment-region) + (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl) + + ;; kill an M-x compile, since it's not hard to put m4 into an infinite + ;; loop + (define-key map "\C-c\C-k" 'kill-compilation) + + map) + "Keymap for `gmpasm-mode'.") + + +(defvar gmpasm-mode-syntax-table + (let ((table (make-syntax-table))) + ;; underscore left as a symbol char, like C mode + + ;; m4 quotes + (modify-syntax-entry ?` "('" table) + (modify-syntax-entry ?' ")`" table) + + table) + "Syntax table used in `gmpasm-mode'. + +m4 ignores quote marks in # comments at the top level, but inside quotes # +isn't special and all quotes are active. There seems no easy way to express +this in the syntax table, so nothing is done for comments. 
Usually this is +best, since it picks up invalid apostrophes in comments inside quotes.") + + +(defvar gmpasm-font-lock-keywords + (eval-when-compile + (list + (cons + (concat + "\\b" + (regexp-opt + '("deflit" "defreg" "defframe" "defframe_pushl" + "define_not_for_expansion" + "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE" + "forloop" + "TEXT" "DATA" "ALIGN" "W32" + "builtin" "changecom" "changequote" "changeword" "debugfile" + "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef" + "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef" + "ifelse" "include" "incr" "index" "indir" "len" "__line__" + "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef" + "regexp" "shift" "sinclude" "substr" "syscmd" "sysval" + "traceoff" "traceon" "translit" "undefine" "undivert" "unix") + t) + "\\b") 'font-lock-keyword-face))) + + "`font-lock-keywords' for `gmpasm-mode'. + +The keywords are m4 builtins and some of the GMP macros used in asm files. +L and LF don't look good fontified, so they're omitted. + +The right assembler comment regexp is added dynamically buffer-local (with +dnl too).") + + +;; Initialized if gmpasm-mode finds filladapt loaded. +(defvar gmpasm-filladapt-token-table nil + "Filladapt token table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-match-table nil + "Filladapt token match table used in `gmpasm-mode'.") +(defvar gmpasm-filladapt-token-conversion-table nil + "Filladapt token conversion table used in `gmpasm-mode'.") + + +;;;###autoload +(defun gmpasm-mode () + "A major mode for editing GNU MP asm and m4 files. + +\\{gmpasm-mode-map} +`comment-start' and `comment-end' are set buffer-local to assembler +commenting appropriate for the CPU by looking for something matching +`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if +there's no match (if \"#\" isn't what you want, type in a desired comment +and do \\[gmpasm-mode] to reinitialize). + +`adaptive-fill-regexp' is set buffer-local to the standard regexp with +`comment-start' and dnl added. If filladapt.el has been loaded it similarly +gets `comment-start' and dnl added as buffer-local fill prefixes. + +Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting, +and assembler commenting (based on the `comment-start' determined). + +Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so +the `C' in it is only matched as a whole word, not on something that happens +to start with `C'. Also it's only the particular `comment-start' determined +that's added for filling etc, not the whole `gmpasm-comment-start-regexp'. + +`gmpasm-mode-hook' is run after initializations are complete. +" + + (interactive) + (kill-all-local-variables) + (setq major-mode 'gmpasm-mode + mode-name "gmpasm") + (use-local-map gmpasm-mode-map) + (set-syntax-table gmpasm-mode-syntax-table) + (setq fill-column 76) + + ;; Short instructions might fit with 32, but anything with labels or + ;; expressions soon needs the comments pushed out to column 40. + (setq comment-column 40) + + ;; Don't want to find out the hard way which dumb assemblers don't like a + ;; missing final newline. + (set (make-local-variable 'require-final-newline) t) + + ;; The first match of gmpasm-comment-start-regexp at the start of a line + ;; determines comment-start, or "#" if no match. 
+ (set (make-local-variable 'comment-start) + (save-excursion + (goto-char (point-min)) + (if (re-search-forward + (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)") + nil t) + (match-string 1) + "#"))) + (set (make-local-variable 'comment-end) "") + + ;; If comment-start ends in an alphanumeric then \b is used to match it + ;; only as a separate word. The test is for an alphanumeric rather than + ;; \w since we might try # or ! as \w characters but without wanting \b. + (let ((comment-regexp + (concat (regexp-quote comment-start) + (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b")))) + + ;; Whitespace is required before a comment-start so m4 $# doesn't match + ;; when comment-start is "#". + ;; Only spaces or tabs match after, so newline isn't included in the + ;; font lock below. + (set (make-local-variable 'comment-start-skip) + (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*")) + + ;; Comment fontification based on comment-start, matching through to the + ;; end of the line. + (add-to-list (make-local-variable 'gmpasm-font-lock-keywords) + (cons (concat + "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$") + 'font-lock-comment-face)) + + (set (make-local-variable 'font-lock-defaults) + '(gmpasm-font-lock-keywords + t ; no syntactic fontification (of strings etc) + nil ; no case-fold + ((?_ . "w")) ; _ part of a word while fontifying + )) + + ;; Paragraphs are separated by blank lines, or lines with only dnl or + ;; comment-start. + (set (make-local-variable 'paragraph-separate) + (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$")) + (set (make-local-variable 'paragraph-start) + (concat "\f\\|" paragraph-separate)) + + ;; Adaptive fill gets dnl and comment-start as comment style prefixes on + ;; top of the standard regexp (which has # and ; already actually). + (set (make-local-variable 'adaptive-fill-regexp) + (concat "[ \t]*\\(\\(" + comment-regexp + "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*")) + (set (make-local-variable 'adaptive-fill-first-line-regexp) + "\\`\\([ \t]*dnl\\)?[ \t]*\\'") + + (when (fboundp 'filladapt-mode) + (when (not gmpasm-filladapt-token-table) + (setq gmpasm-filladapt-token-table + filladapt-token-table) + (setq gmpasm-filladapt-token-match-table + filladapt-token-match-table) + (setq gmpasm-filladapt-token-conversion-table + filladapt-token-conversion-table) + + ;; Numbered bullet points like "2.1" get matched at the start of a + ;; line when it's really something like "2.1 cycles/limb", so delete + ;; this from the list. The regexp for "1.", "2." etc is left + ;; though. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("[0-9]+\\(\\.[0-9]+\\)+[ \t]" + bullet)) + + ;; "%" as a comment prefix interferes with x86 register names + ;; like %eax, so delete this. + (gmpasm-delete-from-list 'gmpasm-filladapt-token-table + '("%+" postscript-comment)) + + (add-to-list 'gmpasm-filladapt-token-match-table + '(gmpasm-comment gmpasm-comment)) + (add-to-list 'gmpasm-filladapt-token-conversion-table + '(gmpasm-comment . exact)) + ) + + (set (make-local-variable 'filladapt-token-table) + gmpasm-filladapt-token-table) + (set (make-local-variable 'filladapt-token-match-table) + gmpasm-filladapt-token-match-table) + (set (make-local-variable 'filladapt-token-conversion-table) + gmpasm-filladapt-token-conversion-table) + + ;; Add dnl and comment-start as fill prefixes. + ;; Comments in filladapt.el say filladapt-token-table must begin + ;; with ("^" beginning-of-line), so put our addition second. 
+ (gmpasm-add-to-list-second 'filladapt-token-table + (list (concat "dnl[ \t]\\|" comment-regexp) + 'gmpasm-comment)) + )) + + (run-hooks 'gmpasm-mode-hook)) + + +(defun gmpasm-comment-region-dnl (beg end &optional arg) + "(gmpasm-comment-region BEG END &option ARG) + +Comment or uncomment each line in the region using `dnl'. +With \\[universal-argument] prefix arg, uncomment each line in region. +This is `comment-region', but using \"dnl\"." + + (interactive "r\nP") + (let ((comment-start "dnl") + (comment-end "")) + (comment-region beg end arg))) + + +(provide 'gmpasm-mode) + +;;; gmpasm-mode.el ends here diff --git a/rts/gmp/mpn/m68k/add_n.S b/rts/gmp/mpn/m68k/add_n.S new file mode 100644 index 0000000000..9e1d89d64f --- /dev/null +++ b/rts/gmp/mpn/m68k/add_n.S @@ -0,0 +1,79 @@ +/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 16) + size (sp + 12) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_add_n) + +C_SYMBOL_NAME(__gmpn_add_n:) +PROLOG(__gmpn_add_n) +/* Save used registers on the stack. */ + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + addxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) + +/* Restore used registers from stack frame. */ + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) + + rts +EPILOG(__gmpn_add_n) diff --git a/rts/gmp/mpn/m68k/lshift.S b/rts/gmp/mpn/m68k/lshift.S new file mode 100644 index 0000000000..a539d5d42e --- /dev/null +++ b/rts/gmp/mpn/m68k/lshift.S @@ -0,0 +1,150 @@ +/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer. + +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_lshift) + +C_SYMBOL_NAME(__gmpn_lshift:) +PROLOG(__gmpn_lshift) + +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(s_ptr),R(res_ptr) + bls L(Lspecial) /* jump if s_ptr >= res_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(s_ptr,d0,l),R(a2) +#endif + cmpl R(res_ptr),R(a2) + bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d0) + lsrl R(d5),R(d0) /* compute carry limb */ + + lsll R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_PREDEC(s_ptr),R(d2) + movel R(d2),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d2) +L(L1:) + movel MEM_PREDEC(s_ptr),R(d1) + movel R(d1),R(d3) + lsrl R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + lsll R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
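
   [Editor's note, not part of the original source: this special-case
   path is only reached when the shift count is exactly 1 and the
   overlap test near the entry allows walking the limbs from the low
   end; each addxl below adds a limb to itself with the extend (carry)
   bit, which is precisely a one-bit left shift with the carry
   propagated between limbs.]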
*/ + +L(Lspecial:) + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) +L(LL1:) + movel MEM_POSTINC(s_ptr),R(d2) + addxl R(d2),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + + dbf R(s_size),L(LLoop) + addxl R(d0),R(d0) /* save cy in lsb */ + subl #0x10000,R(s_size) + bcs L(LLend) + lsrl #1,R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__gmpn_lshift) diff --git a/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/rts/gmp/mpn/m68k/mc68020/addmul_1.S new file mode 100644 index 0000000000..6638115d71 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/addmul_1.S @@ -0,0 +1,83 @@ +/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_addmul_1) + +C_SYMBOL_NAME(__gmpn_addmul_1:) +PROLOG(__gmpn_addmul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d5),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + addl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + addl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d5) + + rts +EPILOG(__gmpn_addmul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/mul_1.S b/rts/gmp/mpn/m68k/mc68020/mul_1.S new file mode 100644 index 0000000000..fdd4c39d70 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/mul_1.S @@ -0,0 +1,90 @@ +/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_mul_1) + +C_SYMBOL_NAME(__gmpn_mul_1:) +PROLOG(__gmpn_mul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d4),MEM_PREDEC(sp) +#if 0 + movel R(d2),MEM_PREDEC(sp) + movel R(d3),MEM_PREDEC(sp) + movel R(d4),MEM_PREDEC(sp) +#endif + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,16),R(res_ptr) + movel MEM_DISP(sp,20),R(s1_ptr) + movel MEM_DISP(sp,24),R(s1_size) + movel MEM_DISP(sp,28),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + movel R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + clrl R(d3) + addxl R(d3),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d4) +#if 0 + movel MEM_POSTINC(sp),R(d4) + movel MEM_POSTINC(sp),R(d3) + movel MEM_POSTINC(sp),R(d2) +#endif + rts +EPILOG(__gmpn_mul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/submul_1.S b/rts/gmp/mpn/m68k/mc68020/submul_1.S new file mode 100644 index 0000000000..3c36b70166 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/submul_1.S @@ -0,0 +1,83 @@ +/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s1_size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_submul_1) + +C_SYMBOL_NAME(__gmpn_submul_1:) +PROLOG(__gmpn_submul_1) + +#define res_ptr a0 +#define s1_ptr a1 +#define s1_size d2 +#define s2_limb d4 + +/* Save used registers on the stack. */ + moveml R(d2)-R(d5),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,20),R(res_ptr) + movel MEM_DISP(sp,24),R(s1_ptr) + movel MEM_DISP(sp,28),R(s1_size) + movel MEM_DISP(sp,32),R(s2_limb) + + eorw #1,R(s1_size) + clrl R(d1) + clrl R(d5) + lsrl #1,R(s1_size) + bcc L(L1) + subql #1,R(s1_size) + subl R(d0),R(d0) /* (d0,cy) <= (0,0) */ + +L(Loop:) + movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d1):R(d3) + addxl R(d0),R(d3) + addxl R(d5),R(d1) + subl R(d3),MEM_POSTINC(res_ptr) +L(L1:) movel MEM_POSTINC(s1_ptr),R(d3) + mulul R(s2_limb),R(d0):R(d3) + addxl R(d1),R(d3) + addxl R(d5),R(d0) + subl R(d3),MEM_POSTINC(res_ptr) + + dbf R(s1_size),L(Loop) + addxl R(d5),R(d0) + subl #0x10000,R(s1_size) + bcc L(Loop) + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d5) + + rts +EPILOG(__gmpn_submul_1) diff --git a/rts/gmp/mpn/m68k/mc68020/udiv.S b/rts/gmp/mpn/m68k/mc68020/udiv.S new file mode 100644 index 0000000000..d00cf13558 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/udiv.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +.text + .even +.globl ___udiv_qrnnd +___udiv_qrnnd: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + divul sp@(16),d1:d0 + movel d1,a0@ + rts diff --git a/rts/gmp/mpn/m68k/mc68020/umul.S b/rts/gmp/mpn/m68k/mc68020/umul.S new file mode 100644 index 0000000000..a34ae6c543 --- /dev/null +++ b/rts/gmp/mpn/m68k/mc68020/umul.S @@ -0,0 +1,31 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
+*/ + +.text + .even +.globl ___umul_ppmm +___umul_ppmm: + movel sp@(4),a0 + movel sp@(8),d1 + movel sp@(12),d0 + mulul d0,d0:d1 + movel d1,a0@ + rts diff --git a/rts/gmp/mpn/m68k/rshift.S b/rts/gmp/mpn/m68k/rshift.S new file mode 100644 index 0000000000..b47a48e52a --- /dev/null +++ b/rts/gmp/mpn/m68k/rshift.S @@ -0,0 +1,149 @@ +/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer. + +Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + s_size (sp + 16) + cnt (sp + 12) +*/ + +#include "asm-syntax.h" + +#define res_ptr a1 +#define s_ptr a0 +#define s_size d6 +#define cnt d4 + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_rshift) + +C_SYMBOL_NAME(__gmpn_rshift:) +PROLOG(__gmpn_rshift) +/* Save used registers on the stack. */ + moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. */ + movel MEM_DISP(sp,28),R(res_ptr) + movel MEM_DISP(sp,32),R(s_ptr) + movel MEM_DISP(sp,36),R(s_size) + movel MEM_DISP(sp,40),R(cnt) + + moveql #1,R(d5) + cmpl R(d5),R(cnt) + bne L(Lnormal) + cmpl R(res_ptr),R(s_ptr) + bls L(Lspecial) /* jump if res_ptr >= s_ptr */ +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(res_ptr,s_size,l,4),R(a2) +#else /* not mc68020 */ + movel R(s_size),R(d0) + asll #2,R(d0) + lea MEM_INDX(res_ptr,d0,l),R(a2) +#endif + cmpl R(s_ptr),R(a2) + bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */ + +L(Lnormal:) + moveql #32,R(d5) + subl R(cnt),R(d5) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d0) + lsll R(d5),R(d0) /* compute carry limb */ + + lsrl R(cnt),R(d2) + movel R(d2),R(d1) + subql #1,R(s_size) + beq L(Lend) + lsrl #1,R(s_size) + bcs L(L1) + subql #1,R(s_size) + +L(Loop:) + movel MEM_POSTINC(s_ptr),R(d2) + movel R(d2),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d1) + movel R(d1),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d2) +L(L1:) + movel MEM_POSTINC(s_ptr),R(d1) + movel R(d1),R(d3) + lsll R(d5),R(d3) + orl R(d3),R(d2) + movel R(d2),MEM_POSTINC(res_ptr) + lsrl R(cnt),R(d1) + + dbf R(s_size),L(Loop) + subl #0x10000,R(s_size) + bcc L(Loop) + +L(Lend:) + movel R(d1),MEM(res_ptr) /* store most significant limb */ + +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts + +/* We loop from most significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
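
   [Editor's note, not part of the original source: as in lshift.S this
   path is only reached when the shift count is exactly 1; the limbs are
   walked from the most significant end and each roxrl #1 rotates a limb
   right through the extend bit, so the bit shifted out of the
   previously processed (more significant) limb enters at the top of the
   next one; the net effect is a one-bit right shift of the whole
   vector.]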
*/ + +L(Lspecial:) +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) + lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr) + lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr) +#else /* not mc68000 */ + movel R(s_size),R(d0) + asll #2,R(d0) + addl R(s_size),R(s_ptr) + addl R(s_size),R(res_ptr) +#endif + + clrl R(d0) /* initialize carry */ + eorw #1,R(s_size) + lsrl #1,R(s_size) + bcc L(LL1) + subql #1,R(s_size) + +L(LLoop:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) +L(LL1:) + movel MEM_PREDEC(s_ptr),R(d2) + roxrl #1,R(d2) + movel R(d2),MEM_PREDEC(res_ptr) + + dbf R(s_size),L(LLoop) + roxrl #1,R(d0) /* save cy in msb */ + subl #0x10000,R(s_size) + bcs L(LLend) + addl R(d0),R(d0) /* restore cy */ + bra L(LLoop) + +L(LLend:) +/* Restore used registers from stack frame. */ + moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2) + rts +EPILOG(__gmpn_rshift) diff --git a/rts/gmp/mpn/m68k/sub_n.S b/rts/gmp/mpn/m68k/sub_n.S new file mode 100644 index 0000000000..ce45b24db5 --- /dev/null +++ b/rts/gmp/mpn/m68k/sub_n.S @@ -0,0 +1,79 @@ +/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 16) + size (sp + 12) +*/ + +#include "asm-syntax.h" + + TEXT + ALIGN + GLOBL C_SYMBOL_NAME(__gmpn_sub_n) + +C_SYMBOL_NAME(__gmpn_sub_n:) +PROLOG(__gmpn_sub_n) +/* Save used registers on the stack. */ + movel R(d2),MEM_PREDEC(sp) + movel R(a2),MEM_PREDEC(sp) + +/* Copy the arguments to registers. Better use movem? */ + movel MEM_DISP(sp,12),R(a2) + movel MEM_DISP(sp,16),R(a0) + movel MEM_DISP(sp,20),R(a1) + movel MEM_DISP(sp,24),R(d2) + + eorw #1,R(d2) + lsrl #1,R(d2) + bcc L(L1) + subql #1,R(d2) /* clears cy as side effect */ + +L(Loop:) + movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) +L(L1:) movel MEM_POSTINC(a0),R(d0) + movel MEM_POSTINC(a1),R(d1) + subxl R(d1),R(d0) + movel R(d0),MEM_POSTINC(a2) + + dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */ + subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */ + subl #0x10000,R(d2) + bcs L(L2) + addl R(d0),R(d0) /* restore cy */ + bra L(Loop) + +L(L2:) + negl R(d0) + +/* Restore used registers from stack frame. */ + movel MEM_POSTINC(sp),R(a2) + movel MEM_POSTINC(sp),R(d2) + + rts +EPILOG(__gmpn_sub_n) diff --git a/rts/gmp/mpn/m68k/syntax.h b/rts/gmp/mpn/m68k/syntax.h new file mode 100644 index 0000000000..9eec279c06 --- /dev/null +++ b/rts/gmp/mpn/m68k/syntax.h @@ -0,0 +1,177 @@ +/* asm.h -- Definitions for 68k syntax variations. 
+ +Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#undef ALIGN + +#ifdef MIT_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)base@ +#define MEM_DISP(base,displacement)base@(displacement) +#define MEM_INDX(base,idx,size_suffix)base@(idx:size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)base@(idx:size_suffix:scale) +#define MEM_PREDEC(memory_base)memory_base@- +#define MEM_POSTINC(memory_base)memory_base@+ +#define L(label) label +#define TEXT .text +#define ALIGN .even +#define GLOBL .globl +#define moveql moveq +/* Use variable sized opcodes. */ +#define bcc jcc +#define bcs jcs +#define bls jls +#define beq jeq +#define bne jne +#define bra jra +#endif + +#ifdef SONY_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)(base) +#define MEM_DISP(base,displacement)(displacement,base) +#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale) +#define MEM_PREDEC(memory_base)-(memory_base) +#define MEM_POSTINC(memory_base)(memory_base)+ +#define L(label) label +#define TEXT .text +#define ALIGN .even +#define GLOBL .globl +#endif + +#ifdef MOTOROLA_SYNTAX +#define PROLOG(name) +#define EPILOG(name) +#define R(r)r +#define MEM(base)(base) +#define MEM_DISP(base,displacement)(displacement,base) +#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale) +#define MEM_PREDEC(memory_base)-(memory_base) +#define MEM_POSTINC(memory_base)(memory_base)+ +#define L(label) label +#define TEXT +#define ALIGN +#define GLOBL XDEF +#define lea LEA +#define movel MOVE.L +#define moveml MOVEM.L +#define moveql MOVEQ.L +#define cmpl CMP.L +#define orl OR.L +#define clrl CLR.L +#define eorw EOR.W +#define lsrl LSR.L +#define lsll LSL.L +#define roxrl ROXR.L +#define roxll ROXL.L +#define addl ADD.L +#define addxl ADDX.L +#define addql ADDQ.L +#define subl SUB.L +#define subxl SUBX.L +#define subql SUBQ.L +#define negl NEG.L +#define mulul MULU.L +#define bcc BCC +#define bcs BCS +#define bls BLS +#define beq BEQ +#define bne BNE +#define bra BRA +#define dbf DBF +#define rts RTS +#define d0 D0 +#define d1 D1 +#define d2 D2 +#define d3 D3 +#define d4 D4 +#define d5 D5 +#define d6 D6 +#define d7 D7 +#define a0 A0 +#define a1 A1 +#define a2 A2 +#define a3 A3 +#define a4 A4 +#define a5 A5 +#define a6 A6 +#define a7 A7 +#define sp SP +#endif + +#ifdef ELF_SYNTAX +#define PROLOG(name) .type name,@function +#define EPILOG(name) .size name,.-name +#define MEM(base)(R(base)) +#define MEM_DISP(base,displacement)(displacement,R(base)) +#define MEM_PREDEC(memory_base)-(R(memory_base)) 
+#define MEM_POSTINC(memory_base)(R(memory_base))+ +#ifdef __STDC__ +#define R_(r)%##r +#define R(r)R_(r) +#define MEM_INDX_(base,idx,size_suffix)(R(base),R(idx##.##size_suffix)) +#define MEM_INDX(base,idx,size_suffix)MEM_INDX_(base,idx,size_suffix) +#define MEM_INDX1_(base,idx,size_suffix,scale)(R(base),R(idx##.##size_suffix*scale)) +#define MEM_INDX1(base,idx,size_suffix,scale)MEM_INDX1_(base,idx,size_suffix,scale) +#define L(label) .##label +#else +#define R(r)%/**/r +#define MEM_INDX(base,idx,size_suffix)(R(base),R(idx).size_suffix) +#define MEM_INDX1(base,idx,size_suffix,scale)(R(base),R(idx).size_suffix*scale) +#define L(label) ./**/label +#endif +#define TEXT .text +#define ALIGN .align 2 +#define GLOBL .globl +#define bcc jbcc +#define bcs jbcs +#define bls jbls +#define beq jbeq +#define bne jbne +#define bra jbra +#endif + +#if defined (SONY_SYNTAX) || defined (ELF_SYNTAX) +#define movel move.l +#define moveml movem.l +#define moveql moveq.l +#define cmpl cmp.l +#define orl or.l +#define clrl clr.l +#define eorw eor.w +#define lsrl lsr.l +#define lsll lsl.l +#define roxrl roxr.l +#define roxll roxl.l +#define addl add.l +#define addxl addx.l +#define addql addq.l +#define subl sub.l +#define subxl subx.l +#define subql subq.l +#define negl neg.l +#define mulul mulu.l +#endif diff --git a/rts/gmp/mpn/m88k/add_n.s b/rts/gmp/mpn/m88k/add_n.s new file mode 100644 index 0000000000..0b776c618a --- /dev/null +++ b/rts/gmp/mpn/m88k/add_n.s @@ -0,0 +1,104 @@ +; mc88100 __gmpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. 
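
; Editor's note (illustrative sketch only, not part of the original GMP
; source): in C terms the loop below is assumed to compute
;
;     mp_limb_t cy = 0;
;     for (i = 0; i < size; i++)
;       {
;         mp_limb_t sum = s1_ptr[i] + s2_ptr[i] + cy;
;         cy = (sum < s1_ptr[i]) || (cy && sum == s1_ptr[i]);
;         res_ptr[i] = sum;
;       }
;     return cy;
;
; The addu.cio instructions chain the carry between limbs in hardware,
; and the computed jump through r12 enters the unrolled loop at the
; point matching the residual size mod 8 limbs.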
+ + text + align 16 + global ___gmpn_add_n +___gmpn_add_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + addu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; add 7 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; add 6 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; add 5 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; add 4 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; add 3 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; add 2 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; add 1 + 8r limbs + addu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/rts/gmp/mpn/m88k/mc88110/add_n.S b/rts/gmp/mpn/m88k/mc88110/add_n.S new file mode 100644 index 0000000000..843a50dded --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/add_n.S @@ -0,0 +1,200 @@ +; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_add_n) +C_SYMBOL_NAME(__gmpn_add_n): + addu.co r0,r0,r0 ; clear cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned? 
+/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + addu.cio r6,r10,r8 + ld r10,s1_ptr,16 + addu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + addu.cio r6,r10,r8 + ld r10,s1_ptr,24 + addu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + addu.cio r6,r10,r8 + ld r10,s1_ptr,32 + addu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: addu.cio r6,r10,r8 + ld r10,s1_ptr,8 + addu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: addu.cio r6,r10,r8 + addu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + addu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + or r12,r0,s2_ptr + or s2_ptr,r0,s1_ptr + or s1_ptr,r0,r12 + br L0 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + addu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + addu.cio r8,r8,r6 + st r8,res_ptr,8 + addu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + addu.cio r8,r8,r6 + st r8,res_ptr,16 + addu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + addu.cio r8,r8,r6 + st r8,res_ptr,24 + addu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + addu.cio r8,r8,r6 + st r8,res_ptr,0 + addu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + addu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/rts/gmp/mpn/m88k/mc88110/addmul_1.s new file mode 100644 index 0000000000..7d97c87c79 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/addmul_1.s @@ -0,0 +1,61 @@ +; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
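One structural note on the mc88110 __gmpn_add_n just above (its sub_n twin further down is organized the same way): ld.d and st.d move two limbs per access but need doubleword-aligned addresses, so the entry code xors the pointers and tests bit 2 to choose between the V1a, V1b and V2 paths, peeling off a single limb where that makes the remaining pointers line up. The hypothetical C helper below only makes that three-way split explicit; the name and the idea of returning a tag are illustration, not GMP code.

    /* Classify the alignment cases the assembly labels V1a, V1b and V2.
       Casting a pointer to unsigned long is assumed to be value-preserving
       on these 32-bit targets. */
    static char
    classify_alignment (const void *res_ptr, const void *s1_ptr,
                        const void *s2_ptr)
    {
      if ((((unsigned long) s2_ptr ^ (unsigned long) res_ptr) & 4) == 0)
        return 'a';   /* V1a: res_ptr and s2_ptr congruent modulo 8 */
      if ((((unsigned long) s1_ptr ^ (unsigned long) res_ptr) & 4) == 0)
        return 'b';   /* V1b: swap s1_ptr and s2_ptr, then run the V1a code */
      return '2';     /* V2: s1_ptr and s2_ptr congruent with each other */
    }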
+ + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_addmul_1 +___gmpn_addmul_1: + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + subu r8,r8,4 + bcnd.n eq0,r4,Lend + mulu.d r10,r6,r5 + +Loop: ld r7,r8[r4] + ld r6,r3[r4] + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd ne0,r4,Loop + +Lend: ld r7,r8,0 + addu.cio r9,r11,r2 + addu.ci r2,r10,r0 + addu.co r9,r9,r7 + st r9,r8,0 + jmp.n r1 + addu.ci r2,r2,r0 diff --git a/rts/gmp/mpn/m88k/mc88110/mul_1.s b/rts/gmp/mpn/m88k/mc88110/mul_1.s new file mode 100644 index 0000000000..b8483afa91 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/mul_1.s @@ -0,0 +1,59 @@ +; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___gmpn_mul_1 +___gmpn_mul_1: + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n eq0,r4,Lend + subu r8,r8,8 + +Loop: ld r6,r3[r4] + addu.cio r9,r11,r2 + or r2,r10,r0 ; could be avoided if unrolled + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n ne0,r4,Loop + st r9,r8[r4] + +Lend: addu.cio r9,r11,r2 + st r9,r8,4 + jmp.n r1 + addu.ci r2,r10,r0 diff --git a/rts/gmp/mpn/m88k/mc88110/sub_n.S b/rts/gmp/mpn/m88k/mc88110/sub_n.S new file mode 100644 index 0000000000..715a3faf25 --- /dev/null +++ b/rts/gmp/mpn/m88k/mc88110/sub_n.S @@ -0,0 +1,276 @@ +; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. 
+ +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +#define res_ptr r2 +#define s1_ptr r3 +#define s2_ptr r4 +#define size r5 + +#include "sysdep.h" + + text + align 16 + global C_SYMBOL_NAME(__gmpn_sub_n) +C_SYMBOL_NAME(__gmpn_sub_n): + subu.co r0,r0,r0 ; set cy flag + xor r12,s2_ptr,res_ptr + bb1 2,r12,L1 +; ** V1a ** +L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s1_ptr,0 + ld r12,s1_ptr,4 + ld.d r8,s2_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1: subu size,size,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r10,r8 + ld r10,s1_ptr,16 + subu.cio r7,r12,r9 + ld r12,s1_ptr,20 + ld.d r8,s2_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r10,r8 + ld r10,s1_ptr,24 + subu.cio r7,r12,r9 + ld r12,s1_ptr,28 + ld.d r8,s2_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r10,r8 + ld r10,s1_ptr,32 + subu.cio r7,r12,r9 + ld r12,s1_ptr,36 + addu s1_ptr,s1_ptr,32 + ld.d r8,s2_ptr,32 + addu s2_ptr,s2_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1 + +Lfin1: addu size,size,8-2 + bcnd lt0,size,Lend1 +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1: subu.cio r6,r10,r8 + ld r10,s1_ptr,8 + subu.cio r7,r12,r9 + ld r12,s1_ptr,12 + ld.d r8,s2_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1 +Lend1: subu.cio r6,r10,r8 + subu.cio r7,r12,r9 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1 +/* Add last limb */ + ld r10,s1_ptr,8 + ld r8,s2_ptr,8 + subu.cio r6,r10,r8 + st r6,res_ptr,8 + +Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. 
limb + jmp.n r1 + xor r2,r2,1 + +L1: xor r12,s1_ptr,res_ptr + bb1 2,r12,L2 +; ** V1b ** + bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned +/* Add least significant limb separately to align res_ptr and s1_ptr */ + ld r10,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + ld r8,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + subu size,size,1 + subu.co r6,r8,r10 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 +L_v1b: cmp r12,size,2 + bb1 lt,r12,Lend2 + + ld r10,s2_ptr,0 + ld r12,s2_ptr,4 + ld.d r8,s1_ptr,0 + subu size,size,10 + bcnd lt0,size,Lfin1b +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop1b: subu size,size,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu.cio r6,r8,r10 + ld r10,s2_ptr,16 + subu.cio r7,r9,r12 + ld r12,s2_ptr,20 + ld.d r8,s1_ptr,16 + st.d r6,res_ptr,8 + subu.cio r6,r8,r10 + ld r10,s2_ptr,24 + subu.cio r7,r9,r12 + ld r12,s2_ptr,28 + ld.d r8,s1_ptr,24 + st.d r6,res_ptr,16 + subu.cio r6,r8,r10 + ld r10,s2_ptr,32 + subu.cio r7,r9,r12 + ld r12,s2_ptr,36 + addu s2_ptr,s2_ptr,32 + ld.d r8,s1_ptr,32 + addu s1_ptr,s1_ptr,32 + st.d r6,res_ptr,24 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop1b + +Lfin1b: addu size,size,8-2 + bcnd lt0,size,Lend1b +/* Add blocks of 2 limbs until less than 2 limbs remain */ +Loope1b:subu.cio r6,r8,r10 + ld r10,s2_ptr,8 + subu.cio r7,r9,r12 + ld r12,s2_ptr,12 + ld.d r8,s1_ptr,8 + st.d r6,res_ptr,0 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope1b +Lend1b: subu.cio r6,r8,r10 + subu.cio r7,r9,r12 + st.d r6,res_ptr,0 + + bb0 0,size,Lret1b +/* Add last limb */ + ld r10,s2_ptr,8 + ld r8,s1_ptr,8 + subu.cio r6,r8,r10 + st r6,res_ptr,8 + +Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 + +; ** V2 ** +/* If we come here, the alignment of s1_ptr and res_ptr as well as the + alignment of s2_ptr and res_ptr differ. Since there are only two ways + things can be aligned (that we care about) we now know that the alignment + of s1_ptr and s2_ptr are the same. 
*/ + +L2: cmp r12,size,1 + bb1 eq,r12,Ljone + bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned +/* Add least significant limb separately to align res_ptr and s2_ptr */ + ld r10,s1_ptr,0 + addu s1_ptr,s1_ptr,4 + ld r8,s2_ptr,0 + addu s2_ptr,s2_ptr,4 + subu size,size,1 + subu.co r6,r10,r8 + st r6,res_ptr,0 + addu res_ptr,res_ptr,4 + +L_v2: subu size,size,8 + bcnd lt0,size,Lfin2 +/* Add blocks of 8 limbs until less than 8 limbs remain */ + align 8 +Loop2: subu size,size,8 + ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + ld.d r8,s1_ptr,8 + ld.d r6,s2_ptr,8 + subu.cio r8,r8,r6 + st r8,res_ptr,8 + subu.cio r9,r9,r7 + st r9,res_ptr,12 + ld.d r8,s1_ptr,16 + ld.d r6,s2_ptr,16 + subu.cio r8,r8,r6 + st r8,res_ptr,16 + subu.cio r9,r9,r7 + st r9,res_ptr,20 + ld.d r8,s1_ptr,24 + ld.d r6,s2_ptr,24 + subu.cio r8,r8,r6 + st r8,res_ptr,24 + subu.cio r9,r9,r7 + st r9,res_ptr,28 + addu s1_ptr,s1_ptr,32 + addu s2_ptr,s2_ptr,32 + addu res_ptr,res_ptr,32 + bcnd ge0,size,Loop2 + +Lfin2: addu size,size,8-2 + bcnd lt0,size,Lend2 +Loope2: ld.d r8,s1_ptr,0 + ld.d r6,s2_ptr,0 + subu.cio r8,r8,r6 + st r8,res_ptr,0 + subu.cio r9,r9,r7 + st r9,res_ptr,4 + subu size,size,2 + addu s1_ptr,s1_ptr,8 + addu s2_ptr,s2_ptr,8 + addu res_ptr,res_ptr,8 + bcnd ge0,size,Loope2 +Lend2: bb0 0,size,Lret2 +/* Add last limb */ +Ljone: ld r10,s1_ptr,0 + ld r8,s2_ptr,0 + subu.cio r6,r10,r8 + st r6,res_ptr,0 + +Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/rts/gmp/mpn/m88k/mul_1.s b/rts/gmp/mpn/m88k/mul_1.s new file mode 100644 index 0000000000..06370837ef --- /dev/null +++ b/rts/gmp/mpn/m88k/mul_1.s @@ -0,0 +1,127 @@ +; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + +; Common overhead is about 11 cycles/invocation. + +; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention.) + +; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.) + +; To enhance speed: +; 1. Unroll main loop 4-8 times. +; 2. Schedule code to avoid WB contention. It might be tempting to move the +; ld instruction in the loops down to save 2 cycles (less WB contention), +; but that looses because the ultimate value will be read from outside +; the allocated space. But if we handle the ultimate multiplication in +; the tail, we can do this. +; 3. 
Make the multiplication with less instructions. I think the code for +; (S2_LIMB >= 0x10000) is not minimal. +; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or +; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11 +; cycles/limb. (Assuming infinite unrolling.) + + text + align 16 + global ___gmpn_mul_1 +___gmpn_mul_1: + + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + ld r9,r3[r4] + mask r7,r5,0xffff ; r7 = lo(S2_LIMB) + extu r8,r5,16 ; r8 = hi(S2_LIMB) + bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) + subu r6,r6,4 + +; General code for any value of S2_LIMB. + + ; Make a stack frame and save r25 and r26 + subu r31,r31,16 + st.d r25,r31,8 + + ; Enter the loop in the middle + br.n L1 + addu r4,r4,1 + +Loop: ld r9,r3[r4] + st r26,r6[r4] +; bcnd ne0,r0,0 ; bubble + addu r4,r4,1 +L1: mul r26,r9,r5 ; low word of product mul_1 WB ld + mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 + mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 + mul r10,r12,r8 ; r10 = prod_1a mul_3 + extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 + mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 + mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 + extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 + addu r10,r10,r11 ; addu_1 WB extu_2 +; bcnd ne0,r0,0 ; bubble WB addu_1 + addu.co r10,r10,r12 ; WB mul_4 + mask.u r10,r10,0xffff ; move the 16 most significant bits... + addu.ci r10,r10,r0 ; ...to the low half of the word... + rot r10,r10,16 ; ...and put carry in pos 16. + addu.co r26,r26,r2 ; add old carry limb + bcnd.n ne0,r4,Loop + addu.ci r2,r25,r10 ; compute new carry limb + + st r26,r6[r4] + ld.d r25,r31,8 + jmp.n r1 + addu r31,r31,16 + +; Fast code for S2_LIMB < 0x10000 +Lsmall: + ; Enter the loop in the middle + br.n SL1 + addu r4,r4,1 + +SLoop: ld r9,r3[r4] ; + st r8,r6[r4] ; + addu r4,r4,1 ; +SL1: mul r8,r9,r5 ; low word of product + mask r12,r9,0xffff ; r12 = lo(s1_limb) + extu r13,r9,16 ; r13 = hi(s1_limb) + mul r11,r12,r7 ; r11 = prod_0 + mul r12,r13,r7 ; r12 = prod_1b + addu.cio r8,r8,r2 ; add old carry limb + extu r10,r11,16 ; r11 = hi(prod_0) + addu r10,r10,r12 ; + bcnd.n ne0,r4,SLoop + extu r2,r10,16 ; r2 = new carry limb + + jmp.n r1 + st r8,r6[r4] diff --git a/rts/gmp/mpn/m88k/sub_n.s b/rts/gmp/mpn/m88k/sub_n.s new file mode 100644 index 0000000000..2fd345a135 --- /dev/null +++ b/rts/gmp/mpn/m88k/sub_n.s @@ -0,0 +1,106 @@ +; mc88100 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
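The mc88100 __gmpn_mul_1 above has no widening multiply, so each 32x32->64 product is assembled from 16-bit halves; the prod_0, prod_1a, prod_1b and prod_2 terms in its comments are exactly the four partial products. The sketch below shows the same decomposition in C, with uint32_t used purely as a convenience for the sketch; the assembly folds the middle terms into the high word with mask.u/rot rather than the shifts used here, but the arithmetic is the same.

    #include <stdint.h>

    /* Sketch: *hi:*lo = u * v using only 32-bit operations. */
    static void
    umul_16x16 (uint32_t u, uint32_t v, uint32_t *hi, uint32_t *lo)
    {
      uint32_t ul = u & 0xffff, uh = u >> 16;
      uint32_t vl = v & 0xffff, vh = v >> 16;

      uint32_t p0  = ul * vl;                 /* prod_0  */
      uint32_t p1a = ul * vh;                 /* prod_1a */
      uint32_t p1b = uh * vl;                 /* prod_1b */
      uint32_t p2  = uh * vh;                 /* prod_2  */

      uint32_t mid = p1a + (p0 >> 16);        /* cannot overflow */
      mid += p1b;                             /* may wrap */
      uint32_t midc = mid < p1b;              /* carry out of the middle sum */

      *lo = (mid << 16) | (p0 & 0xffff);
      *hi = p2 + (mid >> 16) + (midc << 16);
    }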
+ + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___gmpn_sub_n +___gmpn_sub_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu r5,r0,r5 + mak r5,r5,3<4> + bcnd.n eq0,r5,Lzero + subu.co r0,r0,r0 ; initialize carry + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + subu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; subtract 7 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; subtract 6 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; subtract 5 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; subtract 4 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; subtract 3 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; subtract 2 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs + subu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/rts/gmp/mpn/mips2/add_n.s b/rts/gmp/mpn/mips2/add_n.s new file mode 100644 index 0000000000..5c3c7fc8a1 --- /dev/null +++ b/rts/gmp/mpn/mips2/add_n.s @@ -0,0 +1,120 @@ + # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_add_n diff --git a/rts/gmp/mpn/mips2/addmul_1.s b/rts/gmp/mpn/mips2/addmul_1.s new file mode 100644 index 0000000000..1e5037751b --- /dev/null +++ b/rts/gmp/mpn/mips2/addmul_1.s @@ -0,0 +1,97 @@ + # MIPS __gmpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
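For reference, the addmul_1 below computes {res_ptr, size} += {s1_ptr, size} * s2_limb and returns the final carry limb; the warm-up / loop / cool-down split keeps one multu in flight while the previous product is consumed. A portable sketch of the operation follows, using a 64-bit temporary where the assembly reads HI/LO; 32-bit limbs and the availability of a 64-bit integer type are assumptions of the sketch only.

    #include <stdint.h>

    /* Sketch: {rp,n} += {up,n} * v, n > 0; returns the carry limb. */
    static uint32_t
    ref_addmul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t v)
    {
      uint32_t cy = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          uint64_t p = (uint64_t) up[i] * v;   /* what multu leaves in HI:LO */
          uint32_t plo = (uint32_t) p;
          uint32_t phi = (uint32_t) (p >> 32);
          uint32_t t = plo + cy;               /* add the old carry limb */
          phi += t < plo;                      /* ... and its carry */
          uint32_t r = rp[i] + t;              /* add into the destination */
          phi += r < t;                        /* ... and this one */
          rp[i] = r;
          cy = phi;                            /* high limb becomes new carry */
        }
      return cy;
    }

mul_1 and submul_1 further down are the same loop with the destination add removed or turned into a subtract.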
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_addmul_1 diff --git a/rts/gmp/mpn/mips2/lshift.s b/rts/gmp/mpn/mips2/lshift.s new file mode 100644 index 0000000000..2ca3a3c800 --- /dev/null +++ b/rts/gmp/mpn/mips2/lshift.s @@ -0,0 +1,95 @@ + # MIPS2 __gmpn_lshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
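The lshift below forms {res_ptr, size} = {src_ptr, size} << cnt and returns the bits shifted out of the most significant limb; as the pointer adjustments at the top of the routine show, it runs from the high end of the vectors downward. A C sketch, assuming 32-bit limbs and the usual mpn restriction 1 <= cnt < 32:

    #include <stdint.h>

    /* Sketch: {rp,n} = {up,n} << cnt, 1 <= cnt <= 31, n > 0.
       Returns the bits shifted out of the most significant limb. */
    static uint32_t
    ref_lshift (uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      uint32_t retval = up[n - 1] >> tnc;      /* bits pushed out at the top */
      long i;
      for (i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
      rp[0] = up[0] << cnt;
      return retval;
    }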
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: + .set noreorder + .set nomacro + + sll $2,$6,2 + addu $5,$5,$2 # make r5 point at end of src + lw $10,-4($5) # load first limb + subu $13,$0,$7 + addu $4,$4,$2 # make r4 point at end of res + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + srl $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,-8($5) + addiu $4,$4,-4 + addiu $5,$5,-4 + addiu $9,$9,-1 + sll $11,$10,$7 + srl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,-8($5) + addiu $4,$4,-16 + addiu $6,$6,-4 + sll $11,$10,$7 + srl $12,$3,$13 + + lw $10,-12($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,12($4) + srl $9,$10,$13 + + lw $3,-16($5) + sll $11,$10,$7 + or $8,$14,$9 + sw $8,8($4) + srl $12,$3,$13 + + lw $10,-20($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,4($4) + srl $9,$10,$13 + + addiu $5,$5,-16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,0($4) + +.Lend: sll $8,$10,$7 + j $31 + sw $8,-4($4) + .end __gmpn_lshift diff --git a/rts/gmp/mpn/mips2/mul_1.s b/rts/gmp/mpn/mips2/mul_1.s new file mode 100644 index 0000000000..ea8aa26809 --- /dev/null +++ b/rts/gmp/mpn/mips2/mul_1.s @@ -0,0 +1,85 @@ + # MIPS __gmpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + addiu $5,$5,4 + addu $10,$10,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sw $10,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + multu $8,$7 + sw $10,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + sw $10,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_mul_1 diff --git a/rts/gmp/mpn/mips2/rshift.s b/rts/gmp/mpn/mips2/rshift.s new file mode 100644 index 0000000000..37c8f39cb4 --- /dev/null +++ b/rts/gmp/mpn/mips2/rshift.s @@ -0,0 +1,92 @@ + # MIPS2 __gmpn_rshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
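The rshift below is the mirror image of the previous routine: it walks upward from the least significant limb and returns the bits shifted out at the bottom. The corresponding sketch, under the same 32-bit-limb and 1 <= cnt < 32 assumptions:

    #include <stdint.h>

    /* Sketch: {rp,n} = {up,n} >> cnt, 1 <= cnt <= 31, n > 0.
       Returns the bits shifted out of the least significant limb. */
    static uint32_t
    ref_rshift (uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
    {
      unsigned tnc = 32 - cnt;
      uint32_t retval = up[0] << tnc;          /* bits pushed out at the bottom */
      long i;
      for (i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
      rp[n - 1] = up[n - 1] >> cnt;
      return retval;
    }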
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: + .set noreorder + .set nomacro + + lw $10,0($5) # load first limb + subu $13,$0,$7 + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + sll $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,4($5) + addiu $4,$4,4 + addiu $5,$5,4 + addiu $9,$9,-1 + srl $11,$10,$7 + sll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,-4($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,4($5) + addiu $4,$4,16 + addiu $6,$6,-4 + srl $11,$10,$7 + sll $12,$3,$13 + + lw $10,8($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-16($4) + sll $9,$10,$13 + + lw $3,12($5) + srl $11,$10,$7 + or $8,$14,$9 + sw $8,-12($4) + sll $12,$3,$13 + + lw $10,16($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-8($4) + sll $9,$10,$13 + + addiu $5,$5,16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,-4($4) + +.Lend: srl $8,$10,$7 + j $31 + sw $8,0($4) + .end __gmpn_rshift diff --git a/rts/gmp/mpn/mips2/sub_n.s b/rts/gmp/mpn/mips2/sub_n.s new file mode 100644 index 0000000000..51d34f3ac3 --- /dev/null +++ b/rts/gmp/mpn/mips2/sub_n.s @@ -0,0 +1,120 @@ + # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
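The sub_n below computes {res_ptr, size} = {s1_ptr, size} - {s2_ptr, size} and returns the borrow out of the most significant limb. Having no borrow flag, the MIPS code rebuilds it with sltu at each step, exactly as in the sketch here; compared with the add_n sketch earlier, only the comparison after the main operation changes. The m88k versions instead return the processor carry flag flipped with xor r2,r2,1, since after their subtract chain that flag is the complement of the borrow.

    #include <stdint.h>

    /* Sketch: {rp,n} = {s1p,n} - {s2p,n}, n > 0; returns the borrow (0 or 1). */
    static uint32_t
    ref_sub_n (uint32_t *rp, const uint32_t *s1p, const uint32_t *s2p, long n)
    {
      uint32_t br = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          uint32_t a = s1p[i];
          uint32_t b = s2p[i] + br;     /* wraps to 0 only when a borrow propagates */
          uint32_t c1 = b < br;         /* wrapped: borrow already certain */
          uint32_t d = a - b;
          uint32_t c2 = a < b;          /* borrow out of the subtraction */
          rp[i] = d;
          br = c1 | c2;
        }
      return br;
    }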
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_sub_n diff --git a/rts/gmp/mpn/mips2/submul_1.s b/rts/gmp/mpn/mips2/submul_1.s new file mode 100644 index 0000000000..495dea3ba2 --- /dev/null +++ b/rts/gmp/mpn/mips2/submul_1.s @@ -0,0 +1,97 @@ + # MIPS __gmpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_submul_1 diff --git a/rts/gmp/mpn/mips2/umul.s b/rts/gmp/mpn/mips2/umul.s new file mode 100644 index 0000000000..40e847614c --- /dev/null +++ b/rts/gmp/mpn/mips2/umul.s @@ -0,0 +1,30 @@ + # Copyright (C) 1999 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + .text + .align 2 + .globl __umul_ppmm + .ent __umul_ppmm +__umul_ppmm: + multu $5,$6 + mflo $3 + mfhi $2 + sw $3,0($4) + j $31 + .end __umul_ppmm diff --git a/rts/gmp/mpn/mips3/README b/rts/gmp/mpn/mips3/README new file mode 100644 index 0000000000..e94b2c7460 --- /dev/null +++ b/rts/gmp/mpn/mips3/README @@ -0,0 +1,23 @@ +This directory contains mpn functions optimized for MIPS3. Example of +processors that implement MIPS3 are R4000, R4400, R4600, R4700, and R8000. + +RELEVANT OPTIMIZATION ISSUES + +1. On the R4000 and R4400, branches, both the plain and the "likely" ones, + take 3 cycles to execute. (The fastest possible loop will take 4 cycles, + because of the delay insn.) + + On the R4600, branches takes a single cycle + + On the R8000, branches often take no noticable cycles, as they are + executed in a separate function unit.. + +2. The R4000 and R4400 have a load latency of 4 cycles. + +3. On the R4000 and R4400, multiplies take a data-dependent number of + cycles, contrary to the SGI documentation. There seem to be 3 or 4 + possible latencies. 
+ +STATUS + +Good... diff --git a/rts/gmp/mpn/mips3/add_n.s b/rts/gmp/mpn/mips3/add_n.s new file mode 100644 index 0000000000..adad0beaef --- /dev/null +++ b/rts/gmp/mpn/mips3/add_n.s @@ -0,0 +1,120 @@ + # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_add_n + .ent __gmpn_add_n +__gmpn_add_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_add_n diff --git a/rts/gmp/mpn/mips3/addmul_1.s b/rts/gmp/mpn/mips3/addmul_1.s new file mode 100644 index 0000000000..d390e2298e --- /dev/null +++ b/rts/gmp/mpn/mips3/addmul_1.s @@ -0,0 +1,97 @@ + # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. 
+ + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_addmul_1 + .ent __gmpn_addmul_1 +__gmpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_addmul_1 diff --git a/rts/gmp/mpn/mips3/gmp-mparam.h b/rts/gmp/mpn/mips3/gmp-mparam.h new file mode 100644 index 0000000000..656e90c7b0 --- /dev/null +++ b/rts/gmp/mpn/mips3/gmp-mparam.h @@ -0,0 +1,58 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the R10000 usign the system cc. */ +/* Generated by tuneup.c, 2000-07-25. 
*/ +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 32 +#endif + +/* Supressed the TOOM3 values as they looked absolutely crazy + (698 and 21 respectively) */ + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 54 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 82 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 159 +#endif diff --git a/rts/gmp/mpn/mips3/lshift.s b/rts/gmp/mpn/mips3/lshift.s new file mode 100644 index 0000000000..372606fddf --- /dev/null +++ b/rts/gmp/mpn/mips3/lshift.s @@ -0,0 +1,95 @@ + # MIPS3 __gmpn_lshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_lshift + .ent __gmpn_lshift +__gmpn_lshift: + .set noreorder + .set nomacro + + dsll $2,$6,3 + daddu $5,$5,$2 # make r5 point at end of src + ld $10,-8($5) # load first limb + dsubu $13,$0,$7 + daddu $4,$4,$2 # make r4 point at end of res + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsrl $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,-16($5) + daddiu $4,$4,-8 + daddiu $5,$5,-8 + daddiu $9,$9,-1 + dsll $11,$10,$7 + dsrl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,-16($5) + daddiu $4,$4,-32 + daddiu $6,$6,-4 + dsll $11,$10,$7 + dsrl $12,$3,$13 + + ld $10,-24($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,24($4) + dsrl $9,$10,$13 + + ld $3,-32($5) + dsll $11,$10,$7 + or $8,$14,$9 + sd $8,16($4) + dsrl $12,$3,$13 + + ld $10,-40($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,8($4) + dsrl $9,$10,$13 + + daddiu $5,$5,-32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,0($4) + +.Lend: dsll $8,$10,$7 + j $31 + sd $8,-8($4) + .end __gmpn_lshift diff --git a/rts/gmp/mpn/mips3/mul_1.s b/rts/gmp/mpn/mips3/mul_1.s new file mode 100644 index 0000000000..6659e2b4eb --- /dev/null +++ b/rts/gmp/mpn/mips3/mul_1.s @@ -0,0 +1,85 @@ + # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. 
+ + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_mul_1 + .ent __gmpn_mul_1 +__gmpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + daddiu $5,$5,8 + daddu $10,$10,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sd $10,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_mul_1 diff --git a/rts/gmp/mpn/mips3/rshift.s b/rts/gmp/mpn/mips3/rshift.s new file mode 100644 index 0000000000..59c7fd3492 --- /dev/null +++ b/rts/gmp/mpn/mips3/rshift.s @@ -0,0 +1,92 @@ + # MIPS3 __gmpn_rshift -- + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __gmpn_rshift + .ent __gmpn_rshift +__gmpn_rshift: + .set noreorder + .set nomacro + + ld $10,0($5) # load first limb + dsubu $13,$0,$7 + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsll $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,8($5) + daddiu $4,$4,8 + daddiu $5,$5,8 + daddiu $9,$9,-1 + dsrl $11,$10,$7 + dsll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,-8($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,8($5) + daddiu $4,$4,32 + daddiu $6,$6,-4 + dsrl $11,$10,$7 + dsll $12,$3,$13 + + ld $10,16($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-32($4) + dsll $9,$10,$13 + + ld $3,24($5) + dsrl $11,$10,$7 + or $8,$14,$9 + sd $8,-24($4) + dsll $12,$3,$13 + + ld $10,32($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-16($4) + dsll $9,$10,$13 + + daddiu $5,$5,32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,-8($4) + +.Lend: dsrl $8,$10,$7 + j $31 + sd $8,0($4) + .end __gmpn_rshift diff --git a/rts/gmp/mpn/mips3/sub_n.s b/rts/gmp/mpn/mips3/sub_n.s new file mode 100644 index 0000000000..c57c824b04 --- /dev/null +++ b/rts/gmp/mpn/mips3/sub_n.s @@ -0,0 +1,120 @@ + # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __gmpn_sub_n + .ent __gmpn_sub_n +__gmpn_sub_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __gmpn_sub_n diff --git a/rts/gmp/mpn/mips3/submul_1.s b/rts/gmp/mpn/mips3/submul_1.s new file mode 100644 index 0000000000..531f9705a6 --- /dev/null +++ b/rts/gmp/mpn/mips3/submul_1.s @@ -0,0 +1,97 @@ + # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published by + # the Free Software Foundation; either version 2.1 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + # License for more details. + + # You should have received a copy of the GNU Lesser General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, + # MA 02111-1307, USA. 
+ + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __gmpn_submul_1 + .ent __gmpn_submul_1 +__gmpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __gmpn_submul_1 diff --git a/rts/gmp/mpn/mp_bases.c b/rts/gmp/mpn/mp_bases.c new file mode 100644 index 0000000000..011c328c80 --- /dev/null +++ b/rts/gmp/mpn/mp_bases.c @@ -0,0 +1,550 @@ +/* __mp_bases -- Structure for conversion between internal binary + format and strings in base 2..255. The fields are explained in + gmp-impl.h. + + +Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + + +#if BITS_PER_MP_LIMB == 32 +const struct bases __mp_bases[256] = +{ + /* 0 */ {0, 0.0, 0, 0}, + /* 1 */ {0, 1e38, 0, 0}, + /* 2 */ {32, 1.0000000000000000, 0x1, 0x0}, + /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24}, + /* 4 */ {16, 0.5000000000000000, 0x2, 0x0}, + /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684}, + /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6}, + /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb}, + /* 8 */ {10, 0.3333333333333334, 0x3, 0x0}, + /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24}, + /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82}, + /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04}, + /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5}, + /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f}, + /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e}, + /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2}, + /* 16 */ {8, 0.2500000000000000, 0x4, 0x0}, + /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd}, + /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1}, + /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942}, + /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab}, + /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15}, + /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0}, + /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9}, + /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b}, + /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812}, + /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396}, + /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9}, + /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43}, + /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea}, + /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511}, + /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0}, + /* 32 */ {6, 0.2000000000000000, 0x5, 0x0}, + /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3}, + /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229}, + /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30}, + /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6}, + /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9}, + /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe}, + /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9}, + /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b}, + /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154}, + /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d}, + /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0}, + /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895}, + /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5}, + /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f}, + /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582}, + /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88}, + /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4}, + /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d}, + /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933}, + /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea}, + /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c}, + /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16}, + /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6}, + /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36}, + /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19}, + /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4}, + /* 59 */ {5, 0.1699916162869140, 
0x2a9ce10b, 0x807c3ec3}, + /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf}, + /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe}, + /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273}, + /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805}, + /* 64 */ {5, 0.1666666666666667, 0x6, 0x0}, + /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829}, + /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841}, + /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb}, + /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b}, + /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f}, + /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84}, + /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635}, + /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6}, + /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a}, + /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c}, + /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2}, + /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e}, + /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e}, + /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d}, + /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761}, + /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e}, + /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24}, + /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154}, + /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0}, + /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d}, + /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c}, + /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b}, + /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd}, + /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7}, + /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd}, + /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68}, + /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2}, + /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316}, + /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684}, + /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97}, + /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269}, + /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd}, + /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215}, + /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e}, + /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e}, + /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23}, + /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b}, + /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94}, + /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0}, + /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e}, + /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb}, + /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2}, + /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6}, + /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6}, + /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a}, + /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e}, + /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2}, + /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf}, + /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef}, + /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a}, + /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509}, + /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3}, + /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595}, + /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db}, + /* 119 */ {4, 
0.1450364656948130, 0xbf3e7a1, 0x56aebc07}, + /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33}, + /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3}, + /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5}, + /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0}, + /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f}, + /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812}, + /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5}, + /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237}, + /* 128 */ {4, 0.1428571428571428, 0x7, 0x0}, + /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452}, + /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a}, + /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589}, + /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83}, + /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5}, + /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5}, + /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729}, + /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30}, + /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887}, + /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c}, + /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0}, + /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1}, + /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502}, + /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23}, + /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19}, + /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5}, + /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912}, + /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140}, + /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b}, + /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea}, + /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d}, + /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87}, + /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492}, + /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4}, + /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb}, + /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db}, + /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5}, + /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1}, + /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37}, + /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c}, + /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294}, + /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1}, + /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094}, + /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e}, + /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7}, + /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4}, + /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8}, + /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498}, + /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154}, + /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e}, + /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f}, + /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c}, + /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8}, + /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b}, + /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622}, + /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd}, + /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58}, + /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7}, + /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 
0x180ea5d0}, + /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd}, + /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e}, + /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68}, + /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c}, + /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2}, + /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3}, + /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316}, + /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9}, + /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684}, + /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f}, + /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97}, + /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724}, + /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269}, + /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d}, + /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd}, + /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d}, + /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215}, + /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8}, + /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e}, + /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d}, + /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e}, + /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64}, + /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23}, + /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde}, + /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b}, + /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd}, + /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94}, + /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee}, + /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0}, + /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75}, + /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e}, + /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5}, + /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb}, + /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f}, + /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2}, + /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9}, + /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6}, + /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef}, + /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6}, + /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7}, + /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a}, + /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a}, + /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e}, + /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50}, + /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2}, + /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9}, + /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf}, + /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2}, + /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef}, + /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9}, + /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a}, + /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41}, + /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509}, + /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d}, + /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3}, + /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9}, + /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595}, + /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923}, + /* 236 
*/ {4, 0.1268610025517973, 0xb8e57100, 0x627289db}, + /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b}, + /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07}, + /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b}, + /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33}, + /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c}, + /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3}, + /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24}, + /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5}, + /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3}, + /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0}, + /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45}, + /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f}, + /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c}, + /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812}, + /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33}, + /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5}, + /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749}, + /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237}, + /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423}, +}; +#endif +#if BITS_PER_MP_LIMB == 64 +const struct bases __mp_bases[256] = +{ + /* 0 */ {0, 0.0, 0, 0}, + /* 1 */ {0, 1e38, 0, 0}, + /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)}, + /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)}, + /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)}, + /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)}, + /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)}, + /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)}, + /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)}, + /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)}, + /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)}, + /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)}, + /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)}, + /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)}, + /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)}, + /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)}, + /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)}, + /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), 
CNST_LIMB(0x8c240c4aecb13bb5)}, + /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)}, + /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)}, + /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)}, + /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)}, + /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)}, + /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)}, + /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)}, + /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)}, + /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)}, + /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)}, + /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)}, + /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)}, + /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)}, + /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)}, + /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)}, + /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)}, + /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)}, + /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)}, + /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)}, + /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)}, + /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)}, + /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)}, + /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)}, + /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)}, + /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)}, + /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)}, + /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)}, + /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)}, + /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)}, + /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)}, + /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)}, + /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)}, + /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)}, + /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)}, + /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)}, 
+ /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)}, + /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)}, + /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)}, + /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)}, + /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)}, + /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)}, + /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)}, + /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)}, + /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)}, + /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)}, + /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)}, + /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)}, + /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)}, + /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)}, + /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)}, + /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)}, + /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)}, + /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)}, + /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)}, + /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)}, + /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)}, + /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)}, + /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)}, + /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)}, + /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)}, + /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)}, + /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)}, + /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)}, + /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)}, + /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)}, + /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)}, + /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)}, + /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)}, + /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)}, + /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)}, + /* 98 */ {9, 0.1511782109217764, 
CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)}, + /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)}, + /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)}, + /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)}, + /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)}, + /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)}, + /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)}, + /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)}, + /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)}, + /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)}, + /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)}, + /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)}, + /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)}, + /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)}, + /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)}, + /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)}, + /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)}, + /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)}, + /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)}, + /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)}, + /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)}, + /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)}, + /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)}, + /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)}, + /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)}, + /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)}, + /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)}, + /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)}, + /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)}, + /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)}, + /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)}, + /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)}, + /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)}, + /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)}, + /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)}, + /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)}, + /* 134 */ {9, 0.1415209988221527, 
CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)}, + /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)}, + /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)}, + /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)}, + /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)}, + /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)}, + /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)}, + /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)}, + /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)}, + /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)}, + /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)}, + /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)}, + /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)}, + /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)}, + /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)}, + /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)}, + /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)}, + /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)}, + /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)}, + /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)}, + /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)}, + /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)}, + /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)}, + /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)}, + /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)}, + /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)}, + /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)}, + /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)}, + /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)}, + /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)}, + /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)}, + /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)}, + /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)}, + /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)}, + /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)}, + /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)}, + /* 170 */ {8, 0.1349638598663645, 
CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)}, + /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)}, + /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)}, + /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)}, + /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)}, + /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)}, + /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)}, + /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)}, + /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)}, + /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)}, + /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)}, + /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)}, + /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)}, + /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)}, + /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)}, + /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)}, + /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)}, + /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)}, + /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)}, + /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)}, + /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)}, + /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)}, + /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)}, + /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)}, + /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)}, + /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)}, + /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)}, + /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)}, + /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)}, + /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)}, + /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)}, + /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)}, + /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)}, + /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)}, + /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)}, + /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)}, + /* 206 */ {8, 0.1300982152363760, 
CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)}, + /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)}, + /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)}, + /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)}, + /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)}, + /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)}, + /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)}, + /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)}, + /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)}, + /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)}, + /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)}, + /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)}, + /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)}, + /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)}, + /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)}, + /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)}, + /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)}, + /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)}, + /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)}, + /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)}, + /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)}, + /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)}, + /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)}, + /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)}, + /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)}, + /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)}, + /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)}, + /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)}, + /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)}, + /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)}, + /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)}, + /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)}, + /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)}, + /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)}, + /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)}, + /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)}, + /* 242 */ {8, 
0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)}, + /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)}, + /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)}, + /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)}, + /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)}, + /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)}, + /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)}, + /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)}, + /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)}, + /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)}, + /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)}, + /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)}, + /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)}, + /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)}, +}; +#endif diff --git a/rts/gmp/mpn/ns32k/add_n.s b/rts/gmp/mpn/ns32k/add_n.s new file mode 100644 index 0000000000..bd063d07d9 --- /dev/null +++ b/rts/gmp/mpn/ns32k/add_n.s @@ -0,0 +1,46 @@ +# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_add_n +___gmpn_add_n: + save [r3,r4,r5] + negd 28(sp),r3 + movd r3,r0 + lshd 2,r0 + movd 24(sp),r4 + subd r0,r4 # r4 -> to end of S2 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r2 + subd r0,r2 # r2 -> to end of RES + subd r0,r0 # cy = 0 + +Loop: movd r5[r3:d],r0 + addcd r4[r3:d],r0 + movd r0,r2[r3:d] + acbd 1,r3,Loop + + scsd r0 # r0 = cy. + restore [r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/addmul_1.s b/rts/gmp/mpn/ns32k/addmul_1.s new file mode 100644 index 0000000000..df0dcdd4af --- /dev/null +++ b/rts/gmp/mpn/ns32k/addmul_1.s @@ -0,0 +1,48 @@ +# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + addcd 0,r0 + addd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/mul_1.s b/rts/gmp/mpn/ns32k/mul_1.s new file mode 100644 index 0000000000..0a77efba29 --- /dev/null +++ b/rts/gmp/mpn/ns32k/mul_1.s @@ -0,0 +1,47 @@ +# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + movd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/sub_n.s b/rts/gmp/mpn/ns32k/sub_n.s new file mode 100644 index 0000000000..cd89f4fd3f --- /dev/null +++ b/rts/gmp/mpn/ns32k/sub_n.s @@ -0,0 +1,46 @@ +# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_sub_n +___gmpn_sub_n: + save [r3,r4,r5] + negd 28(sp),r3 + movd r3,r0 + lshd 2,r0 + movd 24(sp),r4 + subd r0,r4 # r4 -> to end of S2 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r2 + subd r0,r2 # r2 -> to end of RES + subd r0,r0 # cy = 0 + +Loop: movd r5[r3:d],r0 + subcd r4[r3:d],r0 + movd r0,r2[r3:d] + acbd 1,r3,Loop + + scsd r0 # r0 = cy. + restore [r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/ns32k/submul_1.s b/rts/gmp/mpn/ns32k/submul_1.s new file mode 100644 index 0000000000..f811aedcf1 --- /dev/null +++ b/rts/gmp/mpn/ns32k/submul_1.s @@ -0,0 +1,48 @@ +# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + + .align 1 +.globl ___gmpn_submul_1 +___gmpn_submul_1: + save [r3,r4,r5,r6,r7] + negd 24(sp),r4 + movd r4,r0 + lshd 2,r0 + movd 20(sp),r5 + subd r0,r5 # r5 -> to end of S1 + movd 16(sp),r6 + subd r0,r6 # r6 -> to end of RES + subd r0,r0 # r0 = 0, cy = 0 + movd 28(sp),r7 # r7 = s2_limb + +Loop: movd r5[r4:d],r2 + meid r7,r2 # r2 = low_prod, r3 = high_prod + addcd r0,r2 # r2 = low_prod + cy_limb + movd r3,r0 # r0 = new cy_limb + addcd 0,r0 + subd r2,r6[r4:d] + acbd 1,r4,Loop + + addcd 0,r0 + restore [r7,r6,r5,r4,r3] + ret 0 diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README new file mode 100644 index 0000000000..8d2976dabc --- /dev/null +++ b/rts/gmp/mpn/pa64/README @@ -0,0 +1,38 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0. + +RELEVANT OPTIMIZATION ISSUES + +The PA8000 has a multi-issue pipeline with large buffers for instructions +awaiting pending results. Therefore, no latency scheduling is necessary +(and might actually be harmful). + +Two 64-bit loads can be completed per cycle. One 64-bit store can be +completed per cycle. A store cannot complete in the same cycle as a load. 
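The add and subtract routines tuned here (and the mips3 and ns32k versions earlier in this patch) all implement the same simple limb-vector operation. Purely as an illustrative sketch (not code added by this patch), __gmpn_add_n corresponds roughly to the following portable C, using the mp_limb_t and mp_size_t types from gmp.h:

    #include "gmp.h"

    /* Illustrative sketch only; not part of the GMP sources being added.
       Add the n-limb vectors s1 and s2, store the n low limbs of the sum
       at rp, and return the carry out (0 or 1).  The assembly __gmpn_add_n
       routines compute exactly this, but keep the carry in a flag or
       register and unroll the loop to reach the cache bandwidth limits
       described above.  */
    static mp_limb_t
    ref_add_n (mp_limb_t *rp, const mp_limb_t *s1, const mp_limb_t *s2,
               mp_size_t n)
    {
      mp_limb_t cy = 0;
      mp_size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t a = s1[i];
          mp_limb_t s = a + s2[i];
          mp_limb_t c1 = s < a;        /* carry out of a + s2[i] */
          mp_limb_t r = s + cy;
          mp_limb_t c2 = r < s;        /* carry out of adding the old carry */
          rp[i] = r;
          cy = c1 | c2;                /* at most one of c1, c2 can be set */
        }
      return cy;
    }

__gmpn_sub_n has the same shape with a borrow in place of the carry; the assembly versions differ only in unrolling and instruction scheduling.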
+ +STATUS + +* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at + the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb + for add/subtract. + +* The multiplication functions run at 11 cycles/limb. The cache bandwidth + allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or + better scheduling, to get closer to the cache bandwidth limit. + +* xaddmul_1.S contains a quicker method for forming the 128-bit product. It + uses somewhat fewer operations, and keeps the carry flag live across the loop + boundary. But it seems hard to make it run more than 1/4 cycle faster + than the old code. Perhaps we really ought to unroll this loop by 2x? + 2x should suffice since register latency scheduling is never needed, + but the unrolling would hide the store-load latency. Here is a sketch: + + 1. A multiply and store 64-bit products + 2. B sum 64-bit products into 128-bit product + 3. B load 64-bit products to integer registers + 4. B multiply and store 64-bit products + 5. A sum 64-bit products into 128-bit product + 6. A load 64-bit products to integer registers + 7. goto 1 + + In practice, adjacent groups (1 and 2, 2 and 3, etc.) will be interleaved + for better instruction mix. diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s new file mode 100644 index 0000000000..22ff19c184 --- /dev/null +++ b/rts/gmp/mpn/pa64/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000.
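The add_n code below is unrolled eight-fold, and the blr at the top branches into the middle of the unrolled body at a point chosen from (-size & 7), so any length is handled without a separate fix-up loop. What it computes is ordinary limb-vector addition with carry; a plain C reference of those semantics (a sketch assuming 64-bit limbs, not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_add_n: r[] = s1[] + s2[], n > 0 limbs,
   returning the final carry (0 or 1).  The assembly below does the same
   work eight limbs per iteration.  Sketch only.  */
static mp_limb_t
add_n_ref (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, long n)
{
  mp_limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      mp_limb_t a = s1p[i], b = s2p[i];
      mp_limb_t s = a + b + cy;
      cy = (s < a) || (cy && s == a);    /* carry out of the 64-bit add */
      rp[i] = s;
    }
  return cy;
}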
+ + .level 2.0n + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S new file mode 100644 index 0000000000..b1885b432c --- /dev/null +++ b/rts/gmp/mpn/pa64/addmul_1.S @@ -0,0 +1,167 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git 
a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h new file mode 100644 index 0000000000..847735b987 --- /dev/null +++ b/rts/gmp/mpn/pa64/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured in a PA8000 using the system compiler version + A.10.32.30. Presumably the PA8200 and PA8500 have the same timing + characteristic, but GCC might give somewhat different results. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 16 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 40 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 72 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 94 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 50 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 46 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s new file mode 100644 index 0000000000..994bc1c4d6 --- /dev/null +++ b/rts/gmp/mpn/pa64/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
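The lshift code below walks the operand from the most significant limb downwards, the natural direction for a left shift, which is what keeps an overlapping shift safe when the destination sits at or above the source. A plain C statement of what it computes (a sketch assuming 64-bit limbs and 0 < cnt < 64, not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_lshift: shift the n-limb operand at sp
   left by cnt bits, store the result at rp, and return the bits shifted
   out of the top limb.  Works from the high end down, like the assembly
   below.  Sketch only.  */
static mp_limb_t
lshift_ref (mp_limb_t *rp, const mp_limb_t *sp, long n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  mp_limb_t high = sp[n - 1];
  mp_limb_t retval = high >> tnc;        /* carry-out limb */

  for (long i = n - 1; i > 0; i--)
    {
      mp_limb_t low = sp[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}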
+ + .level 2.0n + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S new file mode 100644 index 0000000000..ab310c1264 --- /dev/null +++ b/rts/gmp/mpn/pa64/mul_1.S @@ -0,0 +1,158 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t2 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s new file mode 100644 index 0000000000..f0730e2a91 --- /dev/null +++ b/rts/gmp/mpn/pa64/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 
Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + extrd,u %r29,31,32,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s new file mode 100644 index 0000000000..dda1f54b34 --- /dev/null +++ b/rts/gmp/mpn/pa64/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
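A note on the return value of sub_n: on PA-RISC the carry bit after a subtract is set when no borrow occurred, so the sub,db instructions below chain the borrow through the loop in complemented form, and the closing add,dc / subi 1 pair converts that flag into the 0-or-1 borrow the function returns. The intended semantics, as a plain C sketch (assuming 64-bit limbs; not the code used here):

#include <stdint.h>

typedef uint64_t mp_limb_t;

/* Reference semantics of __gmpn_sub_n: r[] = s1[] - s2[], n > 0 limbs,
   returning the final borrow (0 or 1).  Sketch only.  */
static mp_limb_t
sub_n_ref (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, long n)
{
  mp_limb_t brw = 0;                     /* borrow in/out, 0 or 1 */
  for (long i = 0; i < n; i++)
    {
      mp_limb_t a = s1p[i], b = s2p[i];
      rp[i] = a - b - brw;
      brw = (a < b) || (brw && a == b);  /* borrow out of the 64-bit sub */
    }
  return brw;
}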
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0n + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + ldi 0,%r28 + .procend diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S new file mode 100644 index 0000000000..27666b99df --- /dev/null +++ b/rts/gmp/mpn/pa64/submul_1.S @@ -0,0 +1,170 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb -56(%r30) + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0n + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + fldd -56(%r30),%fr5 ; s2limb passed on stack + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + extrd,u cylimb,31,32,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + 
.procend diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c new file mode 100644 index 0000000000..1c9fe084db --- /dev/null +++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c @@ -0,0 +1,111 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800LL; + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater that our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S new file mode 100644 index 0000000000..ceff2d752f --- /dev/null +++ b/rts/gmp/mpn/pa64/umul_ppmm.S @@ -0,0 +1,74 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. 
+ +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0n + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + depd %r25,31,32,%r26 + std %r26,-64(%r30) + depd %r23,31,32,%r24 + std %r24,-56(%r30) + + ldw -180(%r30),%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + extrd,u p1,31,32,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64w/README b/rts/gmp/mpn/pa64w/README new file mode 100644 index 0000000000..cf590a7b98 --- /dev/null +++ b/rts/gmp/mpn/pa64w/README @@ -0,0 +1,2 @@ +This directory contains mpn functions for 64-bit PA-RISC 2.0 +using 64-bit pointers (2.0W). diff --git a/rts/gmp/mpn/pa64w/add_n.s b/rts/gmp/mpn/pa64w/add_n.s new file mode 100644 index 0000000000..1bb9e8fbc7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/add_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and +; store sum in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_add_n,entry +__gmpn_add_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + sub %r26,%r22,%r26 ; offset res_ptr + blr %r28,%r0 ; branch into loop + add %r0,%r0,%r0 ; reset carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + add,dc %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + add,dc %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S new file mode 100644 index 0000000000..4799f90fc5 --- /dev/null +++ b/rts/gmp/mpn/pa64w/addmul_1.S @@ -0,0 +1,168 @@ +; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_addmul_1,entry +__gmpn_addmul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,rlimb,rlimb + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + add t4,rlimb,t3 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,rlimb,rlimb + add,dc t2,hi,cylimb + add t4,rlimb,t3 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git 
a/rts/gmp/mpn/pa64w/gmp-mparam.h b/rts/gmp/mpn/pa64w/gmp-mparam.h new file mode 100644 index 0000000000..ee5a0a3ab7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/gmp-mparam.h @@ -0,0 +1,65 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values were measured on a PA8500 using the system compiler version + A.11.01.02. Presumably the PA8000 and PA8200 have the same timing + characteristic, but GCC might give somewhat different results.. */ +/* Generated by tuneup.c, 2000-07-25. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 105 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 83 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 58 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 134 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 56 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 26 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 1 +#endif diff --git a/rts/gmp/mpn/pa64w/lshift.s b/rts/gmp/mpn/pa64w/lshift.s new file mode 100644 index 0000000000..84f925a105 --- /dev/null +++ b/rts/gmp/mpn/pa64w/lshift.s @@ -0,0 +1,103 @@ +; HP-PA 2.0 __gmpn_lshift -- + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. 
+ + .level 2.0w + .code + .export __gmpn_lshift,entry +__gmpn_lshift + .proc + .callinfo frame=0,args_saved + .entry + + shladd %r24,3,%r25,%r25 + shladd %r24,3,%r26,%r26 + subi 64,%r23,%r23 + mtsar %r23 + ldd -8(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r0,%r21,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + add %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + add %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd -16(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-8(%r26) +L$7 ldd -24(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-16(%r26) +L$6 ldd -32(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-24(%r26) +L$5 ldd -40(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-32(%r26) +L$4 ldd -48(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-40(%r26) +L$3 ldd -56(%r25),%r21 + shrpd %r20,%r21,%sar,%r20 + std %r20,-48(%r26) +L$2 ldd -64(%r25),%r20 + shrpd %r21,%r20,%sar,%r21 + std %r21,-56(%r26) +L$1 ldd -72(%r25),%r21 + ldo -64(%r25),%r25 + shrpd %r20,%r21,%sar,%r20 + std %r20,-64(%r26) + addib,> -8,%r24,L$loop + ldo -64(%r26),%r26 + +L$end shrpd %r21,%r0,%sar,%r21 + std %r21,-8(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/mul_1.S b/rts/gmp/mpn/pa64w/mul_1.S new file mode 100644 index 0000000000..48f13fbd1b --- /dev/null +++ b/rts/gmp/mpn/pa64w/mul_1.S @@ -0,0 +1,159 @@ +; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and +; store the result in a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_mul_1,entry +__gmpn_mul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t3 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t2 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t3 + add,dc t2,hi,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/pa64w/rshift.s b/rts/gmp/mpn/pa64w/rshift.s new file mode 100644 index 0000000000..2517cb1f87 --- /dev/null +++ b/rts/gmp/mpn/pa64w/rshift.s @@ -0,0 +1,100 @@ +; HP-PA 2.0 __gmpn_rshift -- + +; Copyright (C) 1997, 2000 Free 
Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; size gr24 +; cnt gr23 + +; This runs at 1.5 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_rshift,entry +__gmpn_rshift + .proc + .callinfo frame=0,args_saved + .entry + + mtsar %r23 + ldd 0(%r25),%r21 + addib,= -1,%r24,L$end + shrpd %r21,%r0,%sar,%r29 ; compute carry out limb + depw,z %r24,31,3,%r28 ; r28 = (size & 7) + sub %r0,%r24,%r22 + depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + blr %r28,%r0 ; branch into jump table + sub %r26,%r22,%r26 ; offset res_ptr + b L$0 + nop + b L$1 + copy %r21,%r20 + b L$2 + nop + b L$3 + copy %r21,%r20 + b L$4 + nop + b L$5 + copy %r21,%r20 + b L$6 + nop + b L$7 + copy %r21,%r20 + +L$loop +L$0 ldd 8(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,0(%r26) +L$7 ldd 16(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,8(%r26) +L$6 ldd 24(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,16(%r26) +L$5 ldd 32(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,24(%r26) +L$4 ldd 40(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,32(%r26) +L$3 ldd 48(%r25),%r21 + shrpd %r21,%r20,%sar,%r20 + std %r20,40(%r26) +L$2 ldd 56(%r25),%r20 + shrpd %r20,%r21,%sar,%r21 + std %r21,48(%r26) +L$1 ldd 64(%r25),%r21 + ldo 64(%r25),%r25 + shrpd %r21,%r20,%sar,%r20 + std %r20,56(%r26) + addib,> -8,%r24,L$loop + ldo 64(%r26),%r26 + +L$end shrpd %r0,%r21,%sar,%r21 + std %r21,0(%r26) + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/sub_n.s b/rts/gmp/mpn/pa64w/sub_n.s new file mode 100644 index 0000000000..ad01e24aa7 --- /dev/null +++ b/rts/gmp/mpn/pa64w/sub_n.s @@ -0,0 +1,90 @@ +; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 +; and store difference in a third limb vector. + +; Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. 
+ + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; This runs at 2 cycles/limb on PA8000. + + .level 2.0w + .code + .export __gmpn_sub_n,entry +__gmpn_sub_n + .proc + .callinfo frame=0,args_saved + .entry + + sub %r0,%r23,%r22 + depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7) + depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7) + sub %r25,%r22,%r25 ; offset s1_ptr + sub %r24,%r22,%r24 ; offset s2_ptr + blr %r28,%r0 ; branch into loop + sub %r26,%r22,%r26 ; offset res_ptr and set carry + +L$loop ldd 0(%r25),%r20 + ldd 0(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,0(%r26) +L$7 ldd 8(%r25),%r21 + ldd 8(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,8(%r26) +L$6 ldd 16(%r25),%r20 + ldd 16(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,16(%r26) +L$5 ldd 24(%r25),%r21 + ldd 24(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,24(%r26) +L$4 ldd 32(%r25),%r20 + ldd 32(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,32(%r26) +L$3 ldd 40(%r25),%r21 + ldd 40(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,40(%r26) +L$2 ldd 48(%r25),%r20 + ldd 48(%r24),%r31 + sub,db %r20,%r31,%r20 + std %r20,48(%r26) +L$1 ldd 56(%r25),%r21 + ldo 64(%r25),%r25 + ldd 56(%r24),%r19 + sub,db %r21,%r19,%r21 + std %r21,56(%r26) + ldo 64(%r24),%r24 + addib,> -8,%r23,L$loop + ldo 64(%r26),%r26 + + add,dc %r0,%r0,%r29 + subi 1,%r29,%r29 + bve (%r2) + .exit + copy %r29,%r28 + .procend diff --git a/rts/gmp/mpn/pa64w/submul_1.S b/rts/gmp/mpn/pa64w/submul_1.S new file mode 100644 index 0000000000..294f6239b2 --- /dev/null +++ b/rts/gmp/mpn/pa64w/submul_1.S @@ -0,0 +1,171 @@ +; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +; INPUT PARAMETERS +#define rptr %r26 +#define sptr %r25 +#define size %r24 +#define s2limb %r23 + +; This runs at 11 cycles/limb on a PA8000. 
It might be possible to make +; it faster, but the PA8000 pipeline is not publically documented and it +; is very complex to reverse engineer + +#define t1 %r19 +#define rlimb %r20 +#define hi %r21 +#define lo %r22 +#define m0 %r28 +#define m1 %r3 +#define cylimb %r29 +#define t3 %r4 +#define t2 %r6 +#define t5 %r23 +#define t4 %r31 + .level 2.0w + .code + .export __gmpn_submul_1,entry +__gmpn_submul_1 + .proc + .callinfo frame=128,no_calls + .entry + std s2limb,-56(%r30) + fldd -56(%r30),%fr5 + ldo 128(%r30),%r30 + add %r0,%r0,cylimb ; clear cy and cylimb + + std %r3,-96(%r30) + std %r4,-88(%r30) + std %r5,-80(%r30) + std %r6,-72(%r30) + depdi,z 1,31,1,%r5 + + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd -128(%r30),lo ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),hi ; hi = high 64 bit of product + addib,= -1,%r24,L$end1 + nop + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + addib,= -1,%r24,L$end2 + nop +L$loop + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m1 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + fldd 0(sptr),%fr4 + ldo 8(sptr),sptr + std t3,0(rptr) + addib,<> -1,%r24,L$loop + ldo 8(rptr),rptr +L$end2 + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + ldd -128(%r30),lo ; lo = low 64 bit of product + add cylimb,t4,t4 + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + add,dc t2,hi,cylimb + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + sub rlimb,t4,t3 + add t4,t3,%r0 + ldd -104(%r30),hi ; hi = high 64 bit of product + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr +L$end1 + ldd 0(rptr),rlimb + extrd,u lo,31,32,t1 ; t1 = hi32(lo) + extrd,u lo,63,32,t4 ; t4 = lo32(lo) + add,l m0,t1,t1 ; t1 += m0 + add,l,*nuv m1,t1,t1 ; t1 += m0 + add,l %r5,hi,hi ; propagate carry + extrd,u t1,31,32,t2 ; t2 = hi32(t1) + depd,z t1,31,32,t5 ; t5 = lo32(t1) + add,l t5,t4,t4 ; t4 += lo32(t1) + add cylimb,t4,t4 + add,dc t2,hi,cylimb + sub rlimb,t4,t3 + add t4,t3,%r0 + add,dc %r0,cylimb,cylimb + std t3,0(rptr) + ldo 8(rptr),rptr + + ldd -96(%r30),%r3 + ldd -88(%r30),%r4 + ldd -80(%r30),%r5 + ldd -72(%r30),%r6 + + copy cylimb,%r28 + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff 
--git a/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/rts/gmp/mpn/pa64w/udiv_qrnnd.c new file mode 100644 index 0000000000..1852913000 --- /dev/null +++ b/rts/gmp/mpn/pa64w/udiv_qrnnd.c @@ -0,0 +1,117 @@ +/* +Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" + +#define TWO64 18446744073709551616.0 +#define TWO63 9223372036854775808.0 + +mp_limb_t +#if __STDC__ +__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r) +#else +__MPN(udiv_qrnnd) (n1, n0, d, r) + mp_limb_t n1; + mp_limb_t n0; + mp_limb_t d; + mp_limb_t *r; +#endif +{ + mp_limb_t q1, q2, q; + mp_limb_t p1, p0; + double di, dq; + + di = 1.0 / d; + + /* Generate upper 53 bits of quotient. Be careful here; the `double' + quotient may be rounded to 2^64 which we cannot safely convert back + to a 64-bit integer. */ + dq = (TWO64 * (double) n1 + (double) n0) * di; + if (dq >= TWO64) + q1 = 0xfffffffffffff800L; +#ifndef __GNUC__ + /* Work around HP compiler bug. */ + else if (dq > TWO63) + q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L; +#endif + else + q1 = (mp_limb_t) dq; + + /* Multiply back in order to compare the product to the dividend. */ + umul_ppmm (p1, p0, q1, d); + + /* Was the 53-bit quotient greater that our sought quotient? Test the + sign of the partial remainder to find out. */ + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + /* 53-bit quotient too large. Partial remainder is negative. + Compute the absolute value of the remainder in n1,,n0. */ + n1 = p1 - (n1 + (p0 < n0)); + n0 = p0 - n0; + + /* Now use the partial remainder as new dividend to compute more bits of + quotient. This is an adjustment for the one we got previously. */ + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 - q2; + if (n1 < p1 || (n1 == p1 && n0 <= p0)) + { + n0 = p0 - n0; + } + else + { + n0 = p0 - n0; + n0 += d; + q--; + } + } + else + { + n1 = n1 - (p1 + (n0 < p0)); + n0 = n0 - p0; + + q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di); + umul_ppmm (p1, p0, q2, d); + + q = q1 + q2; + if (n1 < p1 || (n1 == p1 && n0 < p0)) + { + n0 = n0 - p0; + n0 += d; + q--; + } + else + { + n0 = n0 - p0; + if (n0 >= d) + { + n0 -= d; + q++; + } + } + } + + *r = n0; + return q; +} diff --git a/rts/gmp/mpn/pa64w/umul_ppmm.S b/rts/gmp/mpn/pa64w/umul_ppmm.S new file mode 100644 index 0000000000..d9fb92be8c --- /dev/null +++ b/rts/gmp/mpn/pa64w/umul_ppmm.S @@ -0,0 +1,72 @@ +; Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. 
+ +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Lesser General Public License as published by +; the Free Software Foundation; either version 2.1 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +; License for more details. + +; You should have received a copy of the GNU Lesser General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +; MA 02111-1307, USA. + +#define p0 %r28 +#define p1 %r29 +#define t32 %r19 +#define t0 %r20 +#define t1 %r21 +#define x %r22 +#define m0 %r23 +#define m1 %r24 + .level 2.0w + .code + .export __gmpn_umul_ppmm,entry +__gmpn_umul_ppmm + .proc + .callinfo frame=128,no_calls + .entry + ldo 128(%r30),%r30 + std %r26,-64(%r30) + std %r25,-56(%r30) + + copy %r24,%r31 + + fldd -64(%r30),%fr4 + fldd -56(%r30),%fr5 + + xmpyu %fr5R,%fr4R,%fr6 + fstd %fr6,-128(%r30) + xmpyu %fr5R,%fr4L,%fr7 + fstd %fr7,-120(%r30) + xmpyu %fr5L,%fr4R,%fr8 + fstd %fr8,-112(%r30) + xmpyu %fr5L,%fr4L,%fr9 + fstd %fr9,-104(%r30) + + depdi,z 1,31,1,t32 ; t32 = 2^32 + + ldd -128(%r30),p0 ; lo = low 64 bit of product + ldd -120(%r30),m0 ; m0 = mid0 64 bit of product + ldd -112(%r30),m1 ; m1 = mid1 64 bit of product + ldd -104(%r30),p1 ; hi = high 64 bit of product + + add,l,*nuv m0,m1,x ; x = m1+m0 + add,l t32,p1,p1 ; propagate carry to mid of p1 + depd,z x,31,32,t0 ; lo32(m1+m0) + add t0,p0,p0 + extrd,u x,31,32,t1 ; hi32(m1+m0) + add,dc t1,p1,p1 + + std p0,0(%r31) ; store low half of product + copy p1,%r28 ; return high half of product + bve (%r2) + .exit + ldo -128(%r30),%r30 + .procend diff --git a/rts/gmp/mpn/power/add_n.s b/rts/gmp/mpn/power/add_n.s new file mode 100644 index 0000000000..0f9f48f1cc --- /dev/null +++ b/rts/gmp/mpn/power/add_n.s @@ -0,0 +1,79 @@ +# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .globl __gmpn_add_n + .globl .__gmpn_add_n + .csect __gmpn_add_n[DS] +__gmpn_add_n: + .long .__gmpn_add_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_add_n: + andil. 10,6,1 # odd or even number of limbs? 
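# The loop below is unrolled to handle two limbs per iteration (the shifted
# count goes into CTR); the low bit computed here decides whether the sum of
# the least significant limbs must be stored separately before entering it.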
+ l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + a 7,0,8 # add least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) + st 7,4(3) + aze 3,10 # use the fact that r10 is zero... + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. +L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + ae 7,0,8 # add limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + ae 11,9,10 # add previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + ae 7,0,8 # add previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: ae 11,9,10 # add limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + lil 3,0 # load cy into ... + aze 3,3 # ... return value register + br diff --git a/rts/gmp/mpn/power/addmul_1.s b/rts/gmp/mpn/power/addmul_1.s new file mode 100644 index 0000000000..8ecc651579 --- /dev/null +++ b/rts/gmp/mpn/power/addmul_1.s @@ -0,0 +1,122 @@ +# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
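The compensation described above can be written out in portable C. The sketch below is illustrative only and not part of the GMP sources; the name umul32_via_signed is a placeholder, and the signed 64-bit product stands in for what the POWER mul/mfmq pair delivers.

#include <stdint.h>

static uint64_t
umul32_via_signed (uint32_t a, uint32_t b)
{
  /* The signed 32x32->64 product, as produced by mul/mfmq. */
  uint64_t prod = (uint64_t) ((int64_t) (int32_t) a * (int32_t) b);
  uint32_t hi = (uint32_t) (prod >> 32);
  uint32_t lo = (uint32_t) prod;

  /* Compensation: add the other operand to the high word whenever an
     operand has its most significant bit set; the 2^64 cross term
     vanishes modulo 2^64. */
  if ((int32_t) a < 0)
    hi += b;
  if ((int32_t) b < 0)
    hi += a;

  return ((uint64_t) hi << 32) | lo;
}

The assembly applies the same adjustments through the bge/cax pairs, which leave the carry flag set by the limb additions untouched.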
+ + .toc + .globl __gmpn_addmul_1 + .globl .__gmpn_addmul_1 + .csect __gmpn_addmul_1[DS] +__gmpn_addmul_1: + .long .__gmpn_addmul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_addmul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 # add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/lshift.s b/rts/gmp/mpn/power/lshift.s new file mode 100644 index 0000000000..ab71fb7727 --- /dev/null +++ b/rts/gmp/mpn/power/lshift.s @@ -0,0 +1,56 @@ +# IBM POWER __gmpn_lshift -- + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
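The routine computes a plain multi-limb left shift, working from the most significant limb downward. A minimal C sketch of the same semantics follows; it is illustrative only (not from the GMP sources), with limb_t and ref_lshift as placeholder names and 32-bit limbs as on this target.

#include <stdint.h>

typedef uint32_t limb_t;   /* POWER uses 32-bit limbs */

/* {rp,n} = {sp,n} << cnt, with 0 < cnt < 32; returns the bits shifted
   out of the most significant limb.  Limbs are processed top down. */
static limb_t
ref_lshift (limb_t *rp, const limb_t *sp, long n, unsigned cnt)
{
  limb_t ret = sp[n - 1] >> (32 - cnt);
  for (long i = n - 1; i > 0; i--)
    rp[i] = (sp[i] << cnt) | (sp[i - 1] >> (32 - cnt));
  rp[0] = sp[0] << cnt;
  return ret;
}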
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .globl __gmpn_lshift + .globl .__gmpn_lshift + .csect __gmpn_lshift[DS] +__gmpn_lshift: + .long .__gmpn_lshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_lshift: + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + lu 0,-4(4) # read most significant limb + sre 3,0,8 # compute carry out limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,-4(4) # read 2:nd most significant limb + sreq 7,0,8 # compute most significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,-4(4) # load next lower limb + stu 7,-4(9) # store previous result during read latency + sreq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,-4(9) # store 2:nd least significant limb +Lend2: sle 7,0,6 # compute least significant limb + st 7,-4(9) # store it" \ + br diff --git a/rts/gmp/mpn/power/mul_1.s b/rts/gmp/mpn/power/mul_1.s new file mode 100644 index 0000000000..4e08ade583 --- /dev/null +++ b/rts/gmp/mpn/power/mul_1.s @@ -0,0 +1,109 @@ +# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
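Leaving the signed-multiply compensation aside, the function itself is a plain multiply-and-carry loop. A C sketch of the semantics, illustrative only (limb_t and ref_mul_1 are placeholder names, 32-bit limbs assumed):

#include <stdint.h>

typedef uint32_t limb_t;

/* {rp,n} = {sp,n} * v; returns the carry-out (high) limb. */
static limb_t
ref_mul_1 (limb_t *rp, const limb_t *sp, long n, limb_t v)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) sp[i] * v + cy;  /* fits in 64 bits */
      rp[i] = (limb_t) p;
      cy = (limb_t) (p >> 32);
    }
  return cy;
}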
+ + .toc + .globl __gmpn_mul_1 + .globl .__gmpn_mul_1 + .csect __gmpn_mul_1[DS] +__gmpn_mul_1: + .long .__gmpn_mul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_mul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 # reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/rshift.s b/rts/gmp/mpn/power/rshift.s new file mode 100644 index 0000000000..65b3945f8a --- /dev/null +++ b/rts/gmp/mpn/power/rshift.s @@ -0,0 +1,54 @@ +# IBM POWER __gmpn_rshift -- + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .globl __gmpn_rshift + .globl .__gmpn_rshift + .csect __gmpn_rshift[DS] +__gmpn_rshift: + .long .__gmpn_rshift, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_rshift: + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + l 0,0(4) # read least significant limb + ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s + sle 3,0,8 # compute carry limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,4(4) # read 2:nd least significant limb + sleq 7,0,8 # compute least significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,4(4) # load next higher limb + stu 7,4(9) # store previous result during read latency + sleq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,4(9) # store 2:nd most significant limb +Lend2: sre 7,0,6 # compute most significant limb + st 7,4(9) # store it" \ + br diff --git a/rts/gmp/mpn/power/sdiv.s b/rts/gmp/mpn/power/sdiv.s new file mode 100644 index 0000000000..81da622fbc --- /dev/null +++ b/rts/gmp/mpn/power/sdiv.s @@ -0,0 +1,34 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. 
+ +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + .toc + .globl __sdiv_qrnnd + .globl .__sdiv_qrnnd + .csect __sdiv_qrnnd[DS] +__sdiv_qrnnd: + .long .__sdiv_qrnnd, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__sdiv_qrnnd: + mtmq 5 + div 0,4,6 + mfmq 9 + st 9,0(3) + mr 3,0 + br diff --git a/rts/gmp/mpn/power/sub_n.s b/rts/gmp/mpn/power/sub_n.s new file mode 100644 index 0000000000..aa09cf5bc1 --- /dev/null +++ b/rts/gmp/mpn/power/sub_n.s @@ -0,0 +1,80 @@ +# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation, +# Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .globl __gmpn_sub_n + .globl .__gmpn_sub_n + .csect __gmpn_sub_n[DS] +__gmpn_sub_n: + .long .__gmpn_sub_n, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_sub_n: + andil. 10,6,1 # odd or even number of limbs? + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before it's used + sri 10,6,1 # count for unrolled loop + sf 7,0,8 # subtract least significant limbs, set cy + mtctr 10 # copy count into CTR + beq 0,Leven # branch if even # of limbs (# of limbs >= 2) + +# We have an odd # of limbs. Add the first limbs separately. + cmpi 1,10,0 # is count for unrolled loop zero? + bc 4,6,L1 # bne cr1,L1 (misassembled by gas) + st 7,4(3) + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br # return + +# We added least significant limbs. Now reload the next limbs to enter loop. 
+L1: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) + sfe 7,0,8 # subtract limbs, set cy +Leven: lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + bdz Lend # If done, skip loop + +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + sfe 11,10,9 # subtract previous limbs with cy, set cy + stu 7,4(3) # + lu 9,4(4) # load s1 limb and update s1_ptr + lu 10,4(5) # load s2 limb and update s2_ptr + sfe 7,0,8 # subtract previous limbs with cy, set cy + stu 11,4(3) # + bdn Loop # decrement CTR and loop back + +Lend: sfe 11,10,9 # subtract limbs with cy, set cy + st 7,4(3) # + st 11,8(3) # + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br diff --git a/rts/gmp/mpn/power/submul_1.s b/rts/gmp/mpn/power/submul_1.s new file mode 100644 index 0000000000..bc01b7c95d --- /dev/null +++ b/rts/gmp/mpn/power/submul_1.s @@ -0,0 +1,127 @@ +# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The POWER architecture has no unsigned 32x32->64 bit multiplication +# instruction. To obtain that operation, we have to use the 32x32->64 signed +# multiplication instruction, and add the appropriate compensation to the high +# limb of the result. We add the multiplicand if the multiplier has its most +# significant bit set, and we add the multiplier if the multiplicand has its +# most significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit we +# can branch in zero cycles, so that's how we perform the additions. 
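Independent of the POWER-specific details, the routine computes {rp,n} -= {sp,n} * v and returns the borrow as a limb. A C sketch of those semantics, illustrative only (limb_t and ref_submul_1 are placeholder names, 32-bit limbs assumed):

#include <stdint.h>

typedef uint32_t limb_t;

/* {rp,n} -= {sp,n} * v; returns the borrow-out limb. */
static limb_t
ref_submul_1 (limb_t *rp, const limb_t *sp, long n, limb_t v)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) sp[i] * v + cy;  /* fits in 64 bits */
      limb_t plo = (limb_t) p;
      cy = (limb_t) (p >> 32);
      limb_t r = rp[i];
      rp[i] = r - plo;
      cy += r < plo;   /* borrow from this limb; cy cannot wrap, since the
                          high part only reaches 2^32-1 when plo is 0 */
    }
  return cy;
}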
+ + .toc + .globl __gmpn_submul_1 + .globl .__gmpn_submul_1 + .csect __gmpn_submul_1[DS] +__gmpn_submul_1: + .long .__gmpn_submul_1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__gmpn_submul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 # invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/rts/gmp/mpn/power/umul.s b/rts/gmp/mpn/power/umul.s new file mode 100644 index 0000000000..8c77496380 --- /dev/null +++ b/rts/gmp/mpn/power/umul.s @@ -0,0 +1,38 @@ +# Copyright (C) 1999 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + .toc + .globl __umul_ppmm + .globl .__umul_ppmm + .csect __umul_ppmm[DS] +__umul_ppmm: + .long .__umul_ppmm, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.__umul_ppmm: + mul 9,4,5 + srai 0,4,31 + and 0,0,5 + srai 5,5,31 + and 5,5,4 + cax 0,0,5 + mfmq 11 + st 11,0(3) + cax 3,9,0 + br diff --git a/rts/gmp/mpn/powerpc32/add_n.asm b/rts/gmp/mpn/powerpc32/add_n.asm new file mode 100644 index 0000000000..81ed04b162 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/add_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and +dnl store sum in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 C copy size into CTR + addic r0,r0,0 C clear cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + adde r7,r0,r8 C add limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + adde r7,r10,r9 C add limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: adde r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +.Lexit: adde r7,r10,r9 + stw r7,8(r3) + li r3,0 C load cy into ... + addze r3,r3 C ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/powerpc32/addmul_1.asm b/rts/gmp/mpn/powerpc32/addmul_1.asm new file mode 100644 index 0000000000..3ef75b1532 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/addmul_1.asm @@ -0,0 +1,124 @@ +dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpi cr0,r5,9 C more than 9 limbs? 
+ bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + addc r8,r7,r9 + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + addc r8,r7,r9 + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + addc r8,r8,r7 + stw r8,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stw r8,4(r3) + adde r9,r9,r12 + stw r9,8(r3) + adde r10,r10,r30 + stw r10,12(r3) + adde r11,r11,r31 + stwu r11,16(r3) + bdnz .LloopU + + andi. r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + addc r8,r8,r7 + stwu r8,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/powerpc32/aix.m4 b/rts/gmp/mpn/powerpc32/aix.m4 new file mode 100644 index 0000000000..2bd8425817 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/aix.m4 @@ -0,0 +1,39 @@ +divert(-1) +dnl m4 macros for AIX 32-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + +define(`ASM_START', + `.toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],2 +$1: + .long .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/rts/gmp/mpn/powerpc32/gmp-mparam.h b/rts/gmp/mpn/powerpc32/gmp-mparam.h new file mode 100644 index 0000000000..b283185789 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/gmp-mparam.h @@ -0,0 +1,66 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* These values are for the 604. Presumably, these should be considerably + different for the 603 and 750 that have much slower multiply + instructions. */ + +/* Generated by tuneup.c, 2000-05-26. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */ +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 228 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */ +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 262 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 52 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 86 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 23 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 7 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 53 +#endif diff --git a/rts/gmp/mpn/powerpc32/lshift.asm b/rts/gmp/mpn/powerpc32/lshift.asm new file mode 100644 index 0000000000..73a85430ab --- /dev/null +++ b/rts/gmp/mpn/powerpc32/lshift.asm @@ -0,0 +1,145 @@ +dnl PowerPC-32 mpn_lshift -- Shift a number left. + +dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpi cr0,r5,12 C more than 12 limbs? 
+ slwi r0,r5,2 + add r4,r4,r0 C make r4 point at end of s1 + add r7,r3,r0 C make r7 point at end of res + bgt .LBIG C branch if more than 12 limbs + + mtctr r5 C copy size into CTR + subfic r8,r6,32 + lwzu r11,-4(r4) C load first s1 limb + srw r3,r11,r8 C compute function return value + bdz .Lend1 + +.Loop: lwzu r10,-4(r4) + slw r9,r11,r6 + srw r12,r10,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdz .Lend2 + lwzu r11,-4(r4) + slw r9,r10,r6 + srw r12,r11,r8 + or r9,r9,r12 + stwu r9,-4(r7) + bdnz .Loop + +.Lend1: slw r0,r11,r6 + stw r0,-4(r7) + blr +.Lend2: slw r0,r10,r6 + stw r0,-4(r7) + blr + +.LBIG: + stmw r24,-32(r1) C save registers we are supposed to preserve + lwzu r9,-4(r4) + subfic r8,r6,32 + srw r3,r9,r8 C compute function return value + slw r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 C count for spill loop + beq .Le + mtctr r10 + lwzu r28,-4(r4) + bdz .Lxe0 + +.Loop0: slw r12,r28,r6 + srw r24,r28,r8 + lwzu r28,-4(r4) + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + bdnz .Loop0 C taken at most once! + +.Lxe0: slw r12,r28,r6 + srw r24,r28,r8 + or r24,r0,r24 + stwu r24,-4(r7) + mr r0,r12 + +.Le: srwi r5,r5,2 C count for unrolled loop + addi r5,r5,-1 + mtctr r5 + lwz r28,-4(r4) + lwz r29,-8(r4) + lwz r30,-12(r4) + lwzu r31,-16(r4) + +.LoopU: slw r9,r28,r6 + srw r24,r28,r8 + lwz r28,-4(r4) + slw r10,r29,r6 + srw r25,r29,r8 + lwz r29,-8(r4) + slw r11,r30,r6 + srw r26,r30,r8 + lwz r30,-12(r4) + slw r12,r31,r6 + srw r27,r31,r8 + lwzu r31,-16(r4) + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + bdnz .LoopU + + slw r9,r28,r6 + srw r24,r28,r8 + slw r10,r29,r6 + srw r25,r29,r8 + slw r11,r30,r6 + srw r26,r30,r8 + slw r12,r31,r6 + srw r27,r31,r8 + or r24,r0,r24 + stw r24,-4(r7) + or r25,r9,r25 + stw r25,-8(r7) + or r26,r10,r26 + stw r26,-12(r7) + or r27,r11,r27 + stwu r27,-16(r7) + mr r0,r12 + + stw r0,-4(r7) + lmw r24,-32(r1) C restore registers + blr +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/powerpc32/mul_1.asm b/rts/gmp/mpn/powerpc32/mul_1.asm new file mode 100644 index 0000000000..ec878b54d5 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/mul_1.asm @@ -0,0 +1,86 @@ +dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604 but it runs decently even on PPC601. It +dnl has not been tested on a PPC603 since I don't have access to any such +dnl machines. 
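As a usage note: this entry point provides mpn_mul_1, one of the low-level mpn functions declared in gmp.h. The fragment below is an illustrative caller only (the limb values are arbitrary); it multiplies a number stored as a little-endian limb array by a single word and collects the carry-out.

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* With 32-bit limbs (as on PowerPC-32): a = 2^96 + 5, least
     significant limb first. */
  mp_limb_t a[4] = { 5, 0, 0, 1 };
  mp_limb_t r[4];

  mp_limb_t carry = mpn_mul_1 (r, a, 4, 10);   /* {r,4} = {a,4} * 10 */
  printf ("carry limb = %lu\n", (unsigned long) carry);
  return 0;
}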
+ +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr r5 + addi r3,r3,-4 C adjust res_ptr, it's offset before it's used + li r12,0 C clear upper product reg + addic r0,r0,0 C clear cy +C Start software pipeline + lwz r8,0(r4) + bdz .Lend3 + stmw r30,-8(r1) C save registers we are supposed to preserve + lwzu r9,4(r4) + mullw r11,r8,r6 + mulhwu r0,r8,r6 + bdz .Lend1 +C Software pipelined main loop +.Loop: lwz r8,4(r4) + mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + bdz .Lend2 + lwzu r9,8(r4) + mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + bdnz .Loop +C Finish software pipeline +.Lend1: mullw r10,r9,r6 + adde r30,r11,r12 + mulhwu r12,r9,r6 + stw r30,4(r3) + adde r31,r10,r0 + stwu r31,8(r3) + addze r3,r12 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend2: mullw r11,r8,r6 + adde r31,r10,r0 + mulhwu r0,r8,r6 + stwu r31,8(r3) + adde r30,r11,r12 + stw r30,4(r3) + addze r3,r0 + lmw r30,-8(r1) C restore registers from stack + blr +.Lend3: mullw r11,r8,r6 + stw r11,4(r3) + mulhwu r3,r8,r6 + blr +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/powerpc32/regmap.m4 b/rts/gmp/mpn/powerpc32/regmap.m4 new file mode 100644 index 0000000000..978f18902a --- /dev/null +++ b/rts/gmp/mpn/powerpc32/regmap.m4 @@ -0,0 +1,34 @@ +divert(-1) + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Map register names r0, r1, etc, to just `0', `1', etc. +dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X +forloop(i,0,31, +`define(`r'i,i)' +) + +dnl Likewise for cr0, cr1, etc. +forloop(i,0,7, +`define(`cr'i,i)' +) + +divert diff --git a/rts/gmp/mpn/powerpc32/rshift.asm b/rts/gmp/mpn/powerpc32/rshift.asm new file mode 100644 index 0000000000..a09ba04938 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/rshift.asm @@ -0,0 +1,60 @@ +dnl PowerPC-32 mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 C copy size into CTR + addi r7,r3,-4 C move adjusted res_ptr to free return reg + subfic r8,r6,32 + lwz r11,0(r4) C load first s1 limb + slw r3,r11,r8 C compute function return value + bdz .Lend1 + +.Loop: lwzu r10,4(r4) + srw r9,r11,r6 + slw r12,r10,r8 + or r9,r9,r12 + stwu r9,4(r7) + bdz .Lend2 + lwzu r11,4(r4) + srw r9,r10,r6 + slw r12,r11,r8 + or r9,r9,r12 + stwu r9,4(r7) + bdnz .Loop + +.Lend1: srw r0,r11,r6 + stw r0,4(r7) + blr + +.Lend2: srw r0,r10,r6 + stw r0,4(r7) + blr +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/powerpc32/sub_n.asm b/rts/gmp/mpn/powerpc32/sub_n.asm new file mode 100644 index 0000000000..b04b4192ef --- /dev/null +++ b/rts/gmp/mpn/powerpc32/sub_n.asm @@ -0,0 +1,61 @@ +dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +dnl and store difference in a third limb vector. + +dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl s2_ptr r5 +dnl size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 C copy size into CTR + addic r0,r6,-1 C set cy + lwz r8,0(r4) C load least significant s1 limb + lwz r0,0(r5) C load least significant s2 limb + addi r3,r3,-4 C offset res_ptr, it's updated before it's used + bdz .Lend C If done, skip loop +.Loop: lwz r9,4(r4) C load s1 limb + lwz r10,4(r5) C load s2 limb + subfe r7,r0,r8 C subtract limbs with cy, set cy + stw r7,4(r3) C store result limb + bdz .Lexit C decrement CTR and exit if done + lwzu r8,8(r4) C load s1 limb and update s1_ptr + lwzu r0,8(r5) C load s2 limb and update s2_ptr + subfe r7,r10,r9 C subtract limbs with cy, set cy + stwu r7,8(r3) C store result limb and update res_ptr + bdnz .Loop C decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + stw r7,4(r3) C store ultimate result limb + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +.Lexit: subfe r7,r10,r9 + stw r7,8(r3) + subfe r3,r0,r0 C load !cy into ... + subfic r3,r3,0 C ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/powerpc32/submul_1.asm b/rts/gmp/mpn/powerpc32/submul_1.asm new file mode 100644 index 0000000000..a129e9f9ea --- /dev/null +++ b/rts/gmp/mpn/powerpc32/submul_1.asm @@ -0,0 +1,130 @@ +dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. 
+ +dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +dnl INPUT PARAMETERS +dnl res_ptr r3 +dnl s1_ptr r4 +dnl size r5 +dnl s2_limb r6 + +dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 +dnl or PPC750 since I don't have access to any such machines. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + cmpi cr0,r5,9 C more than 9 limbs? + bgt cr0,.Lbig C branch if more than 9 limbs + + mtctr r5 + lwz r0,0(r4) + mullw r7,r0,r6 + mulhwu r10,r0,r6 + lwz r9,0(r3) + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + addi r3,r3,-4 + bdz .Lend +.Lloop: + lwzu r0,4(r4) + stwu r8,4(r3) + mullw r8,r0,r6 + adde r7,r8,r10 + mulhwu r10,r0,r6 + lwz r9,4(r3) + addze r10,r10 + subfc r8,r7,r9 + addc r7,r7,r8 C invert cy (r7 is junk) + bdnz .Lloop +.Lend: stw r8,4(r3) + addze r3,r10 + blr + +.Lbig: stmw r30,-32(r1) + addi r5,r5,-1 + srwi r0,r5,2 + mtctr r0 + + lwz r7,0(r4) + mullw r8,r7,r6 + mulhwu r0,r7,r6 + lwz r7,0(r3) + subfc r7,r8,r7 + addc r8,r8,r7 + stw r7,0(r3) + +.LloopU: + lwz r7,4(r4) + lwz r12,8(r4) + lwz r30,12(r4) + lwzu r31,16(r4) + mullw r8,r7,r6 + mullw r9,r12,r6 + mullw r10,r30,r6 + mullw r11,r31,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + adde r9,r9,r0 + mulhwu r0,r12,r6 + lwz r12,8(r3) + adde r10,r10,r0 + mulhwu r0,r30,r6 + lwz r30,12(r3) + adde r11,r11,r0 + mulhwu r0,r31,r6 + lwz r31,16(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + stw r7,4(r3) + subfe r12,r9,r12 + stw r12,8(r3) + subfe r30,r10,r30 + stw r30,12(r3) + subfe r31,r11,r31 + stwu r31,16(r3) + subfe r11,r11,r11 C invert ... + addic r11,r11,1 C ... carry + bdnz .LloopU + + andi. r31,r5,3 + mtctr r31 + beq cr0,.Lendx + +.LloopE: + lwzu r7,4(r4) + mullw r8,r7,r6 + adde r8,r8,r0 C add cy_limb + mulhwu r0,r7,r6 + lwz r7,4(r3) + addze r0,r0 C new cy_limb + subfc r7,r8,r7 + addc r8,r8,r7 + stwu r7,4(r3) + bdnz .LloopE +.Lendx: + addze r3,r0 + lmw r30,-32(r1) + blr +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/powerpc32/umul.asm b/rts/gmp/mpn/powerpc32/umul.asm new file mode 100644 index 0000000000..eeaa0a4dc8 --- /dev/null +++ b/rts/gmp/mpn/powerpc32/umul.asm @@ -0,0 +1,32 @@ +dnl PowerPC-32 umul_ppmm -- support for longlong.h + +dnl Copyright (C) 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published by +dnl the Free Software Foundation; either version 2.1 of the License, or (at your +dnl option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + mullw 0,4,5 + mulhwu 9,4,5 + stw 0,0(3) + mr 3,9 + blr +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/powerpc64/README b/rts/gmp/mpn/powerpc64/README new file mode 100644 index 0000000000..c779276917 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/README @@ -0,0 +1,36 @@ +PPC630 (aka Power3) pipeline information: + +Decoding is 4-way and issue is 8-way with some out-of-order capability. +LS1 - ld/st unit 1 +LS2 - ld/st unit 2 +FXU1 - integer unit 1, handles any simple integer instructions +FXU2 - integer unit 2, handles any simple integer instructions +FXU3 - integer unit 3, handles integer multiply and divide +FPU1 - floating-point unit 1 +FPU2 - floating-point unit 2 + +Memory: Any two memory operations can issue, but memory subsystem + can sustain just one store per cycle. +Simple integer: 2 operations (such as add, rl*) +Integer multiply: 1 operation every 9th cycle worst case; exact timing depends + on 2nd operand most significant bit position (10 bits per + cycle). Multiply unit is not pipelined, only one multiply + operation in progress is allowed. +Integer divide: ? +Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, fmadd) + Latency = 4. +Floating-point divide: + ? +Floating-point square root: + ? + +Best possible times for the main loops: +shift: 1.5 cycles limited by integer unit contention. + With 63 special loops, one for each shift count, we could + reduce the needed integer instructions to 2, which would + reduce the best possible time to 1 cycle. +add/sub: 1.5 cycles, limited by ld/st unit contention. +mul: 18 cycles (average) unless floating-point operations are used, + but that would only help for multiplies of perhaps 10 and more + limbs. +addmul/submul:Same situation as for mul. diff --git a/rts/gmp/mpn/powerpc64/add_n.asm b/rts/gmp/mpn/powerpc64/add_n.asm new file mode 100644 index 0000000000..c3325376dc --- /dev/null +++ b/rts/gmp/mpn/powerpc64/add_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and +# store sum in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
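The operation is the usual carry-propagating vector add. A C sketch of the semantics, illustrative only (limb_t and ref_add_n are placeholder names, 64-bit limbs as on this target):

#include <stdint.h>

typedef uint64_t limb_t;   /* powerpc64 uses 64-bit limbs */

/* {rp,n} = {s1,n} + {s2,n}; returns the final carry (0 or 1). */
static limb_t
ref_add_n (limb_t *rp, const limb_t *s1, const limb_t *s2, long n)
{
  limb_t cy = 0;
  for (long i = 0; i < n; i++)
    {
      limb_t a = s1[i], b = s2[i];
      limb_t s = a + b;
      limb_t c1 = s < a;    /* carry from a + b */
      s += cy;
      limb_t c2 = s < cy;   /* carry from adding the incoming carry */
      rp[i] = s;
      cy = c1 | c2;
    }
  return cy;
}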
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_add_n) + mtctr r6 # copy size into CTR + addic r0,r0,0 # clear cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + adde r7,r0,r8 # add limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + adde r7,r10,r9 # add limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: adde r7,r0,r8 + std r7,8(r3) # store ultimate result limb + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +.Lexit: adde r7,r10,r9 + std r7,16(r3) + li r3,0 # load cy into ... + addze r3,r3 # ... return value register + blr +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/powerpc64/addmul_1.asm b/rts/gmp/mpn/powerpc64/addmul_1.asm new file mode 100644 index 0000000000..81774482fe --- /dev/null +++ b/rts/gmp/mpn/powerpc64/addmul_1.asm @@ -0,0 +1,52 @@ +# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_addmul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + addc 7,7,10 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/powerpc64/addsub_n.asm b/rts/gmp/mpn/powerpc64/addsub_n.asm new file mode 100644 index 0000000000..4ed40d71ae --- /dev/null +++ b/rts/gmp/mpn/powerpc64/addsub_n.asm @@ -0,0 +1,107 @@ +# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`asm-syntax.m4') + +define(SAVE_BORROW_RESTORE_CARRY, + `sldi $1,$1,63 + adde $1,$1,$1') +define(SAVE_CARRY_RESTORE_BORROW, + `sldi $1,$1,63 + adde $1,$1,$1') + +# 19991117 + +# This is just crafted for testing some ideas, and verifying that we can make +# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good. +# We should play a little with the schedule. No time has been spent on that. + +# To finish this, the loop warm up and cool down code needs to be written, +# and the result need to be tested. Also, the proper calling sequence should +# be used. + +# r1p r2p s1p s2p n +# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12 + +ASM_START() +PROLOGUE(mpn_addsub_n) + std r14,-64(1) + std r15,-56(1) + std r16,-48(1) + std r17,-40(1) + std r18,-32(1) + std r19,-24(1) + + srdi r7,r7,2 + mtctr r7 # copy size into CTR + addic r0,r0,0 # clear cy + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + addi r4,r4,-8 # offset res_ptr, it's updated before it's used + +.Loop: + adde r12,r8,r9 + std r12,8(r3) + adde r12,r10,r11 + std r12,16(r3) + + SAVE_CARRY_RESTORE_BORROW(r0) + + subfe r12,r8,r9 + std r12,8(r4) + ld r8,8(r5) # s1 L 1 + ld r9,8(r6) # s2 L 1 + subfe r12,r10,r11 + std r12,16(r4) + ld r10,16(r5) # s1 L 2 + ld r11,16(r6) # s2 L 2 +# pair ------------------------- + subfe r12,r14,r15 + std r12,24(r4) + subfe r12,r16,r17 + stdu r12,32(r4) + + SAVE_BORROW_RESTORE_CARRY(r0) + + adde r12,r14,r15 + std r12,24(r3) + ld r14,24(r5) # s1 L 3 + ld r15,24(r6) # s2 L 3 + adde r12,r16,r17 + stdu r12,32(r3) + ldu r16,32(r5) # s1 L 4 + ldu r17,32(r6) # s2 L 4 + bdnz .Loop + + ld r14,-64(1) + ld r15,-56(1) + ld r16,-48(1) + ld r17,-40(1) + ld r18,-32(1) + ld r19,-24(1) + blr +EPILOGUE(mpn_addsub_n) diff --git a/rts/gmp/mpn/powerpc64/aix.m4 b/rts/gmp/mpn/powerpc64/aix.m4 new file mode 100644 index 0000000000..aee9f1f97a --- /dev/null +++ b/rts/gmp/mpn/powerpc64/aix.m4 @@ -0,0 +1,40 @@ +divert(-1) +dnl m4 macros for AIX 64-bit assembly. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
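As its comments say, the addsub loop above is an unfinished experiment: the loop warm-up and cool-down and the calling sequence are still open. For orientation, here is a hedged sketch of what a finished routine of this kind would compute, producing the sum and the difference in a single pass; the operand order of the subtraction and the way carry and borrow are reported are choices made for this sketch, not taken from the file, and 32-bit limbs are used for brevity:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    void ref_addsub_n(limb_t *sum, limb_t *dif,
                      const limb_t *s1, const limb_t *s2, size_t n,
                      limb_t *carry_out, limb_t *borrow_out)
    {
        limb_t cy = 0, bw = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;   /* adde chain  */
            sum[i] = (limb_t)s;
            cy = (limb_t)(s >> 32);
            uint64_t d = (uint64_t)s1[i] - s2[i] - bw;   /* subfe chain */
            dif[i] = (limb_t)d;
            bw = (limb_t)((d >> 32) & 1);                /* 1 on wrap-around */
        }
        *carry_out = cy;
        *borrow_out = bw;
    }

Doing both chains in one pass is what lets the assembly reuse each loaded s1/s2 limb twice, at the cost of the sldi/adde trick for parking one of the two carry bits while the other is live.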
+ +define(`ASM_START', + `.machine "ppc64" + .toc') + +define(`PROLOGUE', + ` + .globl $1 + .globl .$1 + .csect $1[DS],3 +$1: + .llong .$1, TOC[tc0], 0 + .csect .text[PR] + .align 2 +.$1:') + +define(`EPILOGUE', `') + +divert diff --git a/rts/gmp/mpn/powerpc64/copyd.asm b/rts/gmp/mpn/powerpc64/copyd.asm new file mode 100644 index 0000000000..d06e8c25fd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/copyd.asm @@ -0,0 +1,45 @@ +# PowerPC-64 mpn_copyd -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyd) + cmpdi cr0,r5,0 + mtctr r5 + sldi r5,r5,3 + add r4,r4,r5 + add r3,r3,r5 + beq cr0,.Lend +.Loop: ldu r0,-8(r4) + stdu r0,-8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyd) diff --git a/rts/gmp/mpn/powerpc64/copyi.asm b/rts/gmp/mpn/powerpc64/copyi.asm new file mode 100644 index 0000000000..a1bedc4c5b --- /dev/null +++ b/rts/gmp/mpn/powerpc64/copyi.asm @@ -0,0 +1,44 @@ +# PowerPC-64 mpn_copyi -- Copy a limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr r3 +# sptr r4 +# n r5 + +include(`../config.m4') + +# Unrolling this analogous to sparc64/copyi.s doesn't help for any +# operand sizes. + +ASM_START() +PROLOGUE(mpn_copyi) + cmpdi cr0,r5,0 + mtctr r5 + addi r4,r4,-8 + addi r3,r3,-8 + beq cr0,.Lend +.Loop: ldu r0,8(r4) + stdu r0,8(r3) + bdnz .Loop +.Lend: blr +EPILOGUE(mpn_copyi) diff --git a/rts/gmp/mpn/powerpc64/gmp-mparam.h b/rts/gmp/mpn/powerpc64/gmp-mparam.h new file mode 100644 index 0000000000..6fefb960cd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/gmp-mparam.h @@ -0,0 +1,62 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Generated by tuneup.c, 2000-07-16. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 10 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 57 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 16 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 28 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 216 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 14 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 6 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 163 +#endif diff --git a/rts/gmp/mpn/powerpc64/lshift.asm b/rts/gmp/mpn/powerpc64/lshift.asm new file mode 100644 index 0000000000..cef3a81fdd --- /dev/null +++ b/rts/gmp/mpn/powerpc64/lshift.asm @@ -0,0 +1,159 @@ +# PowerPC-64 mpn_lshift -- Shift a number left. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_lshift) + cmpdi cr0,r5,20 # more than 20 limbs? 
+ sldi r0,r5,3 + add r4,r4,r0 # make r4 point at end of s1 + add r7,r3,r0 # make r7 point at end of res + bgt .LBIG # branch if more than 12 limbs + + mtctr r5 # copy size into CTR + subfic r8,r6,64 + ldu r11,-8(r4) # load first s1 limb + srd r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,-8(r4) + sld r9,r11,r6 + srd r12,r10,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdz .Lend2 + ldu r11,-8(r4) + sld r9,r10,r6 + srd r12,r11,r8 + or r9,r9,r12 + stdu r9,-8(r7) + bdnz .Loop + +.Lend1: sld r0,r11,r6 + std r0,-8(r7) + blr +.Lend2: sld r0,r10,r6 + std r0,-8(r7) + blr + +.LBIG: + std r24,-64(1) + std r25,-56(1) + std r26,-48(1) + std r27,-40(1) + std r28,-32(1) + std r29,-24(1) + std r30,-16(1) + std r31,-8(1) + ldu r9,-8(r4) + subfic r8,r6,64 + srd r3,r9,r8 # compute function return value + sld r0,r9,r6 + addi r5,r5,-1 + + andi. r10,r5,3 # count for spill loop + beq .Le + mtctr r10 + ldu r28,-8(r4) + bdz .Lxe0 + +.Loop0: sld r12,r28,r6 + srd r24,r28,r8 + ldu r28,-8(r4) + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + bdnz .Loop0 # taken at most once! + +.Lxe0: sld r12,r28,r6 + srd r24,r28,r8 + or r24,r0,r24 + stdu r24,-8(r7) + mr r0,r12 + +.Le: srdi r5,r5,2 # count for unrolled loop + addi r5,r5,-1 + mtctr r5 + ld r28,-8(r4) + ld r29,-16(r4) + ld r30,-24(r4) + ldu r31,-32(r4) + +.LoopU: sld r9,r28,r6 + srd r24,r28,r8 + ld r28,-8(r4) + sld r10,r29,r6 + srd r25,r29,r8 + ld r29,-16(r4) + sld r11,r30,r6 + srd r26,r30,r8 + ld r30,-24(r4) + sld r12,r31,r6 + srd r27,r31,r8 + ldu r31,-32(r4) + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + bdnz .LoopU + + sld r9,r28,r6 + srd r24,r28,r8 + sld r10,r29,r6 + srd r25,r29,r8 + sld r11,r30,r6 + srd r26,r30,r8 + sld r12,r31,r6 + srd r27,r31,r8 + or r24,r0,r24 + std r24,-8(r7) + or r25,r9,r25 + std r25,-16(r7) + or r26,r10,r26 + std r26,-24(r7) + or r27,r11,r27 + stdu r27,-32(r7) + mr r0,r12 + + std r0,-8(r7) + ld r24,-64(1) + ld r25,-56(1) + ld r26,-48(1) + ld r27,-40(1) + ld r28,-32(1) + ld r29,-24(1) + ld r30,-16(1) + ld r31,-8(1) + blr +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/powerpc64/mul_1.asm b/rts/gmp/mpn/powerpc64/mul_1.asm new file mode 100644 index 0000000000..47597283ff --- /dev/null +++ b/rts/gmp/mpn/powerpc64/mul_1.asm @@ -0,0 +1,49 @@ +# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
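For reference, mpn_mul_1 below computes {rp,n} = {up,n} times a single limb and returns the high limb that does not fit. An illustrative C version (hypothetical ref_ name, 32-bit limbs in place of this file's 64-bit ones):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_mul_1(limb_t *rp, const limb_t *up, size_t n, limb_t v)
    {
        limb_t cy = 0;                               /* cy_limb in the code below */
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + cy;   /* mulld/mulhdu pair */
            rp[i] = (limb_t)p;
            cy = (limb_t)(p >> 32);
        }
        return cy;                                   /* high limb of the full product */
    }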
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_mul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + stdu 7,8(3) + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/powerpc64/rshift.asm b/rts/gmp/mpn/powerpc64/rshift.asm new file mode 100644 index 0000000000..88272c7fa9 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/rshift.asm @@ -0,0 +1,60 @@ +# PowerPC-64 mpn_rshift -- Shift a number right. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# cnt r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_rshift) + mtctr r5 # copy size into CTR + addi r7,r3,-8 # move adjusted res_ptr to free return reg + subfic r8,r6,64 + ld r11,0(r4) # load first s1 limb + sld r3,r11,r8 # compute function return value + bdz .Lend1 + +.Loop: ldu r10,8(r4) + srd r9,r11,r6 + sld r12,r10,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdz .Lend2 + ldu r11,8(r4) + srd r9,r10,r6 + sld r12,r11,r8 + or r9,r9,r12 + stdu r9,8(r7) + bdnz .Loop + +.Lend1: srd r0,r11,r6 + std r0,8(r7) + blr + +.Lend2: srd r0,r10,r6 + std r0,8(r7) + blr +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/powerpc64/sub_n.asm b/rts/gmp/mpn/powerpc64/sub_n.asm new file mode 100644 index 0000000000..4de3de69c7 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/sub_n.asm @@ -0,0 +1,61 @@ +# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 +# and store difference in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc.b + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
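mpn_sub_n below mirrors add_n with a borrow instead of a carry; a reference sketch (illustrative ref_ name, 32-bit limbs for brevity):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_sub_n(limb_t *rp, const limb_t *s1, const limb_t *s2, size_t n)
    {
        limb_t bw = 0;                          /* borrow between limbs, 0 or 1 */
        for (size_t i = 0; i < n; i++) {
            limb_t a = s1[i], b = s2[i];
            rp[i] = a - b - bw;
            bw = (a < b) || (a == b && bw);     /* borrow out of this limb */
        }
        return bw;                              /* 1 when {s1,n} < {s2,n} */
    }

PowerPC keeps its carry bit as the complement of a borrow, which is why the routine ends with the subfe/subfic pair that the comments describe as loading !cy into the return register.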
+ + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sub_n) + mtctr r6 # copy size into CTR + addic r0,r6,-1 # set cy + ld r8,0(r4) # load least significant s1 limb + ld r0,0(r5) # load least significant s2 limb + addi r3,r3,-8 # offset res_ptr, it's updated before it's used + bdz .Lend # If done, skip loop +.Loop: ld r9,8(r4) # load s1 limb + ld r10,8(r5) # load s2 limb + subfe r7,r0,r8 # subtract limbs with cy, set cy + std r7,8(r3) # store result limb + bdz .Lexit # decrement CTR and exit if done + ldu r8,16(r4) # load s1 limb and update s1_ptr + ldu r0,16(r5) # load s2 limb and update s2_ptr + subfe r7,r10,r9 # subtract limbs with cy, set cy + stdu r7,16(r3) # store result limb and update res_ptr + bdnz .Loop # decrement CTR and loop back + +.Lend: subfe r7,r0,r8 + std r7,8(r3) # store ultimate result limb + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +.Lexit: subfe r7,r10,r9 + std r7,16(r3) + subfe r3,r0,r0 # load !cy into ... + subfic r3,r3,0 # ... return value register + blr +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/powerpc64/submul_1.asm b/rts/gmp/mpn/powerpc64/submul_1.asm new file mode 100644 index 0000000000..17f6369a38 --- /dev/null +++ b/rts/gmp/mpn/powerpc64/submul_1.asm @@ -0,0 +1,54 @@ +# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_submul_1) + mtctr 5 + li 9,0 # cy_limb = 0 + addic 0,0,0 + cal 3,-8(3) + cal 4,-8(4) +.Loop: + ldu 0,8(4) + ld 10,8(3) + mulld 7,0,6 + adde 7,7,9 + mulhdu 9,0,6 + addze 9,9 + subfc 7,7,10 + stdu 7,8(3) + subfe 11,11,11 # invert ... + addic 11,11,1 # ... carry + bdnz .Loop + + addze 3,9 + blr +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/pyr/add_n.s b/rts/gmp/mpn/pyr/add_n.s new file mode 100644 index 0000000000..e1fc535846 --- /dev/null +++ b/rts/gmp/mpn/pyr/add_n.s @@ -0,0 +1,76 @@ +# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
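The PowerPC-64 mpn_submul_1 above is the subtractive partner of addmul_1: it multiplies {up,n} by a limb, subtracts the product from {rp,n}, and returns the limb that would have to be subtracted from the next higher position. A reference sketch (hypothetical ref_ name, 32-bit limbs in place of the 64-bit limbs used above):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_submul_1(limb_t *rp, const limb_t *up, size_t n, limb_t v)
    {
        limb_t cy = 0;                               /* carry limb, as in the asm */
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + cy;   /* mulld/mulhdu */
            limb_t lo = (limb_t)p;
            cy = (limb_t)(p >> 32);
            limb_t r = rp[i];
            rp[i] = r - lo;
            cy += (r < lo);                          /* fold the borrow into the carry limb */
        }
        return cy;
    }

The subfe/addic pair in the loop above is the flag-level version of this borrow handling.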
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_add_n +___gmpn_add_n: + movw $-1,tr0 # representation for carry clear + + movw pr3,tr2 + andw $3,tr2 + beq Lend0 + subw tr2,pr3 + +Loop0: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + addwc (pr2),tr1 + movw tr1,(pr0) + + subwb tr0,tr0 + addw $4,pr0 + addw $4,pr1 + addw $4,pr2 + addw $-1,tr2 + bne Loop0 + + mtstw pr3,pr3 + beq Lend +Lend0: +Loop: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + addwc (pr2),tr1 + movw tr1,(pr0) + + movw 4(pr1),tr1 + addwc 4(pr2),tr1 + movw tr1,4(pr0) + + movw 8(pr1),tr1 + addwc 8(pr2),tr1 + movw tr1,8(pr0) + + movw 12(pr1),tr1 + addwc 12(pr2),tr1 + movw tr1,12(pr0) + + subwb tr0,tr0 + addw $16,pr0 + addw $16,pr1 + addw $16,pr2 + addw $-4,pr3 + bne Loop +Lend: + mnegw tr0,pr0 + ret diff --git a/rts/gmp/mpn/pyr/addmul_1.s b/rts/gmp/mpn/pyr/addmul_1.s new file mode 100644 index 0000000000..65c3f8f008 --- /dev/null +++ b/rts/gmp/mpn/pyr/addmul_1.s @@ -0,0 +1,45 @@ +# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + mova (pr0)[pr2*4],pr0 + mova (pr1)[pr2*4],pr1 + mnegw pr2,pr2 + movw $0,tr3 + +Loop: movw (pr1)[pr2*4],tr1 + uemul pr3,tr0 + addw tr3,tr1 + movw $0,tr3 + addwc tr0,tr3 + movw (pr0)[pr2*0x4],tr0 + addw tr0,tr1 + addwc $0,tr3 + movw tr1,(pr0)[pr2*4] + addw $1,pr2 + bne Loop + + movw tr3,pr0 + ret diff --git a/rts/gmp/mpn/pyr/mul_1.s b/rts/gmp/mpn/pyr/mul_1.s new file mode 100644 index 0000000000..1272297c42 --- /dev/null +++ b/rts/gmp/mpn/pyr/mul_1.s @@ -0,0 +1,42 @@ +# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
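The Pyramid mpn_add_n above (and the matching sub_n later in this directory) first disposes of n mod 4 limbs one at a time and then runs a loop unrolled by four, parking the carry in tr0 between iterations because the address and counter updates would otherwise clobber it. The same structure in plain C, as an illustrative sketch with 32-bit limbs:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_add_n_unrolled(limb_t *rp, const limb_t *s1, const limb_t *s2, size_t n)
    {
        limb_t cy = 0;                               /* lives in tr0 in the asm */
        size_t r = n % 4;
        for (size_t i = 0; i < r; i++) {             /* Loop0: leftover limbs */
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;
            rp[i] = (limb_t)s;
            cy = (limb_t)(s >> 32);
        }
        for (size_t i = r; i < n; i += 4)            /* Loop: four limbs per pass */
            for (size_t j = 0; j < 4; j++) {         /* written out 4x in the asm */
                uint64_t s = (uint64_t)s1[i + j] + s2[i + j] + cy;
                rp[i + j] = (limb_t)s;
                cy = (limb_t)(s >> 32);
            }
        return cy;
    }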
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + mova (pr0)[pr2*4],pr0 + mova (pr1)[pr2*4],pr1 + mnegw pr2,pr2 + movw $0,tr3 + +Loop: movw (pr1)[pr2*4],tr1 + uemul pr3,tr0 + addw tr3,tr1 + movw $0,tr3 + addwc tr0,tr3 + movw tr1,(pr0)[pr2*4] + addw $1,pr2 + bne Loop + + movw tr3,pr0 + ret diff --git a/rts/gmp/mpn/pyr/sub_n.s b/rts/gmp/mpn/pyr/sub_n.s new file mode 100644 index 0000000000..1fd2eb0f17 --- /dev/null +++ b/rts/gmp/mpn/pyr/sub_n.s @@ -0,0 +1,76 @@ +# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + +.text + .align 2 +.globl ___gmpn_sub_n +___gmpn_sub_n: + movw $-1,tr0 # representation for carry clear + + movw pr3,tr2 + andw $3,tr2 + beq Lend0 + subw tr2,pr3 + +Loop0: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + subwb (pr2),tr1 + movw tr1,(pr0) + + subwb tr0,tr0 + addw $4,pr0 + addw $4,pr1 + addw $4,pr2 + addw $-1,tr2 + bne Loop0 + + mtstw pr3,pr3 + beq Lend +Lend0: +Loop: rsubw $0,tr0 # restore carry bit from carry-save register + + movw (pr1),tr1 + subwb (pr2),tr1 + movw tr1,(pr0) + + movw 4(pr1),tr1 + subwb 4(pr2),tr1 + movw tr1,4(pr0) + + movw 8(pr1),tr1 + subwb 8(pr2),tr1 + movw tr1,8(pr0) + + movw 12(pr1),tr1 + subwb 12(pr2),tr1 + movw tr1,12(pr0) + + subwb tr0,tr0 + addw $16,pr0 + addw $16,pr1 + addw $16,pr2 + addw $-4,pr3 + bne Loop +Lend: + mnegw tr0,pr0 + ret diff --git a/rts/gmp/mpn/sh/add_n.s b/rts/gmp/mpn/sh/add_n.s new file mode 100644 index 0000000000..df388b31a3 --- /dev/null +++ b/rts/gmp/mpn/sh/add_n.s @@ -0,0 +1,47 @@ +! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +! sum in a third limb vector. + +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! 
The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! s2_ptr r6 +! size r7 + + .text + .align 2 + .global ___gmpn_add_n +___gmpn_add_n: + mov #0,r3 ! clear cy save reg + +Loop: mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 ! restore cy + addc r2,r1 + movt r3 ! save cy + mov.l r1,@r4 + dt r7 + bf.s Loop + add #4,r4 + + rts + mov r3,r0 ! return carry-out from most sign. limb diff --git a/rts/gmp/mpn/sh/sh2/addmul_1.s b/rts/gmp/mpn/sh/sh2/addmul_1.s new file mode 100644 index 0000000000..f34a7f0503 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/addmul_1.s @@ -0,0 +1,53 @@ +! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +! the result to a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_addmul_1 +___gmpn_addmul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 ! lo_prod += old cy_limb + sts mach,r2 ! new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 ! cy_limb += T, T = 0 + addc r3,r1 + addc r0,r2 ! cy_limb += T, T = 0 + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sh2/mul_1.s b/rts/gmp/mpn/sh/sh2/mul_1.s new file mode 100644 index 0000000000..2a117a3175 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/mul_1.s @@ -0,0 +1,50 @@ +! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! 
You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_mul_1 +___gmpn_mul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 + sts mach,r2 + addc r0,r2 ! propagate carry to cy_limb (dt clobbers T) + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sh2/submul_1.s b/rts/gmp/mpn/sh/sh2/submul_1.s new file mode 100644 index 0000000000..eb9a27dde3 --- /dev/null +++ b/rts/gmp/mpn/sh/sh2/submul_1.s @@ -0,0 +1,53 @@ +! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +! the result from a second limb vector. + +! Copyright (C) 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! size r6 +! s2_limb r7 + + .text + .align 1 + .global ___gmpn_submul_1 +___gmpn_submul_1: + mov #0,r2 ! cy_limb = 0 + mov #0,r0 ! Keep r0 = 0 for entire loop + clrt + +Loop: mov.l @r5+,r3 + dmulu.l r3,r7 + sts macl,r1 + addc r2,r1 ! lo_prod += old cy_limb + sts mach,r2 ! new cy_limb = hi_prod + mov.l @r4,r3 + addc r0,r2 ! cy_limb += T, T = 0 + subc r3,r1 + addc r0,r2 ! cy_limb += T, T = 0 + dt r6 + mov.l r1,@r4 + bf.s Loop + add #4,r4 + + rts + mov r2,r0 diff --git a/rts/gmp/mpn/sh/sub_n.s b/rts/gmp/mpn/sh/sub_n.s new file mode 100644 index 0000000000..5f818c95a8 --- /dev/null +++ b/rts/gmp/mpn/sh/sub_n.s @@ -0,0 +1,47 @@ +! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +! difference in a third limb vector. + +! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! 
MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r4 +! s1_ptr r5 +! s2_ptr r6 +! size r7 + + .text + .align 2 + .global ___gmpn_sub_n +___gmpn_sub_n: + mov #0,r3 ! clear cy save reg + +Loop: mov.l @r5+,r1 + mov.l @r6+,r2 + shlr r3 ! restore cy + subc r2,r1 + movt r3 ! save cy + mov.l r1,@r4 + dt r7 + bf.s Loop + add #4,r4 + + rts + mov r3,r0 ! return carry-out from most sign. limb diff --git a/rts/gmp/mpn/sparc32/README b/rts/gmp/mpn/sparc32/README new file mode 100644 index 0000000000..7c19df7bc4 --- /dev/null +++ b/rts/gmp/mpn/sparc32/README @@ -0,0 +1,36 @@ +This directory contains mpn functions for various SPARC chips. Code that +runs only on version 8 SPARC implementations, is in the v8 subdirectory. + +RELEVANT OPTIMIZATION ISSUES + + Load and Store timing + +On most early SPARC implementations, the ST instructions takes multiple +cycles, while a STD takes just a single cycle more than an ST. For the CPUs +in SPARCstation I and II, the times are 3 and 4 cycles, respectively. +Therefore, combining two ST instrucitons into a STD when possible is a +significant optimiation. + +Later SPARC implementations have single cycle ST. + +For SuperSPARC, we can perform just one memory instruction per cycle, even +if up to two integer instructions can be executed in its pipeline. For +programs that perform so many memory operations that there are not enough +non-memory operations to issue in parallel with all memory operations, using +LDD and STD when possible helps. + +STATUS + +1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5 + cycles/limb asymptotically. We could optimize speed for special counts + by using ADDXCC. + +2. On a SuperSPARC, mpn_add_n and mpn_sub_n runs at 2.5 cycles/limb, or 2 + cycles/limb asymptotically. + +3. mpn_mul_1 runs at what is believed to be optimal speed. + +4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a + cycle by avoiding one of the add instrucitons. See a29k/addmul_1. + +The speed of the code for other SPARC implementations is uncertain. diff --git a/rts/gmp/mpn/sparc32/add_n.asm b/rts/gmp/mpn/sparc32/add_n.asm new file mode 100644 index 0000000000..5f1d00c0e0 --- /dev/null +++ b/rts/gmp/mpn/sparc32/add_n.asm @@ -0,0 +1,236 @@ +dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store +dnl sum in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
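The SPARC v7/v8 mpn_add_n that follows picks one of three code paths from the word alignment of its pointers, so that the inner loops can use the ldd/std double-word accesses the README above recommends. A sketch of that dispatch (illustrative helper name; the real decision is the xor/andcc-with-4 at the top of the routine):

    #include <stdint.h>

    static int same_word_alignment(const void *a, const void *b)
    {
        return (((uintptr_t)a ^ (uintptr_t)b) & 4) == 0;
    }

    /* if (same_word_alignment(res_ptr, s2_ptr))       V1: peel one limb if res_ptr is
                                                       odd-word aligned, then pair the
                                                       s2 loads and the stores as ldd/std
       else if (same_word_alignment(res_ptr, s1_ptr))  V1b: swap s1 and s2, then as V1
       else                                            V2: s1 and s2 share alignment, so
                                                       pair the two source streams       */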
+ + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_add_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** +L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + addxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + addxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + addxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + addxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + addxcc %g4,%g2,%o4 + addxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + mov s2_ptr,%g1 + mov s1_ptr,s2_ptr + b L(0) + mov %g1,s1_ptr + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + addcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + addxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + addxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + addxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/sparc32/addmul_1.asm b/rts/gmp/mpn/sparc32/addmul_1.asm new file mode 100644 index 0000000000..80c94e4251 --- /dev/null +++ b/rts/gmp/mpn/sparc32/addmul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the +dnl result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
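	C  SPARC before v8 has no integer multiply instruction, so this generic
	C  code builds the product with MULSCC (multiply step), one bit per step,
	C  with one of the operands held in the Y register and consumed a bit at
	C  a time.  MULSCC forms a signed product, so the sra/and pair below
	C  builds a correction term that is added to the high word to recover
	C  the unsigned result.  The first path is taken when s2_limb fits in 12
	C  bits and needs only 12 multiply steps plus a closing step; the general
	C  path runs the full 32 steps.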
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + addcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + addcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + addcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + addcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/lshift.asm b/rts/gmp/mpn/sparc32/lshift.asm new file mode 100644 index 0000000000..529733ac2d --- /dev/null +++ b/rts/gmp/mpn/sparc32/lshift.asm @@ -0,0 +1,97 @@ +dnl SPARC mpn_lshift -- Shift a number left. +dnl + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
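Reference semantics for the mpn_lshift below, as an illustrative C sketch (hypothetical ref_ name) with this file's 32-bit limbs: shift {up,n} left by cnt bits (0 < cnt < 32), store the result at rp, and return the bits shifted out of the most significant limb. Working from the top limb downwards, as the assembly does, makes the operation safe when rp >= up, including shifting in place.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_lshift(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t ret = up[n - 1] >> (32 - cnt);    /* function result, as in the asm */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
        rp[0] = up[0] << cnt;
        return ret;
    }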
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_lshift) + sll %o2,2,%g1 + add %o1,%g1,%o1 C make %o1 point at end of src + ld [%o1-4],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o0,%g1,%o0 C make %o0 point at end of res + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + srl %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1-8],%g3 + add %o0,-4,%o0 + add %o1,-4,%o1 + addcc %g4,-1,%g4 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0+0] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1-8],%g3 + add %o0,-16,%o0 + addcc %o2,-4,%o2 + sll %g2,%o3,%o4 + srl %g3,%o5,%g1 + + ld [%o1-12],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+12] + srl %g2,%o5,%g1 + + ld [%o1-16],%g3 + sll %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0+8] + srl %g3,%o5,%g1 + + ld [%o1-20],%g2 + sll %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0+4] + srl %g2,%o5,%g1 + + add %o1,-16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0+0] + +L(end): sll %g2,%o3,%g2 + st %g2,[%o0-4] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/sparc32/mul_1.asm b/rts/gmp/mpn/sparc32/mul_1.asm new file mode 100644 index 0000000000..e5fedeabaa --- /dev/null +++ b/rts/gmp/mpn/sparc32/mul_1.asm @@ -0,0 +1,137 @@ +dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store +dnl the result in a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop0) + ld [%o1+%o2],%o5 + + retl + st %g1,[%o4+%o2] + + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne,a L(loop) + ld [%o1+%o2],%o5 + + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/rshift.asm b/rts/gmp/mpn/sparc32/rshift.asm new file mode 100644 index 0000000000..9187dbaa6f --- /dev/null +++ b/rts/gmp/mpn/sparc32/rshift.asm @@ -0,0 +1,93 @@ +dnl SPARC mpn_rshift -- Shift a number right. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
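mpn_rshift below is the mirror image of lshift above: shift {up,n} right by cnt bits (0 < cnt < 32), store the result at rp, and return the bits shifted out of the least significant limb, left-justified. Processing from the low limb upwards makes it safe when rp <= up. An illustrative sketch (hypothetical ref_ name, 32-bit limbs):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t limb_t;

    limb_t ref_rshift(limb_t *rp, const limb_t *up, size_t n, unsigned cnt)
    {
        limb_t ret = up[0] << (32 - cnt);        /* function result, as in the asm */
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (up[i] >> cnt) | (up[i + 1] << (32 - cnt));
        rp[n - 1] = up[n - 1] >> cnt;
        return ret;
    }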
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr %o0 +C src_ptr %o1 +C size %o2 +C cnt %o3 + +ASM_START() +PROLOGUE(mpn_rshift) + ld [%o1],%g2 C load first limb + sub %g0,%o3,%o5 C negate shift count + add %o2,-1,%o2 + andcc %o2,4-1,%g4 C number of limbs in first loop + sll %g2,%o5,%g1 C compute function result + be L(0) C if multiple of 4 limbs, skip first loop + st %g1,[%sp+80] + + sub %o2,%g4,%o2 C adjust count for main loop + +L(loop0): + ld [%o1+4],%g3 + add %o0,4,%o0 + add %o1,4,%o1 + addcc %g4,-1,%g4 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + bne L(loop0) + st %o4,[%o0-4] + +L(0): tst %o2 + be L(end) + nop + +L(loop): + ld [%o1+4],%g3 + add %o0,16,%o0 + addcc %o2,-4,%o2 + srl %g2,%o3,%o4 + sll %g3,%o5,%g1 + + ld [%o1+8],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-16] + sll %g2,%o5,%g1 + + ld [%o1+12],%g3 + srl %g2,%o3,%o4 + or %g4,%g1,%g4 + st %g4,[%o0-12] + sll %g3,%o5,%g1 + + ld [%o1+16],%g2 + srl %g3,%o3,%g4 + or %o4,%g1,%o4 + st %o4,[%o0-8] + sll %g2,%o5,%g1 + + add %o1,16,%o1 + or %g4,%g1,%g4 + bne L(loop) + st %g4,[%o0-4] + +L(end): srl %g2,%o3,%g2 + st %g2,[%o0-0] + retl + ld [%sp+80],%o0 +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/sparc32/sub_n.asm b/rts/gmp/mpn/sparc32/sub_n.asm new file mode 100644 index 0000000000..071909a1b6 --- /dev/null +++ b/rts/gmp/mpn/sparc32/sub_n.asm @@ -0,0 +1,326 @@ +dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +dnl store difference in a third limb vector. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +define(res_ptr,%o0) +define(s1_ptr,%o1) +define(s2_ptr,%o2) +define(n,%o3) + +ASM_START() +PROLOGUE(mpn_sub_n) + xor s2_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(1) C branch if alignment differs + nop +C ** V1a ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s1_ptr+0],%g4 + addcc n,-10,n + ld [s1_ptr+4],%g1 + ldd [s2_ptr+0],%g2 + blt L(fin1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g4,%g2,%o4 + ld [s1_ptr+16],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+20],%g1 + ldd [s2_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g4,%g2,%o4 + ld [s1_ptr+24],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+28],%g1 + ldd [s2_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g4,%g2,%o4 + ld [s1_ptr+32],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+36],%g1 + ldd [s2_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1) + subcc %g0,%o4,%g0 C restore cy + +L(fin1): + addcc n,8-2,n + blt L(end1) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1): + subxcc %g4,%g2,%o4 + ld [s1_ptr+8],%g4 + subxcc %g1,%g3,%o5 + ld [s1_ptr+12],%g1 + ldd [s2_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1) + subcc %g0,%o4,%g0 C restore cy +L(end1): + subxcc %g4,%g2,%o4 + subxcc %g1,%g3,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s1_ptr+8],%g4 + ld [s2_ptr+8],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr+8] + +L(ret1): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +L(1): xor s1_ptr,res_ptr,%g1 + andcc %g1,4,%g0 + bne L(2) + nop +C ** V1b ** + andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0 + be L(v1b) C if no, branch + nop +C Add least significant limb separately to align res_ptr and s1_ptr + ld [s2_ptr],%g4 + add s2_ptr,4,s2_ptr + ld [s1_ptr],%g2 + add s1_ptr,4,s1_ptr + add n,-1,n + subcc %g2,%g4,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr +L(v1b): addx %g0,%g0,%o4 C save cy in register + cmp n,2 C if n < 2 ... + bl L(end2) C ... 
branch to tail code + subcc %g0,%o4,%g0 C restore cy + + ld [s2_ptr+0],%g4 + addcc n,-10,n + ld [s2_ptr+4],%g1 + ldd [s1_ptr+0],%g2 + blt L(fin1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + subxcc %g2,%g4,%o4 + ld [s2_ptr+16],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+20],%g1 + ldd [s1_ptr+16],%g2 + std %o4,[res_ptr+8] + subxcc %g2,%g4,%o4 + ld [s2_ptr+24],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+28],%g1 + ldd [s1_ptr+24],%g2 + std %o4,[res_ptr+16] + subxcc %g2,%g4,%o4 + ld [s2_ptr+32],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+36],%g1 + ldd [s1_ptr+32],%g2 + std %o4,[res_ptr+24] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop1b) + subcc %g0,%o4,%g0 C restore cy + +L(fin1b): + addcc n,8-2,n + blt L(end1b) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 2 limbs until less than 2 limbs remain +L(loope1b): + subxcc %g2,%g4,%o4 + ld [s2_ptr+8],%g4 + subxcc %g3,%g1,%o5 + ld [s2_ptr+12],%g1 + ldd [s1_ptr+8],%g2 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope1b) + subcc %g0,%o4,%g0 C restore cy +L(end1b): + subxcc %g2,%g4,%o4 + subxcc %g3,%g1,%o5 + std %o4,[res_ptr+0] + addx %g0,%g0,%o4 C save cy in register + + andcc n,1,%g0 + be L(ret1b) + subcc %g0,%o4,%g0 C restore cy +C Add last limb + ld [s2_ptr+8],%g4 + ld [s1_ptr+8],%g2 + subxcc %g2,%g4,%o4 + st %o4,[res_ptr+8] + +L(ret1b): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb + +C ** V2 ** +C If we come here, the alignment of s1_ptr and res_ptr as well as the +C alignment of s2_ptr and res_ptr differ. Since there are only two ways +C things can be aligned (that we care about) we now know that the alignment +C of s1_ptr and s2_ptr are the same. + +L(2): cmp n,1 + be L(jone) + nop + andcc s1_ptr,4,%g0 C s1_ptr unaligned? 
Side effect: cy=0 + be L(v2) C if no, branch + nop +C Add least significant limb separately to align s1_ptr and s2_ptr + ld [s1_ptr],%g4 + add s1_ptr,4,s1_ptr + ld [s2_ptr],%g2 + add s2_ptr,4,s2_ptr + add n,-1,n + subcc %g4,%g2,%o4 + st %o4,[res_ptr] + add res_ptr,4,res_ptr + +L(v2): addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + blt L(fin2) + subcc %g0,%o4,%g0 C restore cy +C Add blocks of 8 limbs until less than 8 limbs remain +L(loop2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + ldd [s1_ptr+8],%g2 + ldd [s2_ptr+8],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+8] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+12] + ldd [s1_ptr+16],%g2 + ldd [s2_ptr+16],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+16] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+20] + ldd [s1_ptr+24],%g2 + ldd [s2_ptr+24],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+24] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+28] + addx %g0,%g0,%o4 C save cy in register + addcc n,-8,n + add s1_ptr,32,s1_ptr + add s2_ptr,32,s2_ptr + add res_ptr,32,res_ptr + bge L(loop2) + subcc %g0,%o4,%g0 C restore cy + +L(fin2): + addcc n,8-2,n + blt L(end2) + subcc %g0,%o4,%g0 C restore cy +L(loope2): + ldd [s1_ptr+0],%g2 + ldd [s2_ptr+0],%o4 + subxcc %g2,%o4,%g2 + st %g2,[res_ptr+0] + subxcc %g3,%o5,%g3 + st %g3,[res_ptr+4] + addx %g0,%g0,%o4 C save cy in register + addcc n,-2,n + add s1_ptr,8,s1_ptr + add s2_ptr,8,s2_ptr + add res_ptr,8,res_ptr + bge L(loope2) + subcc %g0,%o4,%g0 C restore cy +L(end2): + andcc n,1,%g0 + be L(ret2) + subcc %g0,%o4,%g0 C restore cy +C Add last limb +L(jone): + ld [s1_ptr],%g4 + ld [s2_ptr],%g2 + subxcc %g4,%g2,%o4 + st %o4,[res_ptr] + +L(ret2): + retl + addx %g0,%g0,%o0 C return carry-out from most sign. limb +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/sparc32/submul_1.asm b/rts/gmp/mpn/sparc32/submul_1.asm new file mode 100644 index 0000000000..12abd844ce --- /dev/null +++ b/rts/gmp/mpn/sparc32/submul_1.asm @@ -0,0 +1,146 @@ +dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract +dnl the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + C Make S1_PTR and RES_PTR point at the end of their blocks + C and put (- 4 x SIZE) in index/loop counter. 
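The pointer-at-the-end / negative-index idiom described in the comment above is the same control structure as the C loop below: both pointers are advanced past their blocks and a negative index (byte-scaled in the assembly, element-scaled in C) is counted up towards zero, so the loop branch falls directly out of the index update. As a rough sketch of the whole routine — mpn_submul_1 subtracts {s1_ptr,size} * s2_limb from {res_ptr,size} in place and returns the high limb (borrow) left over — with 32-bit limbs and illustrative names, not the GMP source:

/* Rough C analogue, for orientation only. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_submul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  limb_t cy = 0;
  rp += size;                          /* point one past the ends ...    */
  up += size;
  ptrdiff_t j = -(ptrdiff_t) size;     /* ... and count a negative index */
  for (; j != 0; j++)                  /*     up towards zero            */
    {
      uint64_t p = (uint64_t) up[j] * v + cy;
      limb_t lo = (limb_t) p;
      cy = (limb_t) (p >> 32) + (rp[j] < lo);   /* borrow out */
      rp[j] = rp[j] - lo;
    }
  return cy;
}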
+ sll %o2,2,%o2 + add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval + add %o1,%o2,%o1 + sub %g0,%o2,%o2 + + cmp %o3,0xfff + bgu L(large) + nop + + ld [%o1+%o2],%o5 + mov 0,%o0 + b L(0) + add %o4,-4,%o4 +L(loop0): + subcc %o5,%g1,%g1 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g1,[%o4+%o2] +L(0): wr %g0,%o3,%y + sra %o5,31,%g2 + and %o3,%g2,%g2 + andcc %g1,0,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,%o5,%g1 + mulscc %g1,0,%g1 + sra %g1,20,%g4 + sll %g1,12,%g1 + rd %y,%g3 + srl %g3,20,%g3 + or %g1,%g3,%g1 + + addcc %g1,%o0,%g1 + addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb + addcc %o2,4,%o2 C loop counter + bne L(loop0) + ld [%o4+%o2],%o5 + + subcc %o5,%g1,%g1 + addx %o0,%g0,%o0 + retl + st %g1,[%o4+%o2] + +L(large): + ld [%o1+%o2],%o5 + mov 0,%o0 + sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0 + b L(1) + add %o4,-4,%o4 +L(loop): + subcc %o5,%g3,%g3 + ld [%o1+%o2],%o5 + addx %o0,%g0,%o0 + st %g3,[%o4+%o2] +L(1): wr %g0,%o5,%y + and %o5,%g4,%g2 + andcc %g0,%g0,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%o3,%g1 + mulscc %g1,%g0,%g1 + rd %y,%g3 + addcc %g3,%o0,%g3 + addx %g2,%g1,%o0 + addcc %o2,4,%o2 + bne L(loop) + ld [%o4+%o2],%o5 + + subcc %o5,%g3,%g3 + addx %o0,%g0,%o0 + retl + st %g3,[%o4+%o2] +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc32/udiv_fp.asm b/rts/gmp/mpn/sparc32/udiv_fp.asm new file mode 100644 index 0000000000..e340e147d2 --- /dev/null +++ b/rts/gmp/mpn/sparc32/udiv_fp.asm @@ -0,0 +1,158 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs with a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
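The entry point below, mpn_udiv_qrnnd(rem_ptr, n1, n0, d), divides the two-limb value n1*2^32 + n0 by d under the usual udiv_qrnnd precondition n1 < d (so the quotient fits in one limb), stores the remainder through rem_ptr and returns the quotient; this version forms a quotient estimate in double-precision floating point and then adjusts it by at most one. As a statement of the semantics only (not of the method), using the 64-bit arithmetic this file exists to avoid:

/* Semantics-only sketch of udiv_qrnnd; assumes n1 < d.  Illustrative. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_udiv_qrnnd (limb_t *rem_ptr, limb_t n1, limb_t n0, limb_t d)
{
  uint64_t n = ((uint64_t) n1 << 32) | n0;
  *rem_ptr = (limb_t) (n % d);
  return (limb_t) (n / d);
}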
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + wr %g0,%i4,%y + sra %i3,31,%g2 + and %i4,%g2,%g2 + andcc %g0,0,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,%i3,%g1 + mulscc %g1,0,%g1 + add %g1,%g2,%i0 + rd %y,%g3 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/udiv_nfp.asm b/rts/gmp/mpn/sparc32/udiv_nfp.asm new file mode 100644 index 0000000000..ae19f4c6e9 --- /dev/null +++ b/rts/gmp/mpn/sparc32/udiv_nfp.asm @@ -0,0 +1,193 @@ +dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h. +dnl This is for v7 CPUs without a floating-point unit. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
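This FPU-less variant computes the same quotient and remainder with a fully unrolled shift-and-subtract (restoring) loop, plus a separate path when the divisor has its top bit set. The core technique, sketched in portable C (illustrative only; it covers the normal-divisor case and assumes n1 < d):

/* Restoring shift-and-subtract division, the loop unrolled below.
   Sketch only: ignores the special large-divisor path, assumes n1 < d. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_udiv_qrnnd_shiftsub (limb_t *rem_ptr, limb_t n1, limb_t n0, limb_t d)
{
  limb_t q = 0;
  for (int i = 0; i < 32; i++)
    {
      limb_t out = n1 >> 31;          /* bit shifted out of n1        */
      n1 = (n1 << 1) | (n0 >> 31);    /* shift the dividend left once */
      n0 <<= 1;
      q <<= 1;
      if (out || n1 >= d)             /* subtract whenever it fits    */
        {
          n1 -= d;
          q |= 1;
        }
    }
  *rem_ptr = n1;
  return q;
}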
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr o0 +C n1 o1 +C n0 o2 +C d o3 + +ASM_START() +PROLOGUE(mpn_udiv_qrnnd) + tst %o3 + bneg L(largedivisor) + mov 8,%g1 + + b L(p1) + addxcc %o2,%o2,%o2 + +L(plop): + bcc L(n1) + addxcc %o2,%o2,%o2 +L(p1): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n2) + addxcc %o2,%o2,%o2 +L(p2): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n3) + addxcc %o2,%o2,%o2 +L(p3): addx %o1,%o1,%o1 + subcc %o1,%o3,%o4 + bcc L(n4) + addxcc %o2,%o2,%o2 +L(p4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(plop) + subcc %o1,%o3,%o4 + bcc L(n5) + addxcc %o2,%o2,%o2 +L(p5): st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(nlop): + bcc L(p1) + addxcc %o2,%o2,%o2 +L(n1): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p2) + addxcc %o2,%o2,%o2 +L(n2): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p3) + addxcc %o2,%o2,%o2 +L(n3): addx %o4,%o4,%o4 + subcc %o4,%o3,%o1 + bcc L(p4) + addxcc %o2,%o2,%o2 +L(n4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(nlop) + subcc %o4,%o3,%o1 + bcc L(p5) + addxcc %o2,%o2,%o2 +L(n5): st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(largedivisor): + and %o2,1,%o5 C %o5 = n0 & 1 + + srl %o2,1,%o2 + sll %o1,31,%g2 + or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1) + srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1) + + and %o3,1,%g2 + srl %o3,1,%g3 C %g3 = floor(d / 2) + add %g3,%g2,%g3 C %g3 = ceil(d / 2) + + b L(Lp1) + addxcc %o2,%o2,%o2 + +L(Lplop): + bcc L(Ln1) + addxcc %o2,%o2,%o2 +L(Lp1): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln2) + addxcc %o2,%o2,%o2 +L(Lp2): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln3) + addxcc %o2,%o2,%o2 +L(Lp3): addx %o1,%o1,%o1 + subcc %o1,%g3,%o4 + bcc L(Ln4) + addxcc %o2,%o2,%o2 +L(Lp4): addx %o1,%o1,%o1 + addcc %g1,-1,%g1 + bne L(Lplop) + subcc %o1,%g3,%o4 + bcc L(Ln5) + addxcc %o2,%o2,%o2 +L(Lp5): add %o1,%o1,%o1 C << 1 + tst %g2 + bne L(oddp) + add %o5,%o1,%o1 + st %o1,[%o0] + retl + xnor %g0,%o2,%o0 + +L(Lnlop): + bcc L(Lp1) + addxcc %o2,%o2,%o2 +L(Ln1): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp2) + addxcc %o2,%o2,%o2 +L(Ln2): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp3) + addxcc %o2,%o2,%o2 +L(Ln3): addx %o4,%o4,%o4 + subcc %o4,%g3,%o1 + bcc L(Lp4) + addxcc %o2,%o2,%o2 +L(Ln4): addx %o4,%o4,%o4 + addcc %g1,-1,%g1 + bne L(Lnlop) + subcc %o4,%g3,%o1 + bcc L(Lp5) + addxcc %o2,%o2,%o2 +L(Ln5): add %o4,%o4,%o4 C << 1 + tst %g2 + bne L(oddn) + add %o5,%o4,%o4 + st %o4,[%o0] + retl + xnor %g0,%o2,%o0 + +L(oddp): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o1 + addcc %o1,%o2,%o1 + bcc L(Lp6) + addx %o2,0,%o2 + sub %o1,%o3,%o1 +L(Lp6): subcc %o1,%o3,%g0 + bcs L(Lp7) + subx %o2,-1,%o2 + sub %o1,%o3,%o1 +L(Lp7): st %o1,[%o0] + retl + mov %o2,%o0 + +L(oddn): + xnor %g0,%o2,%o2 + C q' in %o2. r' in %o4 + addcc %o4,%o2,%o4 + bcc L(Ln6) + addx %o2,0,%o2 + sub %o4,%o3,%o4 +L(Ln6): subcc %o4,%o3,%g0 + bcs L(Ln7) + subx %o2,-1,%o2 + sub %o4,%o3,%o4 +L(Ln7): st %o4,[%o0] + retl + mov %o2,%o0 +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/umul.asm b/rts/gmp/mpn/sparc32/umul.asm new file mode 100644 index 0000000000..efa56851d6 --- /dev/null +++ b/rts/gmp/mpn/sparc32/umul.asm @@ -0,0 +1,68 @@ +dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + wr %g0,%o1,%y + sra %o2,31,%g2 C Don't move this insn + and %o1,%g2,%g2 C Don't move this insn + andcc %g0,0,%g1 C Don't move this insn + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,%o2,%g1 + mulscc %g1,0,%g1 + rd %y,%g3 + st %g3,[%o0] + retl + add %g1,%g2,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/rts/gmp/mpn/sparc32/v8/addmul_1.asm new file mode 100644 index 0000000000..da44644b51 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/addmul_1.asm @@ -0,0 +1,122 @@ +dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_addmul_1) + orcc %g0,%g0,%g2 + ld [%o1+0],%o4 C 1 + + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 + nop +L(1): +L(L00): add %o0,-4,%o0 + b L(loop00) C 4, 8, 12, ... 
+ add %o1,-4,%o1 + nop +L(L01): b L(loop01) C 1, 5, 9, ... + nop + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + b L(loop10) + add %o1,4,%o1 + nop +L(L11): add %o0,-8,%o0 C 3, 7, 11, ... + b L(loop11) + add %o1,-8,%o1 + nop + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + rd %y,%g2 C 1 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + ld [%o0+4],%g1 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + rd %y,%g2 C 2 + addx %g0,%g2,%g2 + nop + addcc %g1,%g3,%g3 + st %g3,[%o0+4] C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + rd %y,%g2 C 3 + add %o1,16,%o1 + addx %g0,%g2,%g2 + ld [%o0+8],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+8] C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+12],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + addx %g0,%g2,%g2 +L(loop01): + addcc %o2,-4,%o2 + bg L(loop) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 + ld [%o0+0],%g1 C 2 + addcc %g1,%g3,%g3 + st %g3,[%o0+0] C 4 + addx %g0,%g2,%o0 + + retl + nop +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/v8/mul_1.asm b/rts/gmp/mpn/sparc32/v8/mul_1.asm new file mode 100644 index 0000000000..801247553a --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/mul_1.asm @@ -0,0 +1,103 @@ +dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and +dnl store the product in a second limb vector. + +dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_mul_1) + sll %o2,4,%g1 + and %g1,(4-1)<<4,%g1 +ifdef(`PIC', +` mov %o7,%g4 C Save return address register +0: call 1f + add %o7,L(1)-0b,%g3 +1: mov %g4,%o7 C Restore return address register +', +` sethi %hi(L(1)),%g3 + or %g3,%lo(L(1)),%g3 +') + jmp %g3+%g1 + ld [%o1+0],%o4 C 1 +L(1): +L(L00): add %o0,-4,%o0 + add %o1,-4,%o1 + b L(loop00) C 4, 8, 12, ... + orcc %g0,%g0,%g2 +L(L01): b L(loop01) C 1, 5, 9, ... + orcc %g0,%g0,%g2 + nop + nop +L(L10): add %o0,-12,%o0 C 2, 6, 10, ... + add %o1,4,%o1 + b L(loop10) + orcc %g0,%g0,%g2 + nop +L(L11): add %o0,-8,%o0 C 3, 7, 11, ... 
+ add %o1,-8,%o1 + b L(loop11) + orcc %g0,%g0,%g2 + +L(loop): + addcc %g3,%g2,%g3 C 1 + ld [%o1+4],%o4 C 2 + st %g3,[%o0+0] C 1 + rd %y,%g2 C 1 +L(loop00): + umul %o4,%o3,%g3 C 2 + addxcc %g3,%g2,%g3 C 2 + ld [%o1+8],%o4 C 3 + st %g3,[%o0+4] C 2 + rd %y,%g2 C 2 +L(loop11): + umul %o4,%o3,%g3 C 3 + addxcc %g3,%g2,%g3 C 3 + ld [%o1+12],%o4 C 4 + add %o1,16,%o1 + st %g3,[%o0+8] C 3 + rd %y,%g2 C 3 +L(loop10): + umul %o4,%o3,%g3 C 4 + addxcc %g3,%g2,%g3 C 4 + ld [%o1+0],%o4 C 1 + st %g3,[%o0+12] C 4 + add %o0,16,%o0 + rd %y,%g2 C 4 + addx %g0,%g2,%g2 +L(loop01): + addcc %o2,-4,%o2 + bg L(loop) + umul %o4,%o3,%g3 C 1 + + addcc %g3,%g2,%g3 C 4 + st %g3,[%o0+0] C 4 + rd %y,%g2 C 4 + + retl + addx %g0,%g2,%o0 +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/v8/submul_1.asm b/rts/gmp/mpn/sparc32/v8/submul_1.asm new file mode 100644 index 0000000000..9ed132f4c1 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/submul_1.asm @@ -0,0 +1,58 @@ +dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr o0 +C s1_ptr o1 +C size o2 +C s2_limb o3 + +ASM_START() +PROLOGUE(mpn_submul_1) + sub %g0,%o2,%o2 C negate ... + sll %o2,2,%o2 C ... and scale size + sub %o1,%o2,%o1 C o1 is offset s1_ptr + sub %o0,%o2,%g1 C g1 is offset res_ptr + + mov 0,%o0 C clear cy_limb + +L(loop): + ld [%o1+%o2],%o4 + ld [%g1+%o2],%g2 + umul %o4,%o3,%o5 + rd %y,%g3 + addcc %o5,%o0,%o5 + addx %g3,0,%o0 + subcc %g2,%o5,%g2 + addx %o0,0,%o0 + st %g2,[%g1+%o2] + + addcc %o2,4,%o2 + bne L(loop) + nop + + retl + nop +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm new file mode 100644 index 0000000000..0d5e8d415d --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm @@ -0,0 +1,122 @@ +dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h. +dnl This is for SuperSPARC only, to compensate for its semi-functional +dnl udiv instruction. + +dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. 
+ +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + + +include(`../config.m4') + +C INPUT PARAMETERS +C rem_ptr i0 +C n1 i1 +C n0 i2 +C d i3 + +ASM_START() + +ifdef(`PIC', +` TEXT +L(getpc): + retl + nop') + + TEXT + ALIGN(8) +L(C0): .double 0r4294967296 +L(C1): .double 0r2147483648 + +PROLOGUE(mpn_udiv_qrnnd) + save %sp,-104,%sp + st %i1,[%fp-8] + ld [%fp-8],%f10 + +ifdef(`PIC', +`L(pc): call L(getpc) C put address of this insn in %o7 + ldd [%o7+L(C0)-L(pc)],%f8', +` sethi %hi(L(C0)),%o7 + ldd [%o7+%lo(L(C0))],%f8') + + fitod %f10,%f4 + cmp %i1,0 + bge L(248) + mov %i0,%i5 + faddd %f4,%f8,%f4 +L(248): + st %i2,[%fp-8] + ld [%fp-8],%f10 + fmuld %f4,%f8,%f6 + cmp %i2,0 + bge L(249) + fitod %f10,%f2 + faddd %f2,%f8,%f2 +L(249): + st %i3,[%fp-8] + faddd %f6,%f2,%f2 + ld [%fp-8],%f10 + cmp %i3,0 + bge L(250) + fitod %f10,%f4 + faddd %f4,%f8,%f4 +L(250): + fdivd %f2,%f4,%f2 + +ifdef(`PIC', +` ldd [%o7+L(C1)-L(pc)],%f4', +` sethi %hi(L(C1)),%o7 + ldd [%o7+%lo(L(C1))],%f4') + + fcmped %f2,%f4 + nop + fbge,a L(251) + fsubd %f2,%f4,%f2 + fdtoi %f2,%f2 + st %f2,[%fp-8] + b L(252) + ld [%fp-8],%i4 +L(251): + fdtoi %f2,%f2 + st %f2,[%fp-8] + ld [%fp-8],%i4 + sethi %hi(-2147483648),%g2 + xor %i4,%g2,%i4 +L(252): + umul %i3,%i4,%g3 + rd %y,%i0 + subcc %i2,%g3,%o7 + subxcc %i1,%i0,%g0 + be L(253) + cmp %o7,%i3 + + add %i4,-1,%i0 + add %o7,%i3,%o7 + st %o7,[%i5] + ret + restore +L(253): + blu L(246) + mov %i4,%i0 + add %i4,1,%i0 + sub %o7,%i3,%o7 +L(246): + st %o7,[%i5] + ret + restore +EPILOGUE(mpn_udiv_qrnnd) diff --git a/rts/gmp/mpn/sparc32/v8/umul.asm b/rts/gmp/mpn/sparc32/v8/umul.asm new file mode 100644 index 0000000000..ae8f692a0a --- /dev/null +++ b/rts/gmp/mpn/sparc32/v8/umul.asm @@ -0,0 +1,31 @@ +dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc. + +dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
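mpn_umul_ppmm below is the longlong.h support routine for non-gcc compilers: it multiplies two 32-bit limbs, stores the low half of the product through the first argument and returns the high half, which v8 can do with a single umul plus a read of %y. A portable C sketch of what it computes (names illustrative):

/* Sketch of what the one-umul routine below computes. */
#include <stdint.h>

typedef uint32_t limb_t;

limb_t
ref_umul_ppmm (limb_t *lowptr, limb_t u, limb_t v)
{
  uint64_t p = (uint64_t) u * v;
  *lowptr = (limb_t) p;            /* low 32 bits through the pointer   */
  return (limb_t) (p >> 32);       /* high 32 bits as the return value  */
}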
+ + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + umul %o1,%o2,%g2 + st %g2,[%o0] + retl + rd %y,%o0 +EPILOGUE(mpn_umul_ppmm) diff --git a/rts/gmp/mpn/sparc32/v9/README b/rts/gmp/mpn/sparc32/v9/README new file mode 100644 index 0000000000..9b39713271 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/README @@ -0,0 +1,4 @@ +Code for SPARC processors implementing version 9 of the SPARC architecture. +This code is for systems that don't preserve the full 64-bit contents of +integer registers at context switch. For other systems (such as Solaris 7 or +later), use the code in ../../sparc64. diff --git a/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/rts/gmp/mpn/sparc32/v9/addmul_1.asm new file mode 100644 index 0000000000..c1762cc41f --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/addmul_1.asm @@ -0,0 +1,288 @@ +dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
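The v9 32-bit code below does its multiplies in the floating-point unit: s2_limb is split into a low 16-bit half (converted into %f6) and a high half (%f8), each source limb is converted with fxtod, and the two exact double-precision partial products p16 and p0 are recombined in integer registers as (p16 << 16) + p0. What the routine computes, and the splitting identity it relies on, as a C sketch (illustrative only, 32-bit limbs assumed):

/* Sketch of mpn_addmul_1 semantics and of the 16-bit split used below:
   u*v = (u*v_hi << 16) + u*v_lo, and both partial products are under
   2^48, so each is exact in a double.  Not the GMP source. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_addmul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  limb_t v_lo = v & 0xffff;            /* the half converted into %f6 */
  limb_t v_hi = v >> 16;               /* the half converted into %f8 */
  uint64_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t p0  = (uint64_t) up[i] * v_lo;   /* fits in 48 bits */
      uint64_t p16 = (uint64_t) up[i] * v_hi;   /* fits in 48 bits */
      uint64_t p = (p16 << 16) + p0 + rp[i] + cy;
      rp[i] = (limb_t) p;
      cy = p >> 32;
    }
  return (limb_t) cy;
}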
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C 
align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + st %g4,[%i0-4] + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_addmul_1) diff --git a/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h new file mode 100644 index 0000000000..f946b900f0 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h @@ -0,0 +1,69 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* These values are for UltraSPARC I, II, and IIi. It is bogus that + this file lives in v9, but that will do for now. */ + +/* Variations in addmul_1 speed make the multiply and square thresholds + doubtful. TOOM3_SQR_THRESHOLD had to be estimated here. */ + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 30 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 200 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 59 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 500 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 107 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 146 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 29 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 3 +#endif diff --git a/rts/gmp/mpn/sparc32/v9/mul_1.asm b/rts/gmp/mpn/sparc32/v9/mul_1.asm new file mode 100644 index 0000000000..f8f0fdd8c2 --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/mul_1.asm @@ -0,0 +1,267 @@ +dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result in a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
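mpn_mul_1 below uses the same 16-bit-halves floating-point scheme as the addmul_1 above; the difference in what it computes is only that the products are stored rather than accumulated: {res_ptr,size} = {s1_ptr,size} * s2_limb, returning the high limb. The plain-C statement of that (sketch only, 32-bit limbs, illustrative names):

/* Sketch of mpn_mul_1 semantics; illustrative only. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb_t;

limb_t
ref_mul_1 (limb_t *rp, const limb_t *up, size_t size, limb_t v)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint64_t p = (uint64_t) up[i] * v + cy;
      rp[i] = (limb_t) p;              /* low limb of the running product */
      cy = p >> 32;                    /* high limb carries into the next */
    }
  return (limb_t) cy;
}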
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 
(ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + st %g4,[%i0-4] + + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_mul_1) diff --git a/rts/gmp/mpn/sparc32/v9/submul_1.asm b/rts/gmp/mpn/sparc32/v9/submul_1.asm new file mode 100644 index 0000000000..6195ea88ea --- /dev/null +++ b/rts/gmp/mpn/sparc32/v9/submul_1.asm @@ -0,0 +1,291 @@ +dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
+ + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + + TEXT + ALIGN(4) +L(noll): + .word 0 + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hi(L(noll)),%g1 + ld [%g1+%lo(L(noll))],%f10') + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-16] + ld [%fp-16],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1],%f11 + subcc %i2,1,%i2 + be,pn %icc,L(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end2) + std %f12,[%fp-16] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,L(end3) + std %f12,[%fp-32] + + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,L(end4) + std %f12,[%fp-16] + + b,a L(loopm) + + .align 16 +C BEGIN LOOP +L(loop): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + subcc %i2,1,%i2 + be,pn %icc,L(loope) + add %i0,4,%i0 C res_ptr++ +L(loopm): + fxtod %f10,%f2 + ld [%i1],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + addx %g3,0,%g3 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + subcc %i2,1,%i2 + bne,pt %icc,L(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP + + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(xxx) +L(loope): +L(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-32],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + b,a L(yyy) + +L(end3): + fxtod %f10,%f2 + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-16],%g1 
C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 +L(xxx): fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-40] + fdtox %f4,%f12 + std %f12,[%fp-32] + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 +L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0],%g5 + srlx %g4,32,%g3 + ldx [%fp-40],%g2 C p16 + ldx [%fp-32],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + b,a L(ret) + +L(end1): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-24] + fdtox %f4,%f12 + std %f12,[%fp-16] + + ld [%i0],%g5 + ldx [%fp-24],%g2 C p16 + ldx [%fp-16],%g1 C p0 + sllx %g2,16,%g2 C align p16 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +L(ret): add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + srlx %g4,32,%g3 + st %l2,[%i0-4] + + addx %g3,%g0,%g3 + ret + restore %g0,%g3,%o0 C sideeffect: put cy in retreg +EPILOGUE(mpn_submul_1) diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README new file mode 100644 index 0000000000..6923a133f3 --- /dev/null +++ b/rts/gmp/mpn/sparc64/README @@ -0,0 +1,48 @@ +This directory contains mpn functions for 64-bit V9 SPARC + +RELEVANT OPTIMIZATION ISSUES + +The Ultra I/II pipeline executes up to two simple integer arithmetic operations +per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to +35 cycles, depending on the position of the most significant bit of the 1st +source operand. It cannot overlap with other instructions. For our use of +mulx, it will take from 5 to 20 cycles. + +Integer conditional move instructions cannot dual-issue with other integer +instructions. No conditional move can issue 1-5 cycles after a load. (Or +something such bizzare.) + +Integer branches can issue with two integer arithmetic instructions. Likewise +for integer loads. Four instructions may issue (arith, arith, ld/st, branch) +but only if the branch is last. + +(The V9 architecture manual recommends that the 2nd operand of a multiply +instruction be the smaller one. For UltraSPARC, they got things backwards and +optimize for the wrong operand! Really helpful in the light of that multiply +is incredibly slow on these CPUs!) + +STATUS + +There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but +the pipelines are worked out. Here are the timings: + +* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb. + +* add_n, sub_n: add3.s currently runs at 6 cycles/limb. 
We use a bizarre + scheme of compares and branches (with some nops and fnops to align things) + and carefully stay away from the instructions intended for this application + (i.e., movcs and movcc). + + Using movcc/movcs, even with deep unrolling, seems to get down to 7 + cycles/limb. + + The most promising approach is to split operands in 32-bit pieces using + srlx, then use two addccc, and finally compile the results with sllx+or. + The result could run at 5 cycles/limb, I think. It might be possible to + do without unrolling, or with minimal unrolling. + +* addmul_1/submul_1: Should optimize for when scalar operand < 2^32. +* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II, + Karatsuba's method should save up to 16 cycles (i.e. > 20%). +* mul_1 (and possibly the other multiply functions): Handle carry in the + same tricky way as add_n,sub_n. diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm new file mode 100644 index 0000000000..72b3895a5b --- /dev/null +++ b/rts/gmp/mpn/sparc64/add_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +! sum in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_add_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + addccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + addccc %o0,%o4,%g0 +!- + addccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + addccc %o1,%o5,%g0 +!- + addccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! 
asymmetry + ldx [%i2+16],%l6 +!- + addccc %o2,%g5,%g0 +!- + addccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + addccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + addccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + addccc %o0,%o4,%g0 + addccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + addccc %o1,%o5,%g0 + addccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + addccc %o2,%g5,%g0 + addccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + addccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + addccc %l0,%l4,%g1 + addccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_add_n) diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm new file mode 100644 index 0000000000..96cb5f7369 --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul1h.asm @@ -0,0 +1,203 @@ +dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
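The addmul1h helper whose body follows (and the matching mul_1h and submul1h helpers further down) sidesteps the slow mulx noted in the README above by doing the multiplication in the FPU: the 32-bit half of s2_limb is split into 16-bit pieces, both factors are converted to double with fxtod, the partial products are formed with fmuld and brought back with fdtox, and the "p16" product is shifted left 16 bits before being added to "p0".  A rough C sketch of that decomposition, with a hypothetical function name (not GMP code):

  #include <stdint.h>

  /* Sketch only: 32x32->64 multiply via exact double-precision products.
     Each partial product is at most 48 bits, so a double holds it exactly.  */
  static uint64_t
  mul_32x32_via_fpu (uint32_t u, uint32_t v)
  {
    double du   = (double) u;                /* fxtod of a source piece    */
    double dv0  = (double) (v & 0xffff);     /* low 16 bits of the scalar  */
    double dv16 = (double) (v >> 16);        /* high 16 bits of the scalar */
    uint64_t p0  = (uint64_t) (du * dv0);    /* fmuld + fdtox              */
    uint64_t p16 = (uint64_t) (du * dv16);   /* fmuld + fdtox              */
    return p0 + (p16 << 16);                 /* "align p16", then ADD1     */
  }

The real helpers pipeline several of these per iteration and fold the running carry and the destination limb in at the same time (the ADD2 step).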
+ +ifdef(`LOWPART', +`addmull:', +`addmulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + +ifdef(`LOWPART', +` add 
%g5,%g1,%g1') C add *res_ptr to p0 (ADD2) + add %g3,%g1,%g4 C p += cy +ifdef(`LOWPART', +` st %g4,[%i0-4+DHI] + srlx %g4,32,%g4') + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(addmull)', +`EPILOGUE(addmulu)') diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm new file mode 100644 index 0000000000..c3f04cea6a --- /dev/null +++ b/rts/gmp/mpn/sparc64/addmul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and +dnl add the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_addmul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. + srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_addmul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/addmul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm new file mode 100644 index 0000000000..d9957e3c90 --- /dev/null +++ b/rts/gmp/mpn/sparc64/copyi.asm @@ -0,0 +1,79 @@ +! 
SPARC v9 __gmpn_copy -- Copy a limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! rptr %o0 +! sptr %o1 +! n %o2 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_copyi) + add %o2,-8,%o2 + brlz,pn %o2,L(skip) + nop + b,a L(loop1) + nop + + ALIGN(16) +L(loop1): + ldx [%o1+0],%g1 + ldx [%o1+8],%g2 + ldx [%o1+16],%g3 + ldx [%o1+24],%g4 + ldx [%o1+32],%g5 + ldx [%o1+40],%o3 + ldx [%o1+48],%o4 + ldx [%o1+56],%o5 + add %o1,64,%o1 + stx %g1,[%o0+0] + stx %g2,[%o0+8] + stx %g3,[%o0+16] + stx %g4,[%o0+24] + stx %g5,[%o0+32] + stx %o3,[%o0+40] + stx %o4,[%o0+48] + stx %o5,[%o0+56] + add %o2,-8,%o2 + brgez,pt %o2,L(loop1) + add %o0,64,%o0 + +L(skip): + add %o2,8,%o2 + brz,pt %o2,L(end) + nop + +L(loop2): + ldx [%o1],%g1 + add %o1,8,%o1 + add %o2,-1,%o2 + stx %g1,[%o0] + add %o0,8,%o0 + brgz,pt %o2,L(loop2) + nop + +L(end): retl + nop +EPILOGUE(mpn_copyi) diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h new file mode 100644 index 0000000000..74f61661c1 --- /dev/null +++ b/rts/gmp/mpn/sparc64/gmp-mparam.h @@ -0,0 +1,88 @@ +/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 64 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +/* Tell the toom3 multiply implementation to call low-level mpn + functions instead of open-coding operations in C. */ +#define USE_MORE_MPN 1 + + +/* Run on sun workshop cc. */ +/* Generated by tuneup.c, 2000-07-30. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 12 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 95 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 33 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 125 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 27 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 107 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 12 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 199 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 320 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 1664 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 320 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 1664 +#endif diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm new file mode 100644 index 0000000000..2d2edc50a7 --- /dev/null +++ b/rts/gmp/mpn/sparc64/lshift.asm @@ -0,0 +1,97 @@ +! SPARC v9 __gmpn_lshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_lshift) + sllx %o2,3,%g1 + add %o1,%g1,%o1 ! make %o1 point at end of src + ldx [%o1-8],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o0,%g1,%o0 ! make %o0 point at end of res + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + srlx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1-16],%g3 + add %o0,-8,%o0 + add %o1,-8,%o1 + add %g4,-1,%g4 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0+0] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1-16],%g3 + add %o0,-32,%o0 + add %o2,-4,%o2 + sllx %g2,%o3,%o4 + srlx %g3,%o5,%g1 + + ldx [%o1-24],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+24] + srlx %g2,%o5,%g1 + + ldx [%o1-32],%g3 + sllx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0+16] + srlx %g3,%o5,%g1 + + ldx [%o1-40],%g2 + sllx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0+8] + srlx %g2,%o5,%g1 + + add %o1,-32,%o1 + or %g4,%g1,%g4 + brnz,pt %o2,L(loop1) + stx %g4,[%o0+0] + +L(end): sllx %g2,%o3,%g2 + stx %g2,[%o0-8] + retl + mov %g5,%o0 +EPILOGUE(mpn_lshift) diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm new file mode 100644 index 0000000000..f2f2821d51 --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1.asm @@ -0,0 +1,113 @@ +dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and +dnl store the result to a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_mul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call mull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call addmulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_mul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`E',`L($1)') +include_mpn(`sparc64/mul_1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/addmul1h.asm') diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm new file mode 100644 index 0000000000..5078c01c3f --- /dev/null +++ b/rts/gmp/mpn/sparc64/mul_1h.asm @@ -0,0 +1,183 @@ +dnl SPARC 64-bit mull -- Helper for mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +mull: + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx 
[%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %g4,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + st %g4,[%i0-4+DHI] + srlx %g4,32,%g4 + + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +EPILOGUE(mull) diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm new file mode 100644 index 0000000000..baf7920efb --- /dev/null +++ b/rts/gmp/mpn/sparc64/rshift.asm @@ -0,0 +1,94 @@ +! SPARC v9 __gmpn_rshift -- + +! Copyright (C) 1996, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! src_ptr %o1 +! size %o2 +! cnt %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_rshift) + ldx [%o1],%g2 ! load first limb + sub %g0,%o3,%o5 ! negate shift count + add %o2,-1,%o2 + and %o2,4-1,%g4 ! number of limbs in first loop + sllx %g2,%o5,%g1 ! compute function result + brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop + mov %g1,%g5 + + sub %o2,%g4,%o2 ! 
adjust count for main loop + +L(loop0): + ldx [%o1+8],%g3 + add %o0,8,%o0 + add %o1,8,%o1 + add %g4,-1,%g4 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + mov %g3,%g2 + or %o4,%g1,%o4 + brnz,pt %g4,L(loop0) + stx %o4,[%o0-8] + +L(0): brz,pn %o2,L(end) + nop + +L(loop1): + ldx [%o1+8],%g3 + add %o0,32,%o0 + add %o2,-4,%o2 + srlx %g2,%o3,%o4 + sllx %g3,%o5,%g1 + + ldx [%o1+16],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-32] + sllx %g2,%o5,%g1 + + ldx [%o1+24],%g3 + srlx %g2,%o3,%o4 + or %g4,%g1,%g4 + stx %g4,[%o0-24] + sllx %g3,%o5,%g1 + + ldx [%o1+32],%g2 + srlx %g3,%o3,%g4 + or %o4,%g1,%o4 + stx %o4,[%o0-16] + sllx %g2,%o5,%g1 + + add %o1,32,%o1 + or %g4,%g1,%g4 + brnz %o2,L(loop1) + stx %g4,[%o0-8] + +L(end): srlx %g2,%o3,%g2 + stx %g2,[%o0-0] + retl + mov %g5,%o0 +EPILOGUE(mpn_rshift) diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm new file mode 100644 index 0000000000..61547138e0 --- /dev/null +++ b/rts/gmp/mpn/sparc64/sub_n.asm @@ -0,0 +1,172 @@ +! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr %o0 +! s1_ptr %o1 +! s2_ptr %o2 +! size %o3 + +include(`../config.m4') + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch +PROLOGUE(mpn_sub_n) + +! 12 mem ops >= 12 cycles +! 8 shift insn >= 8 cycles +! 8 addccc, executing alone, +8 cycles +! Unrolling not mandatory...perhaps 2-way is best? +! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl +! All in all, it runs at 5 cycles/limb + + save %sp,-160,%sp + + addcc %g0,%g0,%g0 + + add %i3,-4,%i3 + brlz,pn %i3,L(there) + nop + + ldx [%i1+0],%l0 + ldx [%i2+0],%l4 + ldx [%i1+8],%l1 + ldx [%i2+8],%l5 + ldx [%i1+16],%l2 + ldx [%i2+16],%l6 + ldx [%i1+24],%l3 + ldx [%i2+24],%l7 + add %i1,32,%i1 + add %i2,32,%i2 + + add %i3,-4,%i3 + brlz,pn %i3,L(skip) + nop + b L(loop1) ! jump instead of executing many NOPs + nop + ALIGN(32) +!--------- Start main loop --------- +L(loop1): + subccc %l0,%l4,%g1 +!- + srlx %l0,32,%o0 + ldx [%i1+0],%l0 +!- + srlx %l4,32,%o4 + ldx [%i2+0],%l4 +!- + subccc %o0,%o4,%g0 +!- + subccc %l1,%l5,%g2 +!- + srlx %l1,32,%o1 + ldx [%i1+8],%l1 +!- + srlx %l5,32,%o5 + ldx [%i2+8],%l5 +!- + subccc %o1,%o5,%g0 +!- + subccc %l2,%l6,%g3 +!- + srlx %l2,32,%o2 + ldx [%i1+16],%l2 +!- + srlx %l6,32,%g5 ! 
asymmetry + ldx [%i2+16],%l6 +!- + subccc %o2,%g5,%g0 +!- + subccc %l3,%l7,%g4 +!- + srlx %l3,32,%o3 + ldx [%i1+24],%l3 + add %i1,32,%i1 +!- + srlx %l7,32,%o7 + ldx [%i2+24],%l7 + add %i2,32,%i2 +!- + subccc %o3,%o7,%g0 +!- + stx %g1,[%i0+0] +!- + stx %g2,[%i0+8] +!- + stx %g3,[%i0+16] + add %i3,-4,%i3 +!- + stx %g4,[%i0+24] + add %i0,32,%i0 + + brgez,pt %i3,L(loop1) + nop +!--------- End main loop --------- +L(skip): + subccc %l0,%l4,%g1 + srlx %l0,32,%o0 + srlx %l4,32,%o4 + subccc %o0,%o4,%g0 + subccc %l1,%l5,%g2 + srlx %l1,32,%o1 + srlx %l5,32,%o5 + subccc %o1,%o5,%g0 + subccc %l2,%l6,%g3 + srlx %l2,32,%o2 + srlx %l6,32,%g5 ! asymmetry + subccc %o2,%g5,%g0 + subccc %l3,%l7,%g4 + srlx %l3,32,%o3 + srlx %l7,32,%o7 + subccc %o3,%o7,%g0 + stx %g1,[%i0+0] + stx %g2,[%i0+8] + stx %g3,[%i0+16] + stx %g4,[%i0+24] + add %i0,32,%i0 + +L(there): + add %i3,4,%i3 + brz,pt %i3,L(end) + nop + +L(loop2): + ldx [%i1+0],%l0 + add %i1,8,%i1 + ldx [%i2+0],%l4 + add %i2,8,%i2 + srlx %l0,32,%g2 + srlx %l4,32,%g3 + subccc %l0,%l4,%g1 + subccc %g2,%g3,%g0 + stx %g1,[%i0+0] + add %i0,8,%i0 + add %i3,-1,%i3 + brgz,pt %i3,L(loop2) + nop + +L(end): addc %g0,%g0,%i0 + ret + restore +EPILOGUE(mpn_sub_n) diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm new file mode 100644 index 0000000000..7f51ba59c6 --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul1h.asm @@ -0,0 +1,204 @@ +dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. 
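mpn_add_n and mpn_sub_n above produce each 64-bit result limb with a single addccc/subccc, but the carry those instructions chain through comes from the 32-bit condition codes, so every iteration re-derives the 64-bit carry (or borrow) from the 32-bit halves; that is what the extra addccc/subccc whose result goes to %g0 are for.  One addition step of that scheme in rough C (hypothetical helper, not GMP code):

  #include <stdint.h>

  /* Sketch of one limb of the add_n loop above: the sum is formed directly,
     and the carry for the next limb is recovered from the 32-bit halves.  */
  static uint64_t
  add_step (uint64_t a, uint64_t b, unsigned *cy)
  {
    uint64_t sum = a + b + *cy;                          /* addccc -> %g1 */
    uint64_t lo  = (a & 0xffffffff) + (b & 0xffffffff) + *cy;
    uint64_t hi  = (a >> 32) + (b >> 32) + (lo >> 32);   /* addccc -> %g0 */
    *cy = (unsigned) (hi >> 32);                         /* 64-bit carry  */
    return sum;
  }

mpn_sub_n has the same structure with subccc, recovering the borrow instead of the carry.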
+ +ifdef(`LOWPART', +`submull:', +`submulu:') + save %sp,-256,%sp + + sethi %hi(0xffff0000),%o0 + andn %i3,%o0,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f6 + + srl %i3,16,%o0 + st %o0,[%fp-17] + ld [%fp-17],%f11 + fxtod %f10,%f8 + + mov 0,%g3 C cy = 0 + + ld [%i1+4],%f11 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end1) + add %i1,4,%i1 C s1_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-25] + fdtox %f4,%f12 + subcc %i2,1,%i2 + be,pn %icc,E(end2) + std %f12,[%fp-17] + + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + subcc %i2,1,%i2 +dnl be,pn %icc,E(end3) + std %f12,[%fp-33] + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + add %i0,4,%i0 C res_ptr++ + subcc %i2,1,%i2 + be,pn %icc,E(end4) + std %f12,[%fp-17] + + b,a E(loop) + nop C nop is cheap to nullify + + ALIGN(16) +C BEGIN LOOP +E(loop): + fxtod %f10,%f2 + ld [%i1+4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + sub %i2,2,%i2 + add %i0,4,%i0 C res_ptr++ + + fxtod %f10,%f2 + ld [%i1-4],%f11 + add %i1,4,%i1 C s1_ptr++ + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-17],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-25] + fdtox %f4,%f12 + std %f12,[%fp-17] + brnz,pt %i2,E(loop) + add %i0,4,%i0 C res_ptr++ +C END LOOP +E(loope): +E(end4): + fxtod %f10,%f2 + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DHI],%g5 + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + fmuld %f2,%f8,%f16 + ldx [%fp-33],%g1 C p0 + fmuld %f2,%f6,%f4 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + fdtox %f16,%f14 + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) + ld [%i0+DLO],%g5 + srlx %g4,32,%g3 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DHI] + b,a E(yyy) + +E(end2): + fxtod %f10,%f2 + fmuld %f2,%f8,%f16 + fmuld %f2,%f6,%f4 + fdtox %f16,%f14 + std %f14,[%fp-41] + fdtox %f4,%f12 + std %f12,[%fp-33] + ld [%i0+DLO],%g5 + ldx [%fp-25],%g2 C p16 + ldx [%fp-17],%g1 C p0 + sllx %g2,16,%g2 C align p16 +E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C p += cy + subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` ld [%i0+DHI],%g5') + srlx %g4,32,%g3 + ldx [%fp-41],%g2 C p16 + ldx [%fp-33],%g1 C p0 + sllx %g2,16,%g2 C align p16 + st %l2,[%i0-4+DLO] + add %g2,%g1,%g1 C add p16 to p0 (ADD1) + add %i0,4,%i0 C res_ptr++ + + add %g3,%g1,%g4 C 
p += cy +ifdef(`LOWPART', +` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2) +ifdef(`LOWPART', +` st %l2,[%i0-4+DHI] + srlx %g4,32,%g4') + + addx %g4,0,%g4 + ret + restore %g0,%g4,%o0 C sideeffect: put cy in retreg +ifdef(`LOWPART', +`EPILOGUE(submull)', +`EPILOGUE(submulu)') diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm new file mode 100644 index 0000000000..7c6af0a98b --- /dev/null +++ b/rts/gmp/mpn/sparc64/submul_1.asm @@ -0,0 +1,114 @@ +dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and +dnl subtract the result from a second limb vector. + +dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 2.1 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to +dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +dnl MA 02111-1307, USA. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr i0 +C s1_ptr i1 +C size i2 +C s2_limb i3 + +ASM_START() + .register %g2,#scratch + .register %g3,#scratch + +PROLOGUE(mpn_submul_1) + save %sp,-256,%sp + +C We store 0.0 in f10 and keep it invariant accross thw two +C function calls below. Note that this is not ABI conformant, +C but since the functions are local, that's acceptable. +ifdef(`PIC', +`L(pc): rd %pc,%o7 + ld [%o7+L(noll)-L(pc)],%f10', +` sethi %hh(L(noll)),%g2 + sethi %lm(L(noll)),%g1 + or %g2,%hm(L(noll)),%g2 + or %g1,%lo(L(noll)),%g1 + sllx %g2,32,%g2 + ld [%g1+%g2],%f10') + + sub %i1,%i0,%g1 + srlx %g1,3,%g1 + cmp %g1,%i2 + bcc,pt %xcc,L(nooverlap) + nop + + sllx %i2,3,%g2 C compute stack allocation byte count + add %g2,15,%o0 + and %o0,-16,%o0 + sub %sp,%o0,%sp + add %sp,2223,%o0 + + mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp + call mpn_copyi + mov %i2,%o2 C copy n to mpn_copyi's count parameter + + add %sp,2223,%i1 + +L(nooverlap): +C First multiply-add with low 32 bits of s2_limb + mov %i0,%o0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submull + srl %i3,0,%o3 + + mov %o0,%l0 C keep carry-out from accmull + +C Now multiply-add with high 32 bits of s2_limb, unless it is zero. 
+ srlx %i3,32,%o3 + brz,a,pn %o3,L(small) + mov %o0,%i0 + mov %i1,%o1 + add %i2,%i2,%o2 + call submulu + add %i0,4,%o0 + + add %l0,%o0,%i0 +L(small): + ret + restore %g0,%g0,%g0 +EPILOGUE(mpn_submul_1) + +C Put a zero in the text segment to allow us to t the address +C quickly when compiling for PIC + TEXT + ALIGN(4) +L(noll): + .word 0 + +define(`LO',`(+4)') +define(`HI',`(-4)') + +define(`DLO',`(+4)') +define(`DHI',`(-4)') +define(`LOWPART') +define(`E',`L(l.$1)') +include_mpn(`sparc64/submul1h.asm') + +define(`DLO',`(-4)') +define(`DHI',`(+4)') +undefine(`LOWPART') +define(`E',`L(u.$1)') +include_mpn(`sparc64/submul1h.asm') diff --git a/rts/gmp/mpn/thumb/add_n.s b/rts/gmp/mpn/thumb/add_n.s new file mode 100644 index 0000000000..c1eeb6ca87 --- /dev/null +++ b/rts/gmp/mpn/thumb/add_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_add -- Add two limb vectors of the same length > 0 and store +@ sum in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_add_n +___gmpn_add_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + adc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/rts/gmp/mpn/thumb/sub_n.s b/rts/gmp/mpn/thumb/sub_n.s new file mode 100644 index 0000000000..53c292375f --- /dev/null +++ b/rts/gmp/mpn/thumb/sub_n.s @@ -0,0 +1,50 @@ +@ ARM/Thumb __gmpn_sub -- Subtract two limb vectors of the same length > 0 and +@ store difference in a third limb vector. + +@ Copyright (C) 1997, 2000 Free Software Foundation, Inc. + +@ This file is part of the GNU MP Library. + +@ The GNU MP Library is free software; you can redistribute it and/or modify +@ it under the terms of the GNU Lesser General Public License as published by +@ the Free Software Foundation; either version 2.1 of the License, or (at your +@ option) any later version. + +@ The GNU MP Library is distributed in the hope that it will be useful, but +@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +@ License for more details. + +@ You should have received a copy of the GNU Lesser General Public License +@ along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +@ MA 02111-1307, USA. + + +@ INPUT PARAMETERS +@ RES_ptr r0 +@ S1_ptr r1 +@ S2_ptr r2 +@ SIZE r3 + +@ NOT TESTED CODE + + .text + .thumb + .align 0 + .global ___gmpn_sub_n +___gmpn_sub_n: + push {r4, r5, r6, lr} + mov r6, #1 @ init carry save register + +Loop: sub r6, #1 @ restore carry (set iff r6 was 0) + ldmia r1!, {r4} @ load next limb from S1 + ldmia r2!, {r5} @ load next limb from S2 + sbc r4, r5 + stmia r0!, {r4} @ store result limb to RES + sbc r6, r6 @ save negated carry + sub r3, #1 + bge Loop @ loop back while remaining count >= 4 + + mov r0, r6 + pop {r4, r5, r6, pc} diff --git a/rts/gmp/mpn/underscore.h b/rts/gmp/mpn/underscore.h new file mode 100644 index 0000000000..240dae0f63 --- /dev/null +++ b/rts/gmp/mpn/underscore.h @@ -0,0 +1,26 @@ +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + +#if __STDC__ +#define C_SYMBOL_NAME(name) _##name +#else +#define C_SYMBOL_NAME(name) _/**/name +#endif diff --git a/rts/gmp/mpn/vax/add_n.s b/rts/gmp/mpn/vax/add_n.s new file mode 100644 index 0000000000..cf4060f521 --- /dev/null +++ b/rts/gmp/mpn/vax/add_n.s @@ -0,0 +1,61 @@ +# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
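underscore.h above exists so that C-level code can refer to these assembler entry points on systems whose C compilers prefix external symbols with an underscore: the _##name form pastes the underscore with an ANSI preprocessor, and _/**/name achieves the same splice with a K&R preprocessor.  A hypothetical use (illustration only; the declaration below is not from the library):

  #include "underscore.h"

  /* Expands to:  extern int _mpn_example (void);  under either branch.  */
  extern int C_SYMBOL_NAME (mpn_example) (void);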
+ + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___gmpn_add_n +___gmpn_add_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + sobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/rts/gmp/mpn/vax/addmul_1.s b/rts/gmp/mpn/vax/addmul_1.s new file mode 100644 index 0000000000..379061dcb7 --- /dev/null +++ b/rts/gmp/mpn/vax/addmul_1.s @@ -0,0 +1,126 @@ +# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_addmul_1 +___gmpn_addmul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc r1,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/vax/lshift.s b/rts/gmp/mpn/vax/lshift.s new file mode 100644 index 0000000000..fd311a9782 --- /dev/null +++ b/rts/gmp/mpn/vax/lshift.s @@ -0,0 +1,58 @@ +# VAX __gmpn_lshift -- left shift. 
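The VAX mpn_add_n above (and mpn_sub_n further down) rounds its loop count up to a multiple of four and then uses the movaq/jmp pair to enter the four-way unrolled loop part-way through, so sizes that are not a multiple of four need no separate fix-up loop; the carry itself simply stays in the PSL C bit, relying on movl and sobgtr leaving it alone so that adwc chains it from limb to limb.  The same entry trick in rough C is the classic Duff's-device shape (hypothetical names; a wide accumulator stands in for adwc):

  #include <stdint.h>

  typedef uint32_t limb;                   /* VAX limbs are 32 bits wide */

  /* One limb of the addition; cy is the carry passed between limbs.  */
  #define STEP()  (acc = (uint64_t) *s1++ + *s2++ + cy, \
                   *rp++ = (limb) acc, cy = (unsigned) (acc >> 32))

  /* Sketch only; n > 0 is assumed, as the mpn conventions guarantee.  */
  static limb
  add_n_sketch (limb *rp, const limb *s1, const limb *s2, long n)
  {
    uint64_t acc;
    unsigned cy = 0;
    long rounds = (n + 3) / 4;             /* addl2 $3 ; ashl $-2        */
    switch (n & 3)                         /* jmp Loop(r5) entry offset  */
      {
      case 0: do {  STEP ();
      case 3:       STEP ();
      case 2:       STEP ();
      case 1:       STEP ();
              } while (--rounds > 0);
      }
    return cy;                             /* adwc r0,r0 returns the carry */
  }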
+ +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_lshift +___gmpn_lshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + moval (r6)[r1],r6 + moval (r7)[r1],r7 + clrl r3 + movl -(r6),r2 + ashq r8,r2,r4 + movl r5,r0 + movl r2,r3 + decl r1 + jeql Lend + +Loop: movl -(r6),r2 + ashq r8,r2,r4 + movl r5,-(r7) + movl r2,r3 + jsobgtr r1,Loop + +Lend: movl r4,-4(r7) + ret diff --git a/rts/gmp/mpn/vax/mul_1.s b/rts/gmp/mpn/vax/mul_1.s new file mode 100644 index 0000000000..708e8ca6ca --- /dev/null +++ b/rts/gmp/mpn/vax/mul_1.s @@ -0,0 +1,123 @@ +# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_mul_1 +___gmpn_mul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + +# One might want to combine the addl2 and the store below, but that +# is actually just slower according to my timing tests. 
(VAX 3600) + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + movl r2,(r9)+ +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + movl r10,(r9)+ + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + movl r2,(r9)+ +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + movl r10,(r9)+ + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r1,r3 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r1,r11 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/vax/rshift.s b/rts/gmp/mpn/vax/rshift.s new file mode 100644 index 0000000000..515813208d --- /dev/null +++ b/rts/gmp/mpn/vax/rshift.s @@ -0,0 +1,56 @@ +# VAX __gmpn_rshift -- right shift. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# rptr (sp + 4) +# sptr (sp + 8) +# size (sp + 12) +# cnt (sp + 16) +# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers +# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers + +.text + .align 1 +.globl ___gmpn_rshift +___gmpn_rshift: + .word 0x1c0 + movl 4(ap),r7 + movl 8(ap),r6 + movl 12(ap),r1 + movl 16(ap),r8 + + movl (r6)+,r2 + subl3 r8,$32,r8 + ashl r8,r2,r0 + decl r1 + jeql Lend + +Loop: movl (r6)+,r3 + ashq r8,r2,r4 + movl r5,(r7)+ + movl r3,r2 + jsobgtr r1,Loop + +Lend: clrl r3 + ashq r8,r2,r4 + movl r5,(r7) + ret diff --git a/rts/gmp/mpn/vax/sub_n.s b/rts/gmp/mpn/vax/sub_n.s new file mode 100644 index 0000000000..eff4b1c044 --- /dev/null +++ b/rts/gmp/mpn/vax/sub_n.s @@ -0,0 +1,61 @@ +# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +# difference in a third limb vector. + +# Copyright (C) 1999, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. 
+ +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___gmpn_sub_n +___gmpn_sub_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + mnegl r0,r5 + addl2 $3,r0 + ashl $-2,r0,r0 # unroll loop count + bicl2 $-4,r5 # mask out low 2 bits + movaq (r5)[r5],r5 # 9x + jmp Loop(r5) + +Loop: movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + sobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/rts/gmp/mpn/vax/submul_1.s b/rts/gmp/mpn/vax/submul_1.s new file mode 100644 index 0000000000..be42286935 --- /dev/null +++ b/rts/gmp/mpn/vax/submul_1.s @@ -0,0 +1,126 @@ +# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. + +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. 
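The VAX mul_1 and addmul_1 above, like the submul_1 that follows, are built on emul, which is a signed 32x32->64 multiply; whenever an operand has its top bit set, the other operand therefore has to be added back into the high half of the product, which is what the extra adwc r6 / adwc r1 paths do.  A small C illustration of that correction (hypothetical function name):

  #include <stdint.h>

  /* Unsigned 32x32->64 product recovered from a signed multiply (emul).  */
  static uint64_t
  umul32 (uint32_t a, uint32_t b)
  {
    int64_t  sprod = (int64_t) (int32_t) a * (int32_t) b;   /* emul       */
    uint32_t lo = (uint32_t) sprod;
    uint32_t hi = (uint32_t) ((uint64_t) sprod >> 32);
    if ((int32_t) a < 0) hi += b;   /* signed view read a as a - 2^32     */
    if ((int32_t) b < 0) hi += a;   /* likewise for b                     */
    return ((uint64_t) hi << 32) | lo;
  }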
+ + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___gmpn_submul_1 +___gmpn_submul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc r1,r11 + + sobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README new file mode 100644 index 0000000000..3507548b8c --- /dev/null +++ b/rts/gmp/mpn/x86/README @@ -0,0 +1,40 @@ + + X86 MPN SUBROUTINES + + +This directory contains mpn functions for various 80x86 chips. + + +CODE ORGANIZATION + + x86 i386, i486, generic + x86/pentium Intel Pentium (P5, P54) + x86/pentium/mmx Intel Pentium with MMX (P55) + x86/p6 Intel Pentium Pro + x86/p6/mmx Intel Pentium II, III + x86/p6/p3mmx Intel Pentium III + x86/k6 AMD K6, K6-2, K6-3 + x86/k6/mmx + x86/k6/k62mmx AMD K6-2 + x86/k7 AMD Athlon + x86/k7/mmx + + +The x86 directory is also the main support for P6 at the moment, and +is something of a blended style, meant to be reasonable on all x86s. + + + +STATUS + +The code is well-optimized for AMD and Intel chips, but not so well +optimized for Cyrix chips. + + + +RELEVANT OPTIMIZATION ISSUES + +For implementations with slow double shift instructions (SHLD and +SHRD), it might be better to mimic their operation with SHL+SHR+OR. +(M2 is likely to benefit from that, but not Pentium due to its slow +plain SHL and SHR.) diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family new file mode 100644 index 0000000000..3bc73f58b0 --- /dev/null +++ b/rts/gmp/mpn/x86/README.family @@ -0,0 +1,333 @@ + + X86 CPU FAMILY MPN SUBROUTINES + + +This file has some notes on things common to all the x86 family code. + + + +ASM FILES + +The x86 .asm files are BSD style x86 assembler code, first put through m4 +for macro processing. The generic mpn/asm-defs.m4 is used, together with +mpn/x86/x86-defs.m4. Detailed notes are in those files. + +The code is meant for use with GNU "gas" or a system "as". There's no +support for assemblers that demand Intel style, and with gas freely +available and easy to use that shouldn't be a problem. + + + +STACK FRAME + +m4 macros are used to define the parameters passed on the stack, and these +act like comments on what the stack frame looks like too. For example, +mpn_mul_1() has the following. 
+ + defframe(PARAM_MULTIPLIER, 16) + defframe(PARAM_SIZE, 12) + defframe(PARAM_SRC, 8) + defframe(PARAM_DST, 4) + +Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others +similarly. The return address is at offset 0, but there's not normally any +need to access that. + +FRAME is redefined as necessary through the code so it's the number of bytes +pushed on the stack, and hence the offsets in the parameter macros stay +correct. At the start of a routine FRAME should be zero. + + deflit(`FRAME',0) + ... + deflit(`FRAME',4) + ... + deflit(`FRAME',8) + ... + +Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and +FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions, +and can be used instead of explicit definitions if preferred. +defframe_pushl() is a combination FRAME_pushl() and defframe(). + +There's generally some slackness in redefining FRAME. If new values aren't +going to get used, then the redefinitions are omitted to keep from +cluttering up the code. This happens for instance at the end of a routine, +where there might be just four register pops and then a ret, so FRAME isn't +getting used. + +Local variables and saved registers can be similarly defined, with negative +offsets representing stack space below the initial stack pointer. For +example, + + defframe(SAVE_ESI, -4) + defframe(SAVE_EDI, -8) + defframe(VAR_COUNTER,-12) + + deflit(STACK_SPACE, 12) + +Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the +space, and that instruction must be followed by a redefinition of FRAME +(setting it equal to STACK_SPACE) to reflect the change in %esp. + +Definitions for pushed registers are only put in when they're going to be +used. If registers are just saved and restored with pushes and pops then +definitions aren't made. + + + +ASSEMBLER EXPRESSIONS + +Only addition and subtraction seem to be universally available, certainly +that's all the Solaris 8 "as" seems to accept. If expressions are wanted +then m4 eval() should be used. + +In particular note that a "/" anywhere in a line starts a comment in Solaris +"as", and in some configurations of gas too. + + addl $32/2, %eax <-- wrong + + addl $eval(32/2), %eax <-- right + +Binutils gas/config/tc-i386.c has a choice between "/" being a comment +anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select +the latter, and as of 2.9.5 it's the default for GNU/Linux too. + + + +ASSEMBLER COMMENTS + +Solaris "as" doesn't support "#" commenting, using /* */ instead, +unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and +the intermediate ".s" files have no comments. + + + +ZERO DISPLACEMENTS + +In a couple of places addressing modes like 0(%ebx) with a byte-sized zero +displacement are wanted, rather than (%ebx) with no displacement. These are +either for computed jumps or to get desirable code alignment. Explicit +.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into +(%ebx). The Zdisp() macro in x86-defs.m4 is used for this. + +Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas +1.92.3 changes it. In general changing would be the sort of "optimization" +an assembler might perform, hence explicit ".byte"s are used where +necessary. + + + +SHLD/SHRD INSTRUCTIONS + +The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx" +must be written "shldl %eax,%ebx" for some assemblers. 
gas takes either, +Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is +gas), and omits %cl elsewhere. + +For GMP an autoconf test is used to determine whether %cl should be used and +the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass +through or omit %cl as necessary. See comments with those macros for usage. + + + +DIRECTION FLAG + +The x86 calling conventions say that the direction flag should be clear at +function entry and exit. (See iBCS2 and SVR4 ABI books, references below.) + +Although this has been so since the year dot, it's not absolutely clear +whether it's universally respected. Since it's better to be safe than +sorry, gmp follows glibc and does a "cld" if it depends on the direction +flag being clear. This happens only in a few places. + + + +POSITION INDEPENDENT CODE + +Defining the symbol PIC in m4 processing selects position independent code. +This mainly affects computed jumps, and these are implemented in a +self-contained fashion (without using the global offset table). The few +calls from assembly code to global functions use the normal procedure +linkage table. + +PIC is necessary for ELF shared libraries because they can be mapped into +different processes at different virtual addresses. Text relocations in +shared libraries are allowed, but that presumably means a page with such a +relocation isn't shared. The use of the PLT for PIC adds a fixed cost to +every function call, which is small but might be noticeable when working with +small operands. + +Calls from one library function to another don't need to go through the PLT, +since of course the call instruction uses a displacement, not an absolute +address, and the relative locations of object files are known when libgmp.so +is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls +this way, so that there's no jump through the PLT, but of course leaving +setups of the GOT address in %ebx that may be unnecessary. + +The %ebx setup could be avoided in assembly if a separate option controlled +PIC for calls as opposed to computed jumps etc. But there's only ever +likely to be a handful of calls out of assembler, and getting the same +optimization for C intra-library calls would be more important. There seems +no easy way to tell gcc that certain functions can be called non-PIC, and +unfortunately many gmp functions use the global memory allocation variables, +so they need the GOT anyway. Object files with no global data references +and only intra-library calls could go into the library as non-PIC under +-Bsymbolic. Integrating this into libtool and automake is left as an +exercise for the reader. + + + +SIMPLE LOOPS + +The overheads in setting up for an unrolled loop can mean that at small +sizes a simple loop is faster. Making small sizes go fast is important, +even if it adds a cycle or two to bigger sizes. To this end various +routines choose between a simple loop and an unrolled loop according to +operand size. The path to the simple loop, or to special case code for +small sizes, is always as fast as possible. + +Adding a simple loop requires a conditional jump to choose between the +simple and unrolled code. The size of a branch misprediction penalty +affects whether a simple loop is worthwhile. + +The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover +point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >= +UNROLL_THRESHOLD using the unrolled loop. 
If position independent code adds +a couple of cycles to an unrolled loop setup, the threshold will vary with +PIC or non-PIC. Something like the following is typical. + + ifdef(`PIC',` + deflit(UNROLL_THRESHOLD, 10) + ',` + deflit(UNROLL_THRESHOLD, 8) + ') + +There's no automated way to determine the threshold. Setting it to a small +value and then to a big value makes it possible to measure the simple and +unrolled loops each over a range of sizes, from which the crossover point +can be determined. Alternately, just adjust the threshold up or down until +there's no more speedups. + + + +UNROLLED LOOP CODING + +The x86 addressing modes allow a byte displacement of -128 to +127, making +it possible to access 256 bytes, which is 64 limbs, without adjusting +pointer registers within the loop. Dword sized displacements can be used +too, but they increase code size, and unrolling to 64 ought to be enough. + +When unrolling to the full 64 limbs/loop, the limb at the top of the loop +will have a displacement of -128, so pointers have to have a corresponding ++128 added before entering the loop. When unrolling to 32 limbs/loop +displacements 0 to 127 can be used with 0 at the top of the loop and no +adjustment needed to the pointers. + +Where 64 limbs/loop is supported, the +128 adjustment is done only when 64 +limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or +16 is small, so support for 64 limbs/loop is generally only for comparison. + + + +COMPUTED JUMPS + +When working from least significant limb to most significant limb (most +routines) the computed jump and pointer calculations in preparation for an +unrolled loop are as follows. + + S = operand size in limbs + N = number of limbs per loop (UNROLL_COUNT) + L = log2 of unrolling (UNROLL_LOG2) + M = mask for unrolling (UNROLL_MASK) + C = code bytes per limb in the loop + B = bytes per limb (4 for x86) + + computed jump (-S & M) * C + entrypoint + subtract from pointers (-S & M) * B + initial loop counter (S-1) >> L + displacements 0 to B*(N-1) + +The loop counter is decremented at the end of each loop, and the looping +stops when the decrement takes the counter to -1. The displacements are for +the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax". + +Usually the multiply by "C" can be handled without an imul, using instead an +leal, or a shift and subtract. + +When working from most significant to least significant limb (eg. mpn_lshift +and mpn_copyd), the calculations change as follows. + + add to pointers (-S & M) * B + displacements 0 to -B*(N-1) + + + +OLD GAS 1.92.3 + +This version comes with FreeBSD 2.2.8 and has a couple of gremlins that +affect gmp code. + +Firstly, an expression involving two forward references to labels comes out +as zero. For example, + + addl $bar-foo, %eax + foo: + nop + bar: + +This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax". +When only one forward reference is involved, it works correctly, as for +example, + + foo: + addl $bar-foo, %eax + nop + bar: + +Secondly, an expression involving two labels can't be used as the +displacement for an leal. For example, + + foo: + nop + bar: + leal bar-foo(%eax,%ebx,8), %ecx + +A slightly cryptic error is given, "Unimplemented segment type 0 in +parse_operand". When only one label is used it's ok, and the label can be a +forward reference too, as for example, + + leal foo(%eax,%ebx,8), %ecx + nop + foo: + +These problems only affect PIC computed jump calculations. 
The workarounds +are just to do an leal without a displacement and then an addl, and to make +sure the code is placed so that there's at most one forward reference in the +addl. + + + +REFERENCES + +"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999, +order numbers 243190, 243191 and 243192. Available on-line, + + ftp://download.intel.com/design/PentiumII/manuals/243190.htm + ftp://download.intel.com/design/PentiumII/manuals/243191.htm + ftp://download.intel.com/design/PentiumII/manuals/243192.htm + +"Intel386 Family Binary Compatibility Specification 2", Intel Corporation, +published by McGraw-Hill, 1991, ISBN 0-07-031219-2. + +"System V Application Binary Interface", Unix System Laboratories Inc, 1992, +published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor +Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF +shared library PIC coding.) + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S new file mode 100644 index 0000000000..fe6f648f53 --- /dev/null +++ b/rts/gmp/mpn/x86/addsub_n.S @@ -0,0 +1,174 @@ +/* Currently not working and not used. */ + +/* +Copyright (C) 1999 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. +*/ + + +#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r +#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r + + .globl mpn_addsub_n_0 + .globl mpn_addsub_n_1 + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1. + We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. 
*/ +// r1 = r2 + r1 edi = esi + edi +// r2 = r2 - r1 esi = esi - edi +// s1 s2 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_0: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop0: // L=load E=execute S=store + movl (%esi),%ebx // sub 0 L + movl 4(%esi),%ecx // sub 1 L + sbbl (%edi),%ebx // sub 0 LE + sbbl 4(%edi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl %eax,8(%esi) // sub 2 LES + movl 12(%edi),%eax // sub 3 L + sbbl %eax,12(%esi) // sub 3 LES + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop0 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2. + We let subtraction and addition alternate in being two limbs + ahead of the other, thereby avoiding some SAVE_RESTORE. */ +// r1 = r1 + r2 edi = edi + esi +// r2 = r1 - r2 esi = edi - esi +// s2 s1 +// r2 r1 +// eax,ebx,ecx,edx,esi,edi,ebp +mpn_addsub_n_1: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 36(%esp),%ebp /* size */ + + shrl $2,%ebp + xorl %edx,%edx + .align 4 +Loop1: // L=load E=execute S=store + movl (%edi),%ebx // sub 0 L + sbbl (%esi),%ebx // sub 0 LE + movl 4(%edi),%ecx // sub 1 L + sbbl 4(%esi),%ecx // sub 1 LE +// SAVE_BORROW_RESTORE_CARRY(%edx) + movl (%esi),%eax // add 0 L + adcl %eax,(%edi) // add 0 LES + movl 4(%esi),%eax // add 1 L + adcl %eax,4(%edi) // add 1 LES + movl %ebx,(%esi) // sub 0 S + movl %ecx,4(%esi) // sub 1 S + movl 8(%esi),%ebx // add 2 L + adcl 8(%edi),%ebx // add 2 LE + movl 12(%esi),%ecx // add 3 L + adcl 12(%edi),%ecx // add 3 LE +// SAVE_CARRY_RESTORE_BORROW(%edx) + movl 8(%edi),%eax // sub 2 L + sbbl 8(%esi),%eax // sub 2 LES + movl %eax,8(%esi) // sub 2 S + movl 12(%edi),%eax // sub 3 L + sbbl 12(%esi),%eax // sub 3 LE + movl %eax,12(%esi) // sub 3 S + movl %ebx,8(%edi) // add 2 S + movl %ecx,12(%edi) // add 3 S + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop1 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + .globl mpn_copy +mpn_copy: + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s1_ptr */ + movl 28(%esp),%ebp /* size */ + + shrl $2,%ebp + .align 4 +Loop2: + movl (%esi),%eax + movl 4(%esi),%ebx + movl %eax,(%edi) + movl %ebx,4(%edi) + movl 8(%esi),%eax + movl 12(%esi),%ebx + movl %eax,8(%edi) + movl %ebx,12(%edi) + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ebp + jnz Loop2 + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm new file mode 100644 index 0000000000..18ef816b4d --- /dev/null +++ b/rts/gmp/mpn/x86/aors_n.asm @@ -0,0 +1,187 @@ +dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction. + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_nc) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz LF(M4_function_n,oopgo) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. Due to limitations in + C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal + call L(0a) +L(0a): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $LF(M4_function_n,oop)-L(0a)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax +') + + C These lines initialize carry from the 5th parameter. Should be + C possible to simplify. + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + jmp *%eax C jump into loop + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) +deflit(`FRAME',0) + + pushl %edi FRAME_pushl() + pushl %esi FRAME_pushl() + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%edx + movl PARAM_SIZE,%ecx + + movl %ecx,%eax + shrl $3,%ecx C compute count for unrolled loop + negl %eax + andl $7,%eax C get index where to start loop + jz L(oop) C necessary special case for 0 + incl %ecx C adjust loop count + shll $2,%eax C adjustment for pointers... + subl %eax,%edi C ... since they are offset ... + subl %eax,%esi C ... by a constant when we ... + subl %eax,%edx C ... enter the loop + shrl $2,%eax C restore previous value + +ifdef(`PIC',` + C Calculate start address in loop for PIC. 
Due to limitations in + C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal + call L(0b) +L(0b): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $L(oop)-L(0b)-3,%eax + addl $4,%esp +',` + C Calculate start address in loop for non-PIC. + leal L(oop)-3(%eax,%eax,8),%eax +') + jmp *%eax C jump into loop + +L(oopgo): + pushl %ebp FRAME_pushl() + movl PARAM_CARRY,%ebp + shrl $1,%ebp C shift bit 0 into carry + popl %ebp FRAME_popl() + + ALIGN(8) +L(oop): movl (%esi),%eax + M4_inst (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + M4_inst 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + M4_inst 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + M4_inst 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + M4_inst 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + M4_inst 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + M4_inst 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + M4_inst 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm new file mode 100644 index 0000000000..f32ad83989 --- /dev/null +++ b/rts/gmp/mpn/x86/aorsmul_1.asm @@ -0,0 +1,134 @@ +dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a +dnl limb and add the result to a second limb vector. + + +dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +define(PARAM_MULTIPLIER, `FRAME+16(%esp)') +define(PARAM_SIZE, `FRAME+12(%esp)') +define(PARAM_SRC, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') + + TEXT + ALIGN(8) + +PROLOGUE(M4_function_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + M4_inst %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + M4_inst %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + M4_inst %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm new file mode 100644 index 0000000000..439640e836 --- /dev/null +++ b/rts/gmp/mpn/x86/copyd.asm @@ -0,0 +1,80 @@ +dnl x86 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl Future: On P6 an MMX loop should be able to go faster than this code. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from high to low addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 2.4 cycles/limb, approx 40 cycles startup. 
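+C
+C For reference, the operation is just the following loop (a sketch added
+C for illustration, not the library's generic code; the name "copyd_ref"
+C is an assumption of the sketch).  The decrementing direction matches the
+C description above.
+C
+C	void
+C	copyd_ref (mp_limb_t *dst, const mp_limb_t *src, long size)
+C	{
+C	  long i;
+C	  for (i = size - 1; i >= 0; i--)
+C	    dst[i] = src[i];
+C	}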
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + + std + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm new file mode 100644 index 0000000000..5bc4e36689 --- /dev/null +++ b/rts/gmp/mpn/x86/copyi.asm @@ -0,0 +1,79 @@ +dnl x86 mpn_copyi -- copy limb vector, incrementing. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, working from low to high addresses. +C +C The code here is very generic and can be expected to be reasonable on all +C the x86 family. +C +C P5 - 1.0 cycles/limb. +C +C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be +C slower than a rep movs in all cases. The fastest MMX found was 0.8 +C cycles/limb (when fully aligned). A rep movs seems to have a startup +C time of about 15 cycles, but doing something special for small sizes +C could lead to a branch misprediction that would destroy any saving. +C For now a plain rep movs seems ok for P6. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + + C eax saved esi + C ebx + C ecx counter + C edx saved edi + C esi src + C edi dst + C ebp + +PROLOGUE(mpn_copyi) + + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README.family + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm new file mode 100644 index 0000000000..df879da9e1 --- /dev/null +++ b/rts/gmp/mpn/x86/diveby3.asm @@ -0,0 +1,115 @@ +dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl The following all have their own optimized versions of this routine, +dnl but for reference the code here runs as follows. +dnl +dnl cycles/limb +dnl P54 18.0 +dnl P55 17.0 +dnl P6 14.5 +dnl K6 14.0 +dnl K7 10.0 + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and ceil(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebp FRAME_pushl() + + movl PARAM_SIZE, %ebp + pushl %edi FRAME_pushl() + + movl PARAM_DST, %edi + pushl %esi FRAME_pushl() + + movl $INVERSE_3, %esi + pushl %ebx FRAME_pushl() + + leal (%ecx,%ebp,4), %ecx + movl PARAM_CARRY, %ebx + + leal (%edi,%ebp,4), %edi + negl %ebp + + + ALIGN(8) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx &src[size] + C edx scratch, high product + C esi multiplier + C edi &dst[size] + C ebp counter, limbs, negative + + movl (%ecx,%ebp,4), %eax + + subl %ebx, %eax + + setc %bl + + imull %esi + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi,%ebp,4) + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %eax + + sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3) + incl %ebp + + jnz L(top) + + + movl %ebx, %eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm new file mode 100644 index 0000000000..12f14676d6 --- /dev/null +++ b/rts/gmp/mpn/x86/divrem_1.asm @@ -0,0 +1,232 @@ +dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient. + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 43 maybe +dnl +dnl +dnl The following have their own optimized divrem_1 implementations, but +dnl for reference the code here runs as follows. 
+dnl +dnl cycles/limb +dnl P6MMX 39 +dnl K7 42 + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C +C Divide src,size by divisor and store the quotient in dst+xsize,size. +C Extend the division to fractional quotient limbs in dst,xsize. Return the +C remainder. Either or both xsize and size can be 0. +C +C mpn_divrem_1c takes a carry parameter which is an initial high limb, +C effectively one extra limb at the top of src,size. Must have +C carry<divisor. +C +C +C Essentially the code is the same as the division based part of +C mpn/generic/divrem_1.c, but has the following advantages. +C +C - If gcc isn't being used then divrem_1.c will get the generic C +C udiv_qrnnd() and be rather slow. +C +C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't +C generate that instruction (as of gcc 2.95.2 at least). +C +C A test is done to see if the high limb is less the the divisor, and if so +C one less div is done. A div is between 20 and 40 cycles on the various +C x86s, so assuming high<divisor about half the time, then this test saves +C half that amount. The branch misprediction penalty on each chip is less +C than half a div. +C +C +C K6: Back-to-back div instructions run at 20 cycles, the same as the loop +C here, so it seems there's nothing to gain by rearranging the loop. +C Pairing the mov and loop instructions was found to gain nothing. (The +C same is true of the mpn/x86/mod_1.asm loop.) +C +C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles. +C The loop_or_decljnz macro is an easy way to get a 10% speedup. +C +C The fast K6 multiply might be thought to suit a multiply-by-inverse, +C but that algorithm has been found to suffer from the releatively poor +C carry handling on K6 and too many auxiliary instructions. The +C fractional part however could be done at about 13 c/l. +C +C P5: Moving the load down to pair with the store might save 1 cycle, but +C that doesn't seem worth bothering with, since it'd be only a 2.2% +C saving. 
+C +C Again here the auxiliary instructions hinder a multiply-by-inverse, +C though there might be a 10-15% speedup available + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + pushl %ebx FRAME_pushl() + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + orl %ecx, %ecx + + movl PARAM_CARRY, %edx + jz LF(mpn_divrem_1,fraction) + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jmp LF(mpn_divrem_1,integer_top) + +EPILOGUE() + + +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %edi FRAME_pushl() + + movl PARAM_SRC, %edi + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx,%ecx + + jz L(size_zero) + pushl %ebx FRAME_pushl() + + movl -4(%edi,%ecx,4), %eax C src high limb + xorl %edx, %edx + + movl PARAM_DST, %ebx + pushl %ebp FRAME_pushl() + + movl PARAM_XSIZE, %ebp + cmpl %esi, %eax + + leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part + jae L(integer_entry) + + + C high<divisor, so high of dst is zero, and avoid one div + + movl %edx, (%ebx,%ecx,4) + decl %ecx + + movl %eax, %edx + jz L(fraction) + + +L(integer_top): + C eax scratch (quotient) + C ebx dst+4*xsize-4 + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi src + C ebp xsize + + movl -4(%edi,%ecx,4), %eax +L(integer_entry): + + divl %esi + + movl %eax, (%ebx,%ecx,4) + loop_or_decljnz L(integer_top) + + +L(fraction): + orl %ebp, %ecx + jz L(done) + + movl PARAM_DST, %ebx + + +L(fraction_top): + C eax scratch (quotient) + C ebx dst + C ecx counter + C edx scratch (remainder) + C esi divisor + C edi + C ebp + + xorl %eax, %eax + + divl %esi + + movl %eax, -4(%ebx,%ecx,4) + loop_or_decljnz L(fraction_top) + + +L(done): + popl %ebp + movl %edx, %eax + popl %ebx + popl %esi + popl %edi + ret + + +L(size_zero): +deflit(`FRAME',8) + movl PARAM_XSIZE, %ecx + xorl %eax, %eax + + movl PARAM_DST, %edi + + cld C better safe than sorry, see mpn/x86/README.family + + rep + stosl + + popl %esi + popl %edi + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README new file mode 100644 index 0000000000..3ad96c8b89 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/README @@ -0,0 +1,237 @@ + + AMD K6 MPN SUBROUTINES + + + +This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and +K6-3. + +The mmx and k62mmx subdirectories have routines using MMX instructions. All +K6s have MMX, the separate directories are just so that ./configure can omit +them if the assembler doesn't support MMX. + + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. 
+ + cycles/limb + + mpn_add_n/sub_n 3.25 normal, 2.75 in-place + + mpn_mul_1 6.25 + mpn_add/submul_1 7.65-8.4 (varying with data values) + + mpn_mul_basecase 9.25 cycles/crossproduct (approx) + mpn_sqr_basecase 4.7 cycles/crossproduct (approx) + or 9.2 cycles/triangleproduct (approx) + + mpn_divrem_1 20.0 + mpn_mod_1 20.0 + mpn_divexact_by3 11.0 + + mpn_l/rshift 3.0 + + mpn_copyi/copyd 1.0 + + mpn_com_n 1.5-1.85 \ + mpn_and/andn/ior/xor_n 1.5-1.75 | varying with + mpn_iorn/xnor_n 2.0-2.25 | data alignment + mpn_nand/nior_n 2.0-2.25 / + + mpn_popcount 12.5 + mpn_hamdist 13.0 + + +K6-2 and K6-3 have dual-issue MMX and get the following improvements. + + mpn_l/rshift 1.75 + + mpn_copyi/copyd 0.56 or 1.0 \ + | + mpn_com_n 1.0-1.2 | varying with + mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment + mpn_iorn/xnor_n 1.5-2.0 | + mpn_nand/nior_n 1.75-2.0 / + + mpn_popcount 9.0 + mpn_hamdist 11.5 + + +Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch" +instruction, code seems to run slower, and with just "mov" loads it doesn't +seem faster. Results so far are inconsistent. The K6 does a hardware +prefetch of the second cache line in a sector, so the penalty for not +prefetching in software is reduced. + + + + +NOTES + +All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow. + +Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can +execute them in both X and Y (and together). + +Branch misprediction penalty is 1 to 4 cycles (Optimization Manual +chapter 6 table 12). + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. +Store queue is 7 entries of 64 bits each. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines, up to 64 for some. + +Sometimes computed jumps into the unrolling are used to handle sizes not a +multiple of the unrolling. An attractive feature of this is that times +smoothly increase with operand size, but an indirect jump is about 6 cycles +and the setups about another 6, so it depends on how much the unrolled code +is faster than a simple loop as to whether a computed jump ought to be used. + +Position independent code is implemented using a call to get eip for +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. Such a call however still costs 4 to 7 +cycles. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. + + + +MMX + +Putting emms or femms as late as possible in a routine seems to be fastest. +Perhaps an emms or femms stalls until all outstanding MMX instructions have +completed, so putting it later gives them a chance to complete on their own, +in parallel with other operations (like register popping). + +The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3 +at the start of a routine, in case it's been preceded by x87 floating point +operations. This isn't done because in gmp programs it's expected that x87 +floating point won't be much used and that chances are an mpn routine won't +have been preceded by any x87 code. 
+ + + +CODING + +Instructions in general code are shown paired if they can decode and execute +together, meaning two short decode instructions with the second not +depending on the first, only the first using the shifter, no more than one +load, and no more than one store. + +K6 does some out of order execution so the pairings aren't essential, they +just show what slots might be available. When decoding is the limiting +factor things can be scheduled that might not execute until later. + + + +NOTES + +Code alignment + +- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary, + short decode is inhibited. The cross.pl script detects this. + +- loops and branch targets should be aligned to 16 bytes, or ensure at least + 2 instructions before a 32 byte boundary. This makes use of the 16 byte + cache in the BTB. + +Addressing modes + +- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this + problem, and can be used as an equivalent, or easier is just to use a + different register, like %ebx. + +- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3 + have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F). + + If more than 3 bytes are needed to determine instruction length then + decoding degrades from direct to long, or from long to vector. This + happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since + with mod=00 the sib determines whether there's a displacement. + + This affects all MMX and 3DNow instructions, and others with an 0F prefix + like movzbl. The modes affected are anything with an index and no + displacement, or an index but no base, and this includes (%esp) which is + really (,%esp,1). + + The cross.pl script detects problem cases. The workaround is to always + use a displacement, and to do this with Zdisp if it's zero so the + assembler doesn't discard it. + + See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages + 13-14 and 36-37. + +Calls + +- indirect jumps and calls are not branch predicted, they measure about 6 + cycles. + +Various + +- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe +- bsf 12-27 cycles +- emms 5 cycles +- femms 3 cycles +- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken) +- divl 20 cycles back-to-back +- imull 2 decode, 2 execute +- mull 2 decode, 3 execute (optimization manual decoding sample) +- prefetch 2 cycles +- rcll/rcrl implicit by one bit: 2 cycles + immediate or %cl count: 11 + 2 per bit for dword + 13 + 4 per bit for byte +- setCC 2 cycles +- xchgl %eax,reg 1.5 cycles, back-to-back (strange) + reg,reg 2 cycles, back-to-back + + + + +REFERENCES + +"AMD-K6 Processor Code Optimization Application Note", AMD publication +number 21924, revision D amendment 0, January 2000. This describes K6-2 and +K6-3. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21924.pdf + +"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD +publication number 21828, revision A amendment 0, August 1997. This is an +older edition of the above document, describing plain K6. Available +on-line, + + http://www.amd.com/K6/k6docs/pdf/21828.pdf + +"3DNow Technology Manual", AMD publication number 21928F/0-August 1999. +This describes the femms and prefetch instructions, but nothing else from +3DNow has been used. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21928.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. 
This has some notes on general K6 optimizations as well as +3DNow. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm new file mode 100644 index 0000000000..31b05ada51 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/aors_n.asm @@ -0,0 +1,329 @@ +dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction. +dnl +dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result +C (1 or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and +C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of +C loop control, which with 4 limbs/loop means an extra 0.25 c/l. 
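+C
+C As a plain C sketch (for illustration only; "add_nc_ref" and the 32-bit
+C uint32_t limb type are assumptions of the sketch), the _nc form of the
+C addition variant computes the following, with the subtraction variant
+C identical except it subtracts and returns the borrow:
+C
+C	uint32_t
+C	add_nc_ref (uint32_t *dst, const uint32_t *src1, const uint32_t *src2,
+C	            long size, uint32_t carry)	/* carry is 0 or 1 */
+C	{
+C	  long i;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      uint32_t a = src1[i];
+C	      uint32_t sum = a + src2[i] + carry;
+C	      carry = carry ? (sum <= a) : (sum < a);
+C	      dst[i] = sum;
+C	    }
+C	  return carry;
+C	}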
+ +define(PARAM_CARRY, `FRAME+20(%esp)') +define(PARAM_SIZE, `FRAME+16(%esp)') +define(PARAM_SRC2, `FRAME+12(%esp)') +define(PARAM_SRC1, `FRAME+8(%esp)') +define(PARAM_DST, `FRAME+4(%esp)') +deflit(`FRAME',0) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp LF(M4_function_n,start) +EPILOGUE() + + +PROLOGUE(M4_function_n) + xorl %eax, %eax +L(start): + movl PARAM_SIZE, %ecx + pushl %ebx +FRAME_pushl() + + movl PARAM_SRC1, %ebx + pushl %edi +FRAME_pushl() + + movl PARAM_SRC2, %edx + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + jae L(unroll) + + + shrl %eax C initial carry flag + + C offset 0x21 here, close enough to aligned +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + C + C The store to (%edi) could be done with a stosl; it'd be smaller + C code, but there's no speed gain and a cld would have to be added + C (per mpn/x86/README.family). + + movl (%ebx), %eax + leal 4(%ebx), %ebx + + M4_inst (%edx), %eax + + movl %eax, (%edi) + leal 4(%edi), %edi + + leal 4(%edx), %edx + loop L(simple) + + + movl $0, %eax + popl %edi + + setc %al + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- +L(unroll): + C eax carry + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + cmpl %edi, %ebx + pushl %esi + + je L(inplace) + +ifdef(`OPERATION_add_n',` + cmpl %edi, %edx + + je L(inplace_reverse) +') + + movl %ecx, %esi + + andl $-4, %ecx + andl $3, %esi + + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + leal (%edi,%ecx,4), %edi + + negl %ecx + shrl %eax + + ALIGN(32) +L(normal_top): + C eax counter, qwords, negative + C ebx src1 + C ecx scratch + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + leal 5(%ecx), %ecx + M4_inst -20(%edx,%ecx,4), %eax + movl %eax, -20(%edi,%ecx,4) + + movl 4-20(%ebx,%ecx,4), %eax + M4_inst 4-20(%edx,%ecx,4), %eax + movl %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%ebx,%ecx,4), %eax + M4_inst 8-20(%edx,%ecx,4), %eax + movl %eax, 8-20(%edi,%ecx,4) + + movl 12-20(%ebx,%ecx,4), %eax + M4_inst 12-20(%edx,%ecx,4), %eax + movl %eax, 12-20(%edi,%ecx,4) + + loop L(normal_top) + + + decl %esi + jz L(normal_finish_one) + js L(normal_done) + + C two or three more limbs + + movl (%ebx), %eax + M4_inst (%edx), %eax + movl %eax, (%edi) + + movl 4(%ebx), %eax + M4_inst 4(%edx), %eax + decl %esi + movl %eax, 4(%edi) + + jz L(normal_done) + movl $2, %ecx + +L(normal_finish_one): + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + +L(normal_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + + +C ----------------------------------------------------------------------------- + +ifdef(`OPERATION_add_n',` +L(inplace_reverse): + C dst==src2 + + movl %ebx, %edx +') + +L(inplace): + C eax initial carry + C ebx + C ecx size + C edx src + C esi + C edi dst + C ebp + + leal -1(%ecx), %esi + decl %ecx + + andl $-4, %ecx + andl $3, %esi + + movl (%edx), %ebx C src low limb + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + + shrl %eax + + + ALIGN(32) +L(inplace_top): + C eax + C ebx next src limb + C ecx size + C edx src + C esi + C edi dst + C ebp + + M4_inst %ebx, (%edi,%ecx,4) + + movl 4(%edx,%ecx,4), %eax + leal 5(%ecx), %ecx + + M4_inst %eax, 4-20(%edi,%ecx,4) + + movl 8-20(%edx,%ecx,4), %eax + movl 12-20(%edx,%ecx,4), %ebx + + M4_inst %eax, 
8-20(%edi,%ecx,4) + M4_inst %ebx, 12-20(%edi,%ecx,4) + + movl 16-20(%edx,%ecx,4), %ebx + loop L(inplace_top) + + + C now %esi is 0 to 3 representing respectively 1 to 4 limbs more + + M4_inst %ebx, (%edi) + + decl %esi + jz L(inplace_finish_one) + js L(inplace_done) + + C two or three more limbs + + movl 4(%edx), %eax + movl 8(%edx), %ebx + M4_inst %eax, 4(%edi) + M4_inst %ebx, 8(%edi) + + decl %esi + movl $2, %ecx + + jz L(normal_done) + +L(inplace_finish_one): + movl 4(%edx,%ecx,4), %eax + M4_inst %eax, 4(%edi,%ecx,4) + +L(inplace_done): + popl %esi + popl %edi + + movl $0, %eax + popl %ebx + + setc %al + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm new file mode 100644 index 0000000000..da4120fe2f --- /dev/null +++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm @@ -0,0 +1,372 @@ +dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data), +dnl PIC adds about 6 cycles at the start. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: large multpliers small multpliers +dnl UNROLL_COUNT cycles/limb cycles/limb +dnl 4 9.5 7.78 +dnl 8 9.0 7.78 +dnl 16 8.4 7.65 +dnl 32 8.4 8.2 +dnl +dnl Maximum possible unrolling with the current code is 32. +dnl +dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 +dnl byte block, which might explain the good speed at that unrolling. + +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C The jadcl0()s in the unrolled loop makes the speed data dependent. Small +C multipliers (most significant few bits clear) result in few carry bits and +C speeds up to 7.65 cycles/limb are attained. 
Large multipliers (most +C significant few bits set) make the carry bits 50/50 and lead to something +C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.) +C +C It's important that the gains for jadcl0 on small multipliers don't come +C at the cost of slowing down other data. Tests on uniformly distributed +C random data, designed to confound branch prediction, show about a 7% +C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all +C overheads included). +C +C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus +C 11.0 cycles/limb), and hence isn't used. +C +C In the simple loop, note that running ecx from negative to zero and using +C it as an index in the two movs wouldn't help. It would save one +C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired +C that would be collapsed by this. +C +C +C jadcl0 +C ------ +C +C jadcl0() being faster than adcl $0 seems to be an artifact of two things, +C firstly the instruction decoding and secondly the fact that there's a +C carry bit for the jadcl0 only on average about 1/4 of the time. +C +C The code in the unrolled loop decodes something like the following. +C +C decode cycles +C mull %ebp 2 +C M4_inst %esi, disp(%edi) 1 +C adcl %eax, %ecx 2 +C movl %edx, %esi \ 1 +C jnc 1f / +C incl %esi \ 1 +C 1: movl disp(%ebx), %eax / +C --- +C 7 +C +C In a back-to-back style test this measures 7 with the jnc not taken, or 8 +C with it taken (both when correctly predicted). This is opposite to the +C measurements showing small multipliers running faster than large ones. +C Watch this space for more info ... +C +C It's not clear how much branch misprediction might be costing. The K6 +C doco says it will be 1 to 4 cycles, but presumably it's near the low end +C of that range to get the measured results. +C +C +C In the code the two carries are more or less the preceding mul product and +C the calculation is roughly +C +C x*y + u*b+v +C +C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and +C v are the two limbs it's added to (being the low of the next mul, and a +C limb from the destination). +C +C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and +C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of +C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 +C and b-1, then the total probability can be summed over x and y, +C +C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) +C --- * sum sum --- = --- * ------- * ------- = 1/4 +C b^2 x=0 y=1 b^2 b^4 2 2 +C +C Actually it's a very tiny bit less than 1/4 of course. If y is fixed, +C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. 
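+C
+C The 1/4 figure is easy to check numerically.  The following throwaway C
+C program (an illustration only, not part of the library; the tiny limb
+C size b=64 is just to keep the exact summation cheap) evaluates the
+C corresponding double sum and prints ((b-1)/(2b))^2, a little under 0.25:
+C
+C	#include <stdio.h>
+C	int
+C	main (void)
+C	{
+C	  double b = 64.0, sum = 0.0, x, y;
+C	  for (x = 0; x < b; x++)
+C	    for (y = 0; y < b; y++)
+C	      sum += x * y / (b * b);	/* P(carry) for this x,y pair */
+C	  printf ("%f\n", sum / (b * b));	/* prints 0.242249 for b=64 */
+C	  return 0;
+C	}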
+ + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %ebx +deflit(`FRAME',8) + + movl PARAM_SRC, %ebx + pushl %edi +deflit(`FRAME',12) + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_DST, %edi + + pushl %ebp +deflit(`FRAME',16) + jae L(unroll) + + + C simple loop + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl $4, %edi + addl %esi, %eax + + adcl $0, %edx + + M4_inst %eax, -4(%edi) + + adcl $0, %edx + + movl %edx, %esi + loop L(simple) + + + popl %ebp + popl %edi + + popl %ebx + movl %esi, %eax + + popl %esi + ret + + + +C ----------------------------------------------------------------------------- +C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop +C the carries are ecx=lo, esi=hi, then they swap for each limb processed. +C For the computed jump an odd size means they start one way around, an even +C size the other. +C +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers at the point of doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %esi is necessary only for the +C mpn_addmul/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. 
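+C
+C In rough C terms the computed-jump setup below comes out as (a sketch
+C only; UNROLL_LOG2 and UNROLL_MASK are the usual log2/mask companions of
+C UNROLL_COUNT, presumably 4 and 15 for the 16 limb unroll):
+C
+C      skip        = (-(size-1)) & UNROLL_MASK;  /* limbs skipped in first pass */
+C      VAR_COUNTER = (size-2) >> UNROLL_LOG2;    /* further full passes */
+C      VAR_JUMP    = L(entry) + 15*skip;         /* 15 code bytes per limb */
+C
+C One limb is multiplied before entering the loop, the first pass covers
+C UNROLL_COUNT-skip limbs, and each further pass covers UNROLL_COUNT.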
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER, `PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + +L(unroll): + C eax + C ebx src + C ecx size + C edx + C esi initial carry + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + shll $4, %edx + negl %ecx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%ebx), %eax C src low limb + + movl PARAM_MULTIPLIER, %ebp + movl %edx, VAR_JUMP + + mull %ebp + + addl %esi, %eax C initial carry (from _1c) + jadcl0( %edx) + + + leal 4(%ebx,%ecx,4), %ebx + movl %edx, %esi C high carry + + movl VAR_JUMP, %edx + leal (%edi,%ecx,4), %edi + + testl $1, %ecx + movl %eax, %ecx C low carry + + jz L(noswap) + movl %esi, %ecx C high,low carry other way around + + movl %eax, %esi +L(noswap): + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx src + C ecx carry lo + C edx scratch + C esi carry hi + C edi dst + C ebp multiplier + C + C 15 code bytes per limb + + leal UNROLL_BYTES(%edi), %edi + +L(entry): +forloop(`i', 0, UNROLL_COUNT/2-1, ` + deflit(`disp0', eval(2*i*4)) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp1(%ebx), %eax + mull %ebp + M4_inst %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(top) + + + popl %ebp + M4_inst %ecx, UNROLL_BYTES(%edi) + + popl %edi + movl %esi, %eax + + popl %ebx + jadcl0( %eax) + + popl %esi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl new file mode 100644 index 0000000000..21734f3e52 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/cross.pl @@ -0,0 +1,141 @@ +#! /usr/bin/perl + +# Copyright (C) 2000 Free Software Foundation, Inc. +# +# This file is part of the GNU MP Library. +# +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +# MA 02111-1307, USA. + + +# Usage: cross.pl [filename.o]... +# +# Produce an annotated disassembly of the given object files, indicating +# certain code alignment and addressing mode problems afflicting K6 chips. +# "ZZ" is used on all annotations, so this can be searched for. +# +# With no arguments, all .o files corresponding to .asm files are processed. +# This is good in the mpn object directory of a k6*-*-* build. 
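+#
+# A typical run (hypothetical invocation) might be
+#
+#	perl cross.pl aorsmul_1.o mul_1.o | grep ZZ
+#
+# to see just the flagged lines for a couple of objects.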
+# +# As far as fixing problems goes, any cache line crossing problems in loops +# get attention, but as a rule it's too tedious to rearrange code or slip in +# nops to fix every problem in setup or finishup code. +# +# Bugs: +# +# Instructions without mod/rm bytes or which are already vector decoded are +# unaffected by cache line boundary crossing, but not all of these have yet +# been put in as exceptions. All that occur in practice in GMP are present +# though. +# +# There's no messages for using the vector decoded addressing mode (%esi), +# but that mode is easy to avoid when coding. + +use strict; + +sub disassemble { + my ($file) = @_; + my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm); + + open (IN, "objdump -Srfh $file |") + || die "Cannot open pipe from objdump\n"; + while (<IN>) { + print; + + if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) { + if ($1 < 5) { + print "ZZ need at least 2**5 for predictable cache line crossing\n"; + } + } + + if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,$3,''); + + } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) { + ($addr,$b1,$b2,$b3) = ($1,$2,'',''); + + } else { + next; + } + + if ($b1 =~ /0f/) { + $prefix = $b1; + $opcode = $b2; + $modrm = $b3; + } else { + $prefix = ''; + $opcode = $b1; + $modrm = $b2; + } + + # modrm of the form 00-xxx-100 with an 0F prefix is the problem case + # for K6 and pre-CXT K6-2 + if ($prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm =~ /^[0-3][4c]/) { + print "ZZ ($file) >3 bytes to determine instruction length\n"; + } + + # with just an opcode, starting 1f mod 20h + if ($addr =~ /[13579bdf]f$/ + && $prefix !~ /0f/ + && $opcode !~ /1[012345]/ # adc + && $opcode !~ /1[89abcd]/ # sbb + && $opcode !~ /68/ # push $imm32 + && $opcode !~ /^7/ # jcond disp8 + && $opcode !~ /a[89]/ # test+imm + && $opcode !~ /a[a-f]/ # stos/lods/scas + && $opcode !~ /b8/ # movl $imm32,%eax + && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz + && $opcode !~ /e[b9]/ # jmp disp8/disp32 + && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std + && !($opcode =~ /f[67]/ # grp 1 + && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv + && $modrm !~ /^$/) { + print "ZZ ($file) opcode/modrm cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything starting at 1f mod 20h + if ($addr =~ /[13579bdf][f]$/ + && $prefix =~ /0f/) { + print "ZZ ($file) prefix/opcode cross 32-byte boundary\n"; + } + + # with an 0F prefix, anything with mod/rm starting at 1e mod 20h + if ($addr =~ /[13579bdf][e]$/ + && $prefix =~ /0f/ + && $opcode !~ /^8/ # jcond disp32 + && $modrm !~ /^$/) { + print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n"; + } + } + close IN || die "Error from objdump (or objdump not available)\n"; +} + + +my @files; +if ($#ARGV >= 0) { + @files = @ARGV; +} else { + @files = glob "*.asm"; + map {s/.asm/.o/} @files; +} + +foreach (@files) { + disassemble($_); +} diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm new file mode 100644 index 0000000000..ffb97bc380 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/diveby3.asm @@ -0,0 +1,110 @@ +dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K6: 11.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); +C +C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't +C lead to vector decoding, unlike plain (%esi) does. + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_SRC, %esi + pushl %edi defframe_pushl(SAVE_EDI) + + movl PARAM_DST, %edi + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + leal (%esi,%ecx,4), %esi + + pushl $3 defframe_pushl(VAR_THREE) + leal (%edi,%ecx,4), %edi + + negl %ecx + + + C Need 32 alignment for claimed speed, to avoid the movl store + C opcode/modrm crossing a cache line boundary + + ALIGN(32) +L(top): + C eax scratch, low product + C ebx carry limb (0 to 3) + C ecx counter, limbs, negative + C edx scratch, high product + C esi &src[size] + C edi &dst[size] + C ebp + C + C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax" + C doesn't cross a 32 byte boundary, saving a couple of cycles + C (that's a fixed couple, not per loop). + +Zdisp( movl, 0,(%esi,%ecx,4), %eax) + subl %ebx, %eax + + setc %bl + + imull $INVERSE_3, %eax + + movl %eax, (%edi,%ecx,4) + addl $2, %ecx + + mull VAR_THREE + + addl %edx, %ebx + loop L(top) + + + movl SAVE_ESI, %esi + movl %ebx, %eax + + movl SAVE_EBX, %ebx + + movl SAVE_EDI, %edi + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h new file mode 100644 index 0000000000..77f3948d77 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h @@ -0,0 +1,97 @@ +/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. 
If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 3 /* cycles */ +#endif + +#ifndef UDIV_TIME +#define UDIV_TIME 20 /* cycles */ +#endif + +/* bsfl takes 12-27 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-04. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 18 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 130 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 34 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 116 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 68 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 67 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 472 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 4352 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 544 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm new file mode 100644 index 0000000000..20a33e6ccf --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm @@ -0,0 +1,179 @@ +dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size, processing limbs from high to low addresses. +C +C The comments in copyi.asm apply here too. 
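+C
+C The operation itself is just a decrementing limb copy, in illustrative C
+C (not code used here):
+C
+C      void
+C      ref_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C      {
+C        mp_size_t  i;
+C        for (i = size; i != 0; i--)
+C          dst[i-1] = src[i-1];     /* highest limb first */
+C      }
+C
+C Going from the top limb down is what lets dst overlap src from above.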
+ + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyd) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + std + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + leal -4(%esi,%ecx,4), %esi + + leal -4(%edi,%ecx,4), %edi + ja L(unroll) + +L(simple): + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jnz L(already_aligned) + + movsl + + decl %ecx +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + C offset 0x3D here, but gets full speed without further alignment +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src, incrementing + C edi dst, incrementing + C ebp + C + C `disp' is never 0, so don't need to force 0(%esi). + +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + leal -UNROLL_BYTES(%esi), %esi + subl $UNROLL_COUNT, %ecx + + leal -UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + + subl $eval(UNROLL_BYTES/2), %esi + subl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + rep + movsl + + cld + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm new file mode 100644 index 0000000000..215d805f2e --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm @@ -0,0 +1,196 @@ +dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing. +dnl +dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data +dnl alignment. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +dnl K6-2 aligned: +dnl UNROLL_COUNT cycles/limb +dnl 8 0.75 +dnl 16 0.625 +dnl 32 0.5625 +dnl 64 0.53 +dnl Maximum possible with the current code is 64, the minimum is 2. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The MMX loop is faster than a rep movs when src and dst are both 0mod8. +C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is +C used instead. +C +C mod8 +C src dst +C 0 0 both aligned, use mmx +C 0 4 unaligned, use rep movs +C 4 0 unaligned, use rep movs +C 4 4 do one movs, then both aligned, use mmx +C +C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2 +C cycles/loop, which is 0.0625 c/l at 32 limbs/loop. +C +C A pattern of two movq loads and two movq stores (or four and four) was +C tried, but found to be the same speed as just one of each. +C +C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx +C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep +C movs. +C +C Enhancement: +C +C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd +C make it possible to avoid incrementing %esi and %edi in the loop and hence +C get loop overhead down to 1 cycle. Care would be needed to avoid bad +C cache line crossings since the "movq"s would then be 5 code bytes rather +C than 4. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_copyi) + movl PARAM_SIZE, %ecx + movl %esi, %eax + + movl PARAM_SRC, %esi + movl %edi, %edx + + cld + + movl PARAM_DST, %edi + cmpl $UNROLL_COUNT, %ecx + + ja L(unroll) + +L(simple): + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + ret + + +L(unroll): + C if src and dst are different alignments mod8, then use rep movs + C if src and dst are both 4mod8 then process one limb to get 0mod8 + + pushl %ebx + leal (%esi,%edi), %ebx + + testb $4, %bl + popl %ebx + + jnz L(simple) + testl $4, %esi + + leal -UNROLL_COUNT(%ecx), %ecx + jz L(already_aligned) + + decl %ecx + + movsl +L(already_aligned): + + +ifelse(UNROLL_BYTES,256,` + addl $128, %esi + addl $128, %edi +') + + C this is offset 0x34, no alignment needed +L(top): + C eax saved esi + C ebx + C ecx counter, limbs + C edx saved edi + C esi src, incrementing + C edi dst, incrementing + C ebp + C + C Zdisp gets 0(%esi) left that way to avoid vector decode, and with + C 0(%edi) keeps code aligned to 16 byte boundaries. 
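+C
+C Each iteration of the forloop() below expands to one aligned 8-byte copy;
+C for example the i=0 case becomes (with the zero displacements written out
+C by Zdisp, as noted above)
+C
+C      movq    0(%esi), %mm0
+C      movq    %mm0, 0(%edi)
+C
+C so one pass moves UNROLL_COUNT limbs in UNROLL_COUNT/2 such pairs.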
+ +deflit(CHUNK_COUNT, 2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) +Zdisp( movq, disp,(%esi), %mm0) +Zdisp( movq, %mm0, disp,(%edi)) +') + + addl $UNROLL_BYTES, %esi + subl $UNROLL_COUNT, %ecx + + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + C now %ecx is -UNROLL_COUNT to -1 representing repectively 0 to + C UNROLL_COUNT-1 limbs remaining + + testb $eval(UNROLL_COUNT/2), %cl + + leal UNROLL_COUNT(%ecx), %ecx + jz L(not_half) + + C at an unroll count of 32 this block of code is 16 cycles faster than + C the rep movs, less 3 or 4 to test whether to do it + +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, ` + deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + movq disp(%esi), %mm0 + movq %mm0, disp(%edi) +') + addl $eval(UNROLL_BYTES/2), %esi + addl $eval(UNROLL_BYTES/2), %edi + + subl $eval(UNROLL_COUNT/2), %ecx +L(not_half): + + +ifelse(UNROLL_BYTES,256,` + subl $128, %esi + subl $128, %edi +') + + rep + movsl + + movl %eax, %esi + movl %edx, %edi + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm new file mode 100644 index 0000000000..f6d54f97a8 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm @@ -0,0 +1,286 @@ +dnl AMD K6-2 mpn_lshift -- mpn left shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl used after src has been fetched +define(VAR_RETVAL,`PARAM_SRC') + +dnl minimum 9, because unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + movl %edx, VAR_RETVAL + jae L(unroll) + + + movd %ecx, %mm7 + movl %eax, %ecx + + movl PARAM_DST, %eax + +L(simple): + C eax dst + C ebx src + C ecx counter, size-1 to 1 + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%ecx,4), %mm0 + + psrlq %mm7, %mm0 + +Zdisp( movd, %mm0, 0,(%eax,%ecx,4)) + loop L(simple) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + + movd %mm0, (%eax) + movl %edx, %eax + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval (but instead VAR_RETVAL is used) + C + C mm6 shift + + addl $32, %ecx + movl PARAM_DST, %edx + + movd %ecx, %mm7 + subl $7, %eax C size-8 + + leal (%edx,%eax,4), %ecx C alignment of dst + + movq 32-8(%ebx,%eax,4), %mm2 C src high qword + testb $4, %cl + + jz L(dst_aligned) + psllq %mm6, %mm2 + + psrlq $32, %mm2 + decl %eax + + movd %mm2, 32(%edx,%eax,4) C dst high limb + movq 32-8(%ebx,%eax,4), %mm2 C new src high qword +L(dst_aligned): + + movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of size-8 lets the loop stop when %eax goes negative and + C leaves -4 to -1 which can be tested with test $1 and $2. + +L(top): + C eax counter, size-8 step by -4 until <0 + C ebx src + C ecx + C edx dst + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psllq %mm6, %mm2 + subl $4, %eax + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + movq 24(%ebx,%eax,4), %mm0 + + psllq %mm6, %mm1 + movq %mm2, 40(%edx,%eax,4) + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm0, %mm1 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm1, 32(%edx,%eax,4) + jnc L(top) + + + C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4. + C + C 8(%ebx) is the next source, and 24(%edx) is the next destination. + C %eax is between -4 and -1, representing respectively 0 to 3 extra + C limbs that must be read. + + + testl $2, %eax C testl to avoid bad cache line crossing + jz L(finish_nottwo) + + C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes + C new mm2 and a new mm0 is loaded. 
+ + psllq %mm6, %mm2 + movq %mm0, %mm1 + + psrlq %mm7, %mm0 + subl $2, %eax + + por %mm0, %mm2 + movq 16(%ebx,%eax,4), %mm0 + + movq %mm2, 32(%edx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0 + + testb $1, %al + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm0, %mm2 + psllq %mm6, %mm1 + + movq %mm2, 24(%edx,%eax,4) + jz L(finish_even) + + + C Size is odd, so mm1 and one extra limb to process. + + movd (%ebx), %mm0 C src[0] + popl %ebx +deflit(`FRAME',0) + + movq %mm0, %mm2 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + psllq %mm6, %mm2 + por %mm0, %mm1 + + movq %mm1, 4(%edx) C dst[1,2] + movd %mm2, (%edx) C dst[0] + + movl VAR_RETVAL, %eax + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): +deflit(`FRAME',4) + C Size is even, so only mm1 left to process. + + movq %mm1, (%edx) C dst[0,1] + movl VAR_RETVAL, %eax + + popl %ebx + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm new file mode 100644 index 0000000000..8a8c144241 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm @@ -0,0 +1,285 @@ +dnl AMD K6-2 mpn_rshift -- mpn right shift. +dnl +dnl K6-2: 1.75 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 9, because the unrolled loop can't handle less. +dnl +deflit(UNROLL_THRESHOLD, 9) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. 
+ + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx + cmpl $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + + + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + movl PARAM_DST, %ecx + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + C This loop runs at about 3 cycles/limb, which is the amount of + C decoding, and this is despite every second access being unaligned. + +L(simple): + C eax counter, -(size-1) to -1 + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + femms + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx src + C ecx 32-shift + C edx retval + C + C mm6 shift + + addl $32, %ecx + subl $7, %eax C size-8 + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + + movq (%ebx), %mm2 C src low qword + leal (%ebx,%eax,4), %ebx C src end - 32 + + testb $4, %cl + leal (%ecx,%eax,4), %ecx C dst end - 32 + + notl %eax C -(size-7) + jz L(dst_aligned) + + psrlq %mm6, %mm2 + incl %eax + +Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb + movq 4(%ebx,%eax,4), %mm2 C new src low qword +L(dst_aligned): + + movq 12(%ebx,%eax,4), %mm0 C src second lowest qword + nop C avoid bad cache line crossing + + + C This loop is the important bit, the rest is just support for it. + C Four src limbs are held at the start, and four more will be read. + C Four dst limbs will be written. This schedule seems necessary for + C full speed. + C + C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and + C and leaves 0 to 3 which can be tested with test $1 and $2. + +L(top): + C eax counter, -(size-7) step by +4 until >=0 + C ebx src end - 32 + C ecx dst end - 32 + C edx retval + C + C mm0 src next qword + C mm1 scratch + C mm2 src prev qword + C mm6 shift + C mm7 64-shift + + psrlq %mm6, %mm2 + addl $4, %eax + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + movq 4(%ebx,%eax,4), %mm0 + + psrlq %mm6, %mm1 + movq %mm2, -12(%ecx,%eax,4) + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm1, -4(%ecx,%eax,4) + ja L(top) C jump if no carry and not zero + + + + C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0 + C to 3 representing respectively 3 to 0 further limbs. + + testl $2, %eax C testl to avoid bad cache line crossings + jnz L(finish_nottwo) + + C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0 + C becomes new mm2 and a new mm0 is loaded. 
+ + psrlq %mm6, %mm2 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + addl $2, %eax + + por %mm0, %mm2 + movq 12(%ebx,%eax,4), %mm0 + + movq %mm2, -4(%ecx,%eax,4) + movq %mm1, %mm2 +L(finish_nottwo): + + + testb $1, %al + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm0, %mm2 + psrlq %mm6, %mm1 + + movq %mm2, 4(%ecx,%eax,4) + jnz L(finish_even) + + + C one further extra limb to process + + movd 32-4(%ebx), %mm0 C src[size-1], most significant limb + popl %ebx + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm0, %mm1 + psrlq %mm6, %mm2 + + movq %mm1, 32-12(%ecx) C dst[size-3,size-2] + movd %mm2, 32-4(%ecx) C dst[size-1] + + movl %edx, %eax C retval + + femms + ret + + + nop C avoid bad cache line crossing +L(finish_even): + C no further extra limbs + + movq %mm1, 32-8(%ecx) C dst[size-2,size-1] + movl %edx, %eax C retval + + popl %ebx + + femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm new file mode 100644 index 0000000000..8915080f0f --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm @@ -0,0 +1,91 @@ +dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb +dnl K6 1.5 1.85 1.75 1.85 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Take the bitwise ones-complement of src,size and write it to dst,size. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_com_n) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + shrl %ecx + jnz L(two_or_more) + + movl (%eax), %eax + notl %eax + movl %eax, (%edx) + ret + + +L(two_or_more): + pushl %ebx +FRAME_pushl() + movl %ecx, %ebx + + pcmpeqd %mm7, %mm7 C all ones + + + ALIGN(16) +L(top): + C eax src + C ebx floor(size/2) + C ecx counter + C edx dst + C esi + C edi + C ebp + + movq -8(%eax,%ecx,8), %mm0 + pxor %mm7, %mm0 + movq %mm0, -8(%edx,%ecx,8) + loop L(top) + + + jnc L(no_extra) + movl (%eax,%ebx,8), %eax + notl %eax + movl %eax, (%edx,%ebx,8) +L(no_extra): + + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm new file mode 100644 index 0000000000..46cb3b7ea5 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm @@ -0,0 +1,212 @@ +dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, +dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. 
+dnl +dnl alignment dst/src1/src2, A=0mod8, N=4mod8 +dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +dnl +dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor +dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor +dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior +dnl +dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor +dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor +dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl M4_p and M4_i are the MMX and integer instructions +dnl M4_*_neg_dst means whether to negate the final result before writing +dnl M4_*_neg_src2 means whether to negate the src2 values before using them + +define(M4_choose_op, +m4_assert_numargs(7) +`ifdef(`OPERATION_$1',` +define(`M4_function', `mpn_$1') +define(`M4_operation', `$1') +define(`M4_p', `$2') +define(`M4_p_neg_dst', `$3') +define(`M4_p_neg_src2',`$4') +define(`M4_i', `$5') +define(`M4_i_neg_dst', `$6') +define(`M4_i_neg_src2',`$7') +')') + +dnl xnor is done in "iorn" style because it's a touch faster than "nior" +dnl style (the two are equivalent for xor). + +M4_choose_op( and_n, pand,0,0, andl,0,0) +M4_choose_op( andn_n, pandn,0,0, andl,0,1) +M4_choose_op( nand_n, pand,1,0, andl,1,0) +M4_choose_op( ior_n, por,0,0, orl,0,0) +M4_choose_op( iorn_n, por,0,1, orl,0,1) +M4_choose_op( nior_n, por,1,0, orl,1,0) +M4_choose_op( xor_n, pxor,0,0, xorl,0,0) +M4_choose_op( xnor_n, pxor,0,1, xorl,0,1) + +ifdef(`M4_function',, +`m4_error(`Unrecognised or undefined OPERATION symbol +')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + + +C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C +C Do src1,size M4_operation src2,size, storing the result in dst,size. +C +C Unaligned movq loads and stores are a bit slower than aligned ones. The +C test at the start of the routine checks the alignment of src1 and if +C necessary processes one limb separately at the low end to make it aligned. +C +C The raw speeds without this alignment switch are as follows. +C +C alignment dst/src1/src2, A=0mod8, N=4mod8 +C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N +C +C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor +C K6 1.75 2.2 2.0 2.28 iorn,xnor +C K6 2.0 2.25 2.35 2.28 nand,nior +C +C +C Future: +C +C K6 can do one 64-bit load per cycle so each of these routines should be +C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be +C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs. 
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0 +C because there's nowhere to hide some loop control. + +defframe(PARAM_SIZE,16) +defframe(PARAM_SRC2,12) +defframe(PARAM_SRC1,8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) +PROLOGUE(M4_function) + movl PARAM_SIZE, %ecx + pushl %ebx + FRAME_pushl() + movl PARAM_SRC1, %eax + movl PARAM_SRC2, %ebx + cmpl $1, %ecx + movl PARAM_DST, %edx + ja L(two_or_more) + + + movl (%ebx), %ecx + popl %ebx +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + movl %ecx, (%edx) + + ret + + +L(two_or_more): + C eax src1 + C ebx src2 + C ecx size + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + pushl %esi + FRAME_pushl() + testl $4, %eax + jz L(alignment_ok) + + movl (%ebx), %esi + addl $4, %ebx +ifelse(M4_i_neg_src2,1,`notl %esi') + M4_i (%eax), %esi + addl $4, %eax +ifelse(M4_i_neg_dst,1,` notl %esi') + movl %esi, (%edx) + addl $4, %edx + decl %ecx + +L(alignment_ok): + movl %ecx, %esi + shrl %ecx + jnz L(still_two_or_more) + + movl (%ebx), %ecx + popl %esi +ifelse(M4_i_neg_src2,1,`notl %ecx') + M4_i (%eax), %ecx +ifelse(M4_i_neg_dst,1,` notl %ecx') + popl %ebx + movl %ecx, (%edx) + ret + + +L(still_two_or_more): +ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,` + pcmpeqd %mm7, %mm7 C all ones +') + + ALIGN(16) +L(top): + C eax src1 + C ebx src2 + C ecx counter + C edx dst + C esi + C edi + C ebp + C + C carry bit is low of size + + movq -8(%ebx,%ecx,8), %mm0 +ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0') + M4_p -8(%eax,%ecx,8), %mm0 +ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0') + movq %mm0, -8(%edx,%ecx,8) + + loop L(top) + + + jnc L(no_extra) + + movl -4(%ebx,%esi,4), %ebx +ifelse(M4_i_neg_src2,1,`notl %ebx') + M4_i -4(%eax,%esi,4), %ebx +ifelse(M4_i_neg_dst,1,` notl %ebx') + movl %ebx, -4(%edx,%esi,4) +L(no_extra): + + popl %esi + popl %ebx + emms_or_femms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm new file mode 100644 index 0000000000..f1dc83db46 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_lshift -- mpn left shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. 
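+C
+C In illustrative C (not code used here) the operation is, for 1<=shift<=31,
+C
+C      mp_limb_t
+C      ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
+C      {
+C        mp_limb_t  ret = src[size-1] >> (32-shift);   /* bits shifted out */
+C        mp_size_t  i;
+C        for (i = size-1; i > 0; i--)
+C          dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
+C        dst[0] = src[0] << shift;
+C        return ret;
+C      }
+C
+C working from the top limb downwards, the same direction as the loop below.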
+ + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shldl( %cl, %edx, %eax) C return value + + shll %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f + nop C avoid bad cache line crossing +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx,%eax,4), %edx C src high limb + negl %ecx + + movd PARAM_SHIFT, %mm6 + addl $32, %ecx C 32-shift + + shrl %cl, %edx + + movd %ecx, %mm7 + movl PARAM_DST, %ecx + +L(top): + C eax counter, size-1 to 1 + C ebx src + C ecx dst + C edx retval + C + C mm0 scratch + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%ecx,%eax,4) + jnz L(top) + + + movd (%ebx), %mm0 + popl %ebx + + psllq %mm6, %mm0 + movl %edx, %eax + + movd %mm0, (%ecx) + + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm new file mode 100644 index 0000000000..2c619252bb --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm @@ -0,0 +1,238 @@ +dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and +dnl hamming distance. +dnl +dnl popcount hamdist +dnl K6-2: 9.0 11.5 cycles/limb +dnl K6: 12.5 13.0 + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here isn't optimal, but it's already a 2x speedup over the plain +C integer mpn/generic/popcount.c,hamdist.c. 
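+C
+C The loop below follows the usual bit-slicing scheme, shown here for one
+C 32-bit limb in illustrative C (the same masks are held in mm7..mm4):
+C
+C      x = x - ((x & 0xAAAAAAAA) >> 1);                  /* 16 two-bit counts */
+C      x = (x & 0x33333333) + ((x >> 2) & 0x33333333);   /* 8 nibble counts  */
+C      x = (x & 0x0F0F0F0F) + ((x >> 4) & 0x0F0F0F0F);   /* 4 byte counts    */
+C      x += x >> 8;
+C      x += x >> 16;
+C      count = x & 0xFF;                                  /* 0 to 32 */
+C
+C mpn_hamdist does the same with src[i]^src2[i] in place of x, and the
+C per-limb counts are summed over the whole operand.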
+ + +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist +')m4exit(1)')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + DATA + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F + +LS(rodata_000000FF000000FF): + .long 0x000000FF + .long 0x000000FF +') + + .text + ALIGN(32) + +POP(`ifdef(`PIC', ` + C avoid shrl crossing a 32-byte boundary + nop')') + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + movl $0x000000FF, %edx + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + punpckldq %mm4, %mm4 +',` + + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 + movq LS(rodata_000000FF000000FF), %mm4 +') + +define(REG_AAAAAAAAAAAAAAAA, %mm7) +define(REG_3333333333333333, %mm6) +define(REG_0F0F0F0F0F0F0F0F, %mm5) +define(REG_000000FF000000FF, %mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + +Zdisp( movd, 0,(%eax,%ecx,8), %mm1) + +HAM(` +Zdisp( movd, 0,(%edx,%ecx,8), %mm0) + pxor %mm0, %mm1 +') + + incl %ecx + jmp L(loaded) + + + ALIGN(16) +POP(` nop C alignment to avoid crossing 32-byte boundaries') + +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 +HAM(` pxor -8(%edx,%ecx,8), %mm1') + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 +HAM(` nop C code alignment') + + psubd %mm1, %mm0 C bit pairs +HAM(` nop C code alignment') + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + movq %mm0, %mm1 + psrlq $8, %mm0 + + + paddb %mm1, %mm0 C words + + + movq %mm0, %mm1 + psrlq $16, %mm0 + + paddd %mm1, %mm0 C dwords + + pand REG_000000FF000000FF, %mm0 + + paddd %mm0, %mm2 C low to total + psrlq $32, %mm0 + + paddd %mm0, %mm2 C high to total + loop L(top) + + + + movd %mm2, %eax + emms_or_femms + ret + +L(zero): + movl $0, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm new file mode 100644 index 0000000000..cc5948f26c --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm @@ -0,0 +1,122 @@ +dnl AMD K6 mpn_rshift -- mpn right shift. +dnl +dnl K6: 3.0 cycles/limb + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. 
+dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx +C instructions. This is despite every second fetch being unaligned. + + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + C The 1 limb case can be done without the push %ebx, but it's then + C still the same speed. The push is left as a free helping hand for + C the two_or_more code. + + movl PARAM_SIZE, %eax + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + decl %eax + + movl PARAM_SHIFT, %ecx + jnz L(two_or_more) + + movl (%ebx), %edx C src limb + movl PARAM_DST, %ebx + + shrdl( %cl, %edx, %eax) C return value + + shrl %cl, %edx + + movl %edx, (%ebx) C dst limb + popl %ebx + + ret + + + ALIGN(16) C avoid offset 0x1f +L(two_or_more): + C eax size-1 + C ebx src + C ecx shift + C edx + + movl (%ebx), %edx C src low limb + negl %ecx + + addl $32, %ecx C 32-shift + movd PARAM_SHIFT, %mm6 + + shll %cl, %edx C retval + movl PARAM_DST, %ecx + + leal (%ebx,%eax,4), %ebx + + leal -4(%ecx,%eax,4), %ecx + negl %eax + + +L(simple): + C eax counter (negative) + C ebx &src[size-1] + C ecx &dst[size-1] + C edx retval + C + C mm0 scratch + C mm6 shift + +Zdisp( movq, 0,(%ebx,%eax,4), %mm0) + incl %eax + + psrlq %mm6, %mm0 + +Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) + jnz L(simple) + + + movq %mm0, (%ecx) + movl %edx, %eax + + popl %ebx + + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm new file mode 100644 index 0000000000..c2220fe4ca --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mul_1.asm @@ -0,0 +1,272 @@ +dnl AMD K6 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K6: 6.25 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. 
If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the result. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl minimum 5 because the unrolled code can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) + +PROLOGUE(mpn_mul_1c) + pushl %esi +deflit(`FRAME',4) + movl PARAM_CARRY, %esi + jmp LF(mpn_mul_1,start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) + push %esi +deflit(`FRAME',4) + xorl %esi, %esi C initial carry + +L(start_nc): + mov PARAM_SIZE, %ecx + push %ebx +FRAME_pushl() + + movl PARAM_SRC, %ebx + push %edi +FRAME_pushl() + + movl PARAM_DST, %edi + pushl %ebp +FRAME_pushl() + + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_MULTIPLIER, %ebp + + jae L(unroll) + + + C code offset 0x22 here, close enough to aligned +L(simple): + C eax scratch + C ebx src + C ecx counter + C edx scratch + C esi carry + C edi dst + C ebp multiplier + C + C this loop 8 cycles/limb + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(simple) + + + popl %ebp + + popl %edi + popl %ebx + + movl %esi, %eax + popl %esi + + ret + + +C ----------------------------------------------------------------------------- +C The code for each limb is 6 cycles, with instruction decoding being the +C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25 +C cycles/limb in total. +C +C The secret ingredient to get 6.25 is to start the loop with the mul and +C have the load/store pair at the end. Rotating the load/store to the top +C is an 0.5 c/l slowdown. (Some address generation effect probably.) +C +C The whole unrolled loop fits nicely in exactly 80 bytes. 
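+C
+C In other words each limb in the body costs 6 cycles and the loop control
+C adds 1 cycle per pass of 4 limbs: (4*6 + 1)/4 = 6.25 cycles/limb.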
+ + + ALIGN(16) C already aligned to 16 here actually +L(unroll): + movl (%ebx), %eax + leal -16(%ebx,%ecx,4), %ebx + + leal -16(%edi,%ecx,4), %edi + subl $4, %ecx + + negl %ecx + + + ALIGN(16) C one byte nop for this alignment +L(top): + C eax scratch + C ebx &src[size-4] + C ecx counter + C edx scratch + C esi carry + C edi &dst[size-4] + C ebp multiplier + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi,%ecx,4) + movl 12(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 12(%edi,%ecx,4) + movl 16(%ebx,%ecx,4), %eax + + + addl $4, %ecx + js L(top) + + + + C eax next src limb + C ebx &src[size-4] + C ecx 0 to 3 representing respectively 4 to 1 further limbs + C edx + C esi carry + C edi &dst[size-4] + + testb $2, %cl + jnz L(finish_not_two) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + movl 4(%ebx,%ecx,4), %eax + + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 4(%edi,%ecx,4) + movl 8(%ebx,%ecx,4), %eax + + addl $2, %ecx +L(finish_not_two): + + + testb $1, %cl + jnz L(finish_not_one) + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, 8(%edi) + movl 12(%ebx), %eax +L(finish_not_one): + + + mull %ebp + + addl %esi, %eax + popl %ebp + + adcl $0, %edx + + movl %eax, 12(%edi) + popl %edi + + popl %ebx + movl %edx, %eax + + popl %esi + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm new file mode 100644 index 0000000000..1f5a3a4b4b --- /dev/null +++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm @@ -0,0 +1,600 @@ +dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop +dnl unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K6: UNROLL_COUNT cycles/product (approx) +dnl 8 9.75 +dnl 16 9.3 +dnl 32 9.3 +dnl Maximum possible with the current code is 32. +dnl +dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which +dnl might explain it's good performance. 
+ +deflit(UNROLL_COUNT, 16) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() entry code only +C once. The saving is about 10-20% on typical sizes coming from the +C Karatsuba multiply code. +C +C Future: +C +C The unrolled loop could be shared by mpn_addmul_1, with some extra stack +C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be +C 256 bytes. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two_limbs) + je L(two_by_something) + + + C one limb by one limb + + movl (%edx), %edx C xp low limb + movl PARAM_WP, %ecx + + mull %edx + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): + decl PARAM_YSIZE + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_WP, %ebx + pushl %esi +deflit(`FRAME',8) + + movl %eax, %ecx C yp low limb + movl (%edx), %eax C xp low limb + + movl %edx, %esi C xp + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + movl %esi, 4(%ebx) + + adcl $0, %edx + + movl %edx, 8(%ebx) + popl %esi + + popl %ebx + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp +deflit(`FRAME',8) + + mull %ecx C xp[0] * yp[0] + + push %edi +deflit(`FRAME',12) + movl %eax, (%ebx) + + movl 4(%esi), %eax + movl %edx, %edi C carry, for wp[1] + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + + movl %edi, 4(%ebx) + movl 4(%ecx), %ecx C yp[1] + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + + movl (%esi), %eax C xp[0] + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + adcl $0, %esi + + movl %edi, 8(%ebx) + popl %edi + + movl %esi, 12(%ebx) + popl %esi + + popl %ebx + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two_limbs): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 10-20 +C limb operations the Karatsuba code calls here with. 
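+C
+C In plain C the overall structure is as follows (a sketch only, written
+C with the real mpn entry points for clarity and a made-up ref_ name; the
+C code here inlines both the mul_1 and the addmul_1 loops):
+C
+C	void
+C	ref_mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
+C	                  mp_srcptr yp, mp_size_t ysize)
+C	{
+C	  mp_size_t  i;
+C	  /* first row: wp[0..xsize] = xp * yp[0] */
+C	  wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C
+C	  /* remaining rows: add xp * yp[i], shifted up i limbs */
+C	  for (i = 1; i < ysize; i++)
+C	    wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);
+C	}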
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp +deflit(`FRAME',0) + + pushl %edi defframe_pushl(SAVE_EDI) + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_WP, %edi + pushl %esi defframe_pushl(SAVE_ESI) + + movl %eax, %ebp + pushl %ebx defframe_pushl(SAVE_EBX) + + leal (%edx,%ecx,4), %ebx C xp end + xorl %esi, %esi + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of mul1 + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi,%ecx,4) + incl %ecx + + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) C final carry + + movl PARAM_XSIZE, %ecx + decl %edx + + jnz L(ysize_more_than_one_limb) + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +L(ysize_more_than_one_limb): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- +C Simple addmul loop. +C +C Using ebx and edi pointing at the ends of their respective locations saves +C a couple of instructions in the outer loop. The inner loop is still 11 +C cycles, the same as the simple loop in aorsmul_1.asm. + + C eax yp + C ebx xp end + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 + C ebp + + movl 4(%eax), %ebp C multiplier + negl %ecx + + movl %ecx, PARAM_XSIZE C -xsize + xorl %esi, %esi C initial carry + + leal 4(%eax,%edx,4), %eax C yp end + negl %edx + + movl %eax, PARAM_YP + movl %edx, PARAM_YSIZE + + jmp L(simple_outer_entry) + + + C aligning here saves a couple of cycles + ALIGN(16) +L(simple_outer_top): + C edx ysize counter, negative + + movl PARAM_YP, %eax C yp end + xorl %esi, %esi C carry + + movl PARAM_XSIZE, %ecx C -xsize + movl %edx, PARAM_YSIZE + + movl (%eax,%edx,4), %ebp C yp limb multiplier +L(simple_outer_entry): + addl $4, %edi + + +L(simple_inner): + C eax scratch + C ebx xp end + C ecx counter, negative + C edx scratch + C esi carry + C edi wp end of this addmul + C ebp multiplier + + movl (%ebx,%ecx,4), %eax + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl $0, %edx + addl %eax, (%edi,%ecx,4) + adcl %edx, %esi + + incl %ecx + jnz L(simple_inner) + + + movl PARAM_YSIZE, %edx + movl %esi, (%edi) + + incl %edx + jnz L(simple_outer_top) + + + popl %ebx + popl %esi + popl %ebp + popl %edi + ret + + +C ----------------------------------------------------------------------------- +C Unrolled loop. +C +C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for +C some comments. +C +C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to +C 0, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop +C is entered. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. This can't just be fetched through the xp +C pointer because of the offset applied to it. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. +C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. 
+C +C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added +C to give the starting point in the destination for each unrolled loop (this +C point is one limb upwards for each limb of yp processed). +C +C Having PARAM_YSIZE count negative to zero means it's not necessary to +C store new values of PARAM_YP and PARAM_WP on each loop. Those values on +C the stack remain constant and on each loop an leal adjusts them with the +C PARAM_YSIZE counter value. + + +defframe(VAR_COUNTER, -20) +defframe(VAR_COUNTER_INIT, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_STACK_SPACE, 16) + +dnl For some strange reason using (%esp) instead of 0(%esp) is a touch +dnl slower in this code, hence the defframe empty-if-zero feature is +dnl disabled. +dnl +dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the +dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects +dnl anything isn't clear. +dnl +define(`defframe_empty_if_zero_disabled',1) + +L(unroll): + C eax yp (not used) + C ebx xp end (not used) + C ecx xsize + C edx ysize-1 + C esi + C edi wp end of mul1 (not used) + C ebp +deflit(`FRAME', 16) + + leal -2(%ecx), %ebp C one limb processed at start, + decl %ecx C and ebp is one less + + shrl $UNROLL_LOG2, %ebp + negl %ecx + + subl $VAR_STACK_SPACE, %esp +deflit(`FRAME', 16+VAR_STACK_SPACE) + andl $UNROLL_MASK, %ecx + + movl %ecx, %esi + shll $4, %ecx + + movl %ebp, VAR_COUNTER_INIT + negl %esi + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%esi,1), %ecx +') + + movl PARAM_XP, %ebx + movl %ebp, VAR_COUNTER + + movl PARAM_WP, %edi + movl %ecx, VAR_JMP + + movl (%ebx), %eax + leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1 + + leal (%ebx,%esi,4), %ebx C xp adjust for unrolling + + movl %eax, VAR_XP_LOW + + movl %ebx, PARAM_XP + movl PARAM_YP, %ebx + + leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing + movl 4(%ebx), %ebp C multiplier (yp second limb) + + leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing + + movl %ecx, PARAM_WP + + leal 1(%esi), %ecx C adjust parity for decl %ecx above + + movl %ebx, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%esi,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C ----------------------------------------------------------------------------- + C Aligning here saves a couple of cycles per loop. Using 32 doesn't + C cost any extra space, since the inner unrolled loop below is + C aligned to 32. 
+ ALIGN(32) +L(unroll_outer_top): + C edx ysize + + movl PARAM_YP, %eax + movl %edx, PARAM_YSIZE C incremented ysize counter + + movl PARAM_WP, %edi + + movl VAR_COUNTER_INIT, %ebx + movl (%eax,%edx,4), %ebp C next multiplier + + movl PARAM_XSIZE, %ecx + leal (%edi,%edx,4), %edi C adjust wp for where we are in yp + + movl VAR_XP_LOW, %eax + movl %ebx, VAR_COUNTER + +L(unroll_outer_entry): + mull %ebp + + C using testb is a tiny bit faster than testl + testb $1, %cl + + movl %eax, %ecx C low carry + movl VAR_JMP, %eax + + movl %edx, %esi C high carry + movl PARAM_XP, %ebx + + jnz L(unroll_noswap) + movl %ecx, %esi C high,low carry other way around + + movl %edx, %ecx +L(unroll_noswap): + + jmp *%eax + + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax scratch + C ebx xp + C ecx carry low + C edx scratch + C esi carry high + C edi wp + C ebp multiplier + C VAR_COUNTER loop counter + C + C 15 code bytes each limb + + leal UNROLL_BYTES(%edi), %edi + +L(unroll_entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4)) + deflit(`disp1', eval(disp0 + 4)) + deflit(`disp2', eval(disp1 + 4)) + + movl disp1(%ebx), %eax + mull %ebp +Zdisp( addl, %ecx, disp0,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) + + movl disp2(%ebx), %eax + mull %ebp + addl %esi, disp1(%edi) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%ebx), %ebx + + jns L(unroll_top) + + + movl PARAM_YSIZE, %edx + addl %ecx, UNROLL_BYTES(%edi) + + adcl $0, %esi + + incl %edx + movl %esi, UNROLL_BYTES+4(%edi) + + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + addl $FRAME, %esp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm new file mode 100644 index 0000000000..70d49b3e57 --- /dev/null +++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm @@ -0,0 +1,672 @@ +dnl AMD K6 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular +dnl product (measured on the speed difference between 17 and 33 limbs, +dnl which is roughly the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this +dnl code supports. This value is used only by the tune program to know +dnl what it can go up to. 
(An attempt to compile with a bigger value will +dnl trigger some m4_assert()s in the code, making the build fail.) +dnl +dnl The value is determined by requiring the displacements in the unrolled +dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of +dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + + +dnl Allow a value from the tune program to override config.m4. + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + + +dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The +dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since +dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD. +dnl +dnl The first addmul is the biggest, and this takes the second least +dnl significant limb and multiplies it by the third least significant and +dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1 +dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3. + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed +C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 35x35 that do need all of it will +C at least be getting value for money, because 35x35 spends something like +C 5780 cycles here. +C +C Different values of UNROLL_COUNT give slightly different speeds, between +C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs. +C This isn't a big difference, but it's presumably some alignment effect +C which if understood could give a simple speedup. 
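+C
+C The overall scheme can be shown as a plain C sketch (a made-up ref_
+C routine using the real mpn entry points; the code below inlines and
+C unrolls all of this and handles the small sizes specially):
+C
+C	void
+C	ref_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t  i;
+C	  mp_limb_t  cy = 0;
+C
+C	  /* off-diagonal products src[i]*src[j] for i<j, as a triangle */
+C	  dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	  for (i = 1; i < size-1; i++)
+C	    dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
+C
+C	  /* double the triangle, the bit shifted out becomes the top limb */
+C	  dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C
+C	  /* add the squares src[i]^2 along the diagonal; only here does
+C	     dst[0] get set (it just receives the low limb of src[0]^2) */
+C	  dst[0] = 0;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      unsigned long long  p = (unsigned long long) src[i] * src[i];
+C	      unsigned long long  s =
+C	        (unsigned long long) dst[2*i] + (mp_limb_t) p + cy;
+C	      dst[2*i] = (mp_limb_t) s;
+C	      s = (unsigned long long) dst[2*i+1] + (mp_limb_t) (p>>32) + (s>>32);
+C	      dst[2*i+1] = (mp_limb_t) s;
+C	      cy = (mp_limb_t) (s >> 32);
+C	    }
+C	}
+C
+C (The sketch assumes size >= 2 and 32-bit limbs.)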
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + + cmpl $2, %ecx + je L(two_limbs) + + movl PARAM_DST, %edx + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src +deflit(`FRAME',4) + + movl (%ebx), %eax + movl PARAM_DST, %ecx + + mull %eax C src[0]^2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) + movl (%ebx), %eax + + movl %edx, 12(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0]*src[1] + + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + popl %ebx + addl %eax, 4(%ecx) + + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): +deflit(`FRAME',0) + cmpl $4, %ecx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src + C ecx size + C edx dst + + pushl %ebx + movl %eax, %ebx C src + + movl (%ebx), %eax + movl %edx, %ecx C dst + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) + pushl %esi + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + + movl %edx, 12(%ecx) + pushl %edi + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + + movl %edx, 20(%ecx) + movl 4(%ebx), %edx + + mull %edx C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + + movl %edx, %edi + movl 8(%ebx), %edx + + pushl %ebp + xorl %ebp, %ebp + + mull %edx C src[0] * src[2] + + addl %eax, %edi + movl 4(%ebx), %eax + + adcl %edx, %ebp + + movl 8(%ebx), %edx + + mull %edx C src[1] * src[2] + + addl %eax, %ebp + + adcl $0, %edx + + + C eax will be dst[5] + C ebx + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + xorl %eax, %eax + addl %esi, %esi + adcl %edi, %edi + adcl %ebp, %ebp + adcl %edx, %edx + adcl $0, %eax + + addl %esi, 4(%ecx) + adcl %edi, 8(%ecx) + adcl %ebp, 12(%ecx) + + popl %ebp + popl %edi + + adcl %edx, 16(%ecx) + + popl %esi + popl %ebx + + adcl %eax, 20(%ecx) + ASSERT(nc) + + ret + + +C ----------------------------------------------------------------------------- + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(STACK_SPACE, 24) + + ALIGN(16) +L(four_or_more): + + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C +C A test was done calling mpn_mul_1 here to get the benefit of its unrolled +C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off +C a 5780 cycle operation, which is not surprising since the loop here is 8 +C c/l and mpn_mul_1 is 6.25 c/l. 
+ + subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + leal 4(%edx), %edi + + movl %ebx, SAVE_EBX + leal 4(%eax), %ebx + + movl %esi, SAVE_ESI + xorl %esi, %esi + + movl %ebp, SAVE_EBP + + C eax + C ebx src+4 + C ecx size + C edx + C esi + C edi dst+4 + C ebp + + movl (%eax), %ebp C multiplier + leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary + + + ALIGN(16) +L(mul_1): + C eax scratch + C ebx src ptr + C ecx counter + C edx scratch + C esi carry + C edi dst ptr + C ebp multiplier + + movl (%ebx), %eax + addl $4, %ebx + + mull %ebp + + addl %esi, %eax + movl $0, %esi + + adcl %edx, %esi + + movl %eax, (%edi) + addl $4, %edi + + loop L(mul_1) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K6 doesn't do any branch prediction on indirect jumps, which is good +C actually because it's a different target each time. The unrolled addmul +C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of +C the indirect jump is quickly recovered. + + +dnl This value is also implicitly encoded in a shift and add. +dnl +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31. Above that an offset must be added to them. +dnl +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + + C eax + C ebx &src[size] + C ecx + C edx + C esi carry + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %esi, (%edi) + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx +ifelse(OFFSET,0,, +` subl $OFFSET, %ebx') + + shll $4, %ecx +ifelse(OFFSET,0,, +` subl $OFFSET, %edi') + + negl %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + + + C The calculated jump mustn't be before the start of the available + C code. This is the limitation UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. + C + ASSERT(ae,` + movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx + ') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx &src[size], constant + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi high limb to store + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand + + mull %ebp + + testb $1, %cl + + movl %edx, %esi C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + movl %edx, VAR_JMP + leal 4(%edi), %edi + + C A branch-free version of this using some xors was found to be a + C touch slower than just a conditional jump, despite the jump + C switching between taken and not taken on every loop. 
+ +ifelse(eval(UNROLL_COUNT%2),0, + jz,jnz) L(unroll_noswap) + movl %esi, %eax C high,low carry other way around + + movl %ecx, %esi + movl %eax, %ecx +L(unroll_noswap): + + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/esi should start. + C + C An attempt was made at padding here to get the end of the unrolled + C code to come out on a good alignment, to save padding before + C L(corner). This worked, but turned out to run slower than just an + C ALIGN(2). The reason for this is not clear, it might be related + C to the different speeds on different UNROLL_COUNTs noted above. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx src + C ecx carry low + C edx scratch + C esi carry high + C edi dst + C ebp multiplier + C + C 15 code bytes each limb + C ecx/esi swapped on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %esi, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %esi + jadcl0( %esi) +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%ebx), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %esi + movl %edx, %ecx + jadcl0( %ecx) +') +') +L(unroll_inner_end): + + addl %esi, -4+OFFSET(%edi) + + movl VAR_COUNTER, %edx + jadcl0( %ecx) + + movl %ecx, m4_empty_if_zero(OFFSET)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %ebx + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C ebx &src[size] + C edi &dst[2*size-5] + + movl -12(%ebx), %ebp + + movl -8(%ebx), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + adcl $0, %edx + + movl -4(%ebx), %eax + movl %edx, %esi + movl %eax, %ebx + + mull %ebp + + addl %esi, %eax + adcl $0, %edx + + addl %eax, (%edi) + adcl $0, %edx + + movl %edx, %esi + movl %ebx, %eax + + mull %ecx + + addl %esi, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. +C The loop measures about 6 cycles/iteration, though it looks like it should +C decode in 5. + +L(lshift_start): + movl PARAM_SIZE, %ecx + + movl PARAM_DST, %edi + subl $1, %ecx C size-1 and clear carry + + movl PARAM_SRC, %ebx + movl %ecx, %edx + + xorl %eax, %eax C ready for adcl + + + ALIGN(16) +L(lshift): + C eax + C ebx src (for later use) + C ecx counter, decrementing + C edx size-1 (for later use) + C esi + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + leal 8(%edi), %edi + loop L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%ebx), %eax C src[0] + + leal 4(%ebx,%edx,4), %ebx C &src[size] + subl %edx, %ecx C -(size-1) + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + + ALIGN(16) +L(diag): + C eax scratch + C ebx &src[size] + C ecx counter, negative + C edx carry + C esi scratch + C edi dst[2*size-2] + C ebp + + movl (%ebx,%ecx,4), %eax + movl %edx, %esi + + mull %eax + + addl %esi, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README new file mode 100644 index 0000000000..c34315c401 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/README @@ -0,0 +1,145 @@ + + AMD K7 MPN SUBROUTINES + + +This directory contains code optimized for the AMD Athlon CPU. + +The mmx subdirectory has routines using MMX instructions. All Athlons have +MMX, the separate directory is just so that configure can omit it if the +assembler doesn't support MMX. + + + +STATUS + +Times for the loops, with all code and data in L1 cache. + + cycles/limb + mpn_add/sub_n 1.6 + + mpn_copyi 0.75 or 1.0 \ varying with data alignment + mpn_copyd 0.75 or 1.0 / + + mpn_divrem_1 17.0 integer part, 15.0 fractional part + mpn_mod_1 17.0 + mpn_divexact_by3 8.0 + + mpn_l/rshift 1.2 + + mpn_mul_1 3.4 + mpn_addmul/submul_1 3.9 + + mpn_mul_basecase 4.42 cycles/crossproduct (approx) + + mpn_popcount 5.0 + mpn_hamdist 6.0 + +Prefetching of sources hasn't yet been tried. + + + +NOTES + +cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available. + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Floating point multiplications can be done in parallel with integer +multiplications, but there doesn't seem to be any way to make use of this. + +Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on +the speed of the multiplication routines. The documentation shows mul +executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that, +to get near 3 cycles code has to be arranged so that nothing else is issued +to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other +apparently equivalent code takes 5. + + + +OPTIMIZATIONS + +Unrolled loops are used to reduce looping overhead. The unrolling is +configurable up to 32 limbs/loop for most routines and up to 64 for some. +The K7 has 64k L1 code cache so quite big unrolling is allowable. + +Computed jumps into the unrolling are used to handle sizes not a multiple of +the unrolling. An attractive feature of this is that times increase +smoothly with operand size, but it may be that some routines should just +have simple loops to finish up, especially when PIC adds between 2 and 16 +cycles to get %eip. + +Position independent code is implemented using a call to get %eip for the +computed jumps and a ret is always done, rather than an addl $4,%esp or a +popl, so the CPU return address branch prediction stack stays synchronised +with the actual stack in memory. + +Branch prediction, in absence of any history, will guess forward jumps are +not taken and backward jumps are taken. Where possible it's arranged that +the less likely or less important case is under a taken forward jump. 
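+
+As an illustration of the computed jumps into the unrolling mentioned
+above, the effect is similar to a Duff's-device style entry into an
+unrolled loop, though the actual routines compute a code address and jump
+to it rather than using a switch.  A rough C analogy only, not how any of
+the routines here are written:
+
+	void
+	copy4 (unsigned *dst, const unsigned *src, long size)  /* size > 0 */
+	{
+	  long  n = (size + 3) / 4;    /* passes through the loop body */
+	  switch (size % 4)            /* enter at the leftover count */
+	    {
+	    case 0: do {  *dst++ = *src++;
+	    case 3:       *dst++ = *src++;
+	    case 2:       *dst++ = *src++;
+	    case 1:       *dst++ = *src++;
+	            } while (--n > 0);
+	    }
+	}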
+ + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three direct-path instructions which have no +successive dependencies. K7 always decodes three and has out-of-order +execution, but the groupings show what slots might be available and what +dependency chains exist. + +When there's vector-path instructions an effort is made to get triplets of +direct-path instructions in between them, even if there's dependencies, +since this maximizes decoding throughput and might save a cycle or two if +decoding is the limiting factor. + + + +INSTRUCTIONS + +adcl direct +divl 39 cycles back-to-back +lodsl,etc vector +loop 1 cycle vector (decl/jnz opens up one decode slot) +movd reg vector +movd mem direct +mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word +popl vector (use movl for more than one pop) +pushl direct, will pair with a load +shrdl %cl vector, 3 cycles, seems to be 3 decode too +xorl r,r false read dependency recognised + + + +REFERENCES + +"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number +22007, revision E, November 1999. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf + +"3DNow Technology Manual", AMD publication number 21928F/0-August 1999. +This describes the femms and prefetch instructions. Available on-line, + + http://www.amd.com/K6/k6docs/pdf/21928.pdf + +"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD +publication number 22466, revision B, August 1999. This describes +instructions added in the Athlon processor, such as pswapd and the extra +prefetch forms. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf + +"3DNow Instruction Porting Guide", AMD publication number 22621, revision B, +August 1999. This has some notes on general Athlon optimizations as well as +3DNow. Available on-line, + + http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf + + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm new file mode 100644 index 0000000000..85fa9d3036 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/aors_n.asm @@ -0,0 +1,250 @@ +dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. +dnl +dnl K7: 1.64 cycles/limb (at 16 limb/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 1.9 +dnl 16 1.64 +dnl 32 1.7 +dnl 64 2.0 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_add_n', ` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + define(M4_description, add) +',`ifdef(`OPERATION_sub_n', ` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + define(M4_description, subtract) +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); +C +C Calculate src1,size M4_description src2,size, and store the result in +C dst,size. The return value is the carry bit from the top of the result (1 +C or 0). +C +C The _nc version accepts 1 or 0 for an initial carry into the low limb of +C the calculation. Note values other than 1 or 0 here will lead to garbage +C results. +C +C This code runs at 1.64 cycles/limb, which is probably the best possible +C with plain integer operations. Each limb is 2 loads and 1 store, and in +C one cycle the K7 can do two loads, or a load and a store, leading to 1.5 +C c/l. + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 8) +',` +deflit(UNROLL_THRESHOLD, 8) +') + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +defframe(SAVE_EDI, -16) +deflit(STACK_SPACE, 16) + + .text + ALIGN(32) +deflit(`FRAME',0) + +PROLOGUE(M4_function_nc) + movl PARAM_CARRY, %eax + jmp LF(M4_function_n,start) +EPILOGUE() + +PROLOGUE(M4_function_n) + + xorl %eax, %eax C carry +L(start): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_SRC2, %edx + movl PARAM_SRC1, %ebx + jae L(unroll) + + movl PARAM_DST, %edi + leal (%ebx,%ecx,4), %ebx + leal (%edx,%ecx,4), %edx + + leal (%edi,%ecx,4), %edi + negl %ecx + shrl %eax + + C This loop in in a single 16 byte code block already, so no + C alignment necessary. +L(simple): + C eax scratch + C ebx src1 + C ecx counter + C edx src2 + C esi + C edi dst + C ebp + + movl (%ebx,%ecx,4), %eax + M4_inst (%edx,%ecx,4), %eax + movl %eax, (%edi,%ecx,4) + incl %ecx + jnz L(simple) + + movl $0, %eax + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + setc %al + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- + C This is at 0x55, close enough to aligned. 
+L(unroll): +deflit(`FRAME',STACK_SPACE) + movl %ebp, SAVE_EBP + andl $-2, %ecx C size low bit masked out + andl $1, PARAM_SIZE C size low bit kept + + movl %ecx, %edi + decl %ecx + movl PARAM_DST, %ebp + + shrl $UNROLL_LOG2, %ecx + negl %edi + movl %esi, SAVE_ESI + + andl $UNROLL_MASK, %edi + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edi,%edi,8), %esi C 9 bytes per +') + negl %edi + shrl %eax + + leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx + leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx + leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edi,%edi,8), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax zero + C ebx src1 + C ecx counter + C edx src2 + C esi scratch (was computed jump) + C edi dst + C ebp scratch + + leal UNROLL_BYTES(%edx), %edx + +L(entry): +deflit(CHUNK_COUNT, 2) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%ebx), %esi) + movl disp1(%ebx), %ebp +Zdisp( M4_inst,disp0,(%edx), %esi) +Zdisp( movl, %esi, disp0,(%edi)) + M4_inst disp1(%edx), %ebp + movl %ebp, disp1(%edi) +') + + decl %ecx + leal UNROLL_BYTES(%ebx), %ebx + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + + mov PARAM_SIZE, %esi + movl SAVE_EBP, %ebp + movl $0, %eax + + decl %esi + js L(even) + + movl (%ebx), %ecx + M4_inst UNROLL_BYTES(%edx), %ecx + movl %ecx, (%edi) +L(even): + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + setc %al + + movl SAVE_ESI, %esi + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm new file mode 100644 index 0000000000..9f9c3daaf4 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/aorsmul_1.asm @@ -0,0 +1,364 @@ +dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl K7: 3.9 cycles/limb. +dnl +dnl Future: It should be possible to avoid the separate mul after the +dnl unrolled loop by moving the movl/adcl to the top. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 4.42 +dnl 8 4.16 +dnl 16 3.9 +dnl 32 3.9 +dnl 64 3.87 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1',` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1',` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 9) +',` +deflit(UNROLL_THRESHOLD, 6) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(SAVE_SIZE, 16) + + .text + ALIGN(32) +PROLOGUE(M4_function_1) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + xorl %ecx, %ecx + + decl %edx + jnz LF(M4_function_1c,start_1) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + M4_inst %eax, (%ecx) + adcl $0, %edx + movl %edx, %eax + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(M4_function_1c) + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + decl %edx + jnz L(more_than_one_limb) + + movl (%eax), %eax + movl PARAM_DST, %ecx + + mull PARAM_MULTIPLIER + + addl PARAM_CARRY, %eax + + adcl $0, %edx + M4_inst %eax, (%ecx) + + adcl $0, %edx + movl %edx, %eax + + ret + + + C offset 0x44 so close enough to aligned +L(more_than_one_limb): + movl PARAM_CARRY, %ecx +L(start_1): + C eax src + C ecx initial carry + C edx size-1 + subl $SAVE_SIZE, %esp +deflit(`FRAME',16) + + movl %ebx, SAVE_EBX + movl %esi, SAVE_ESI + movl %edx, %ebx C size-1 + + movl PARAM_SRC, %esi + movl %ebp, SAVE_EBP + cmpl $UNROLL_THRESHOLD, %edx + + movl PARAM_MULTIPLIER, %ebp + movl %edi, SAVE_EDI + + movl (%esi), %eax C src low limb + movl PARAM_DST, %edi + ja L(unroll) + + + C simple loop + + leal 4(%esi,%ebx,4), %esi C point one limb past last + leal (%edi,%ebx,4), %edi C point at last limb + negl %ebx + + C The movl to load the next source limb is done well ahead of the + C mul. This is necessary for full speed, and leads to one limb + C handled separately at the end. 
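+C
+C In C terms the operation performed is as follows (addmul flavour shown;
+C the submul flavour subtracts the products and returns a borrow instead).
+C This is a sketch assuming 32-bit limbs, with a made-up ref_ name, not
+C the actual GMP source:
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C	              mp_limb_t mult)
+C	{
+C	  mp_limb_t  carry = 0;
+C	  mp_size_t  i;
+C	  for (i = 0; i < size; i++)
+C	    {
+C	      unsigned long long  p =
+C	        (unsigned long long) src[i] * mult + carry;
+C	      unsigned long long  s = (unsigned long long) dst[i] + (mp_limb_t) p;
+C	      dst[i] = (mp_limb_t) s;
+C	      carry  = (mp_limb_t) (p >> 32) + (mp_limb_t) (s >> 32);
+C	    }
+C	  return carry;
+C	}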
+ +L(simple): + C eax src limb + C ebx loop counter + C ecx carry limb + C edx scratch + C esi src + C edi dst + C ebp multiplier + + mull %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi,%ebx,4) + movl (%esi,%ebx,4), %eax + adcl $0, %edx + + incl %ebx + movl %edx, %ecx + jnz L(simple) + + + mull %ebp + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + addl %eax, %ecx + adcl $0, %edx + + M4_inst %ecx, (%edi) + adcl $0, %edx + movl SAVE_EDI, %edi + + addl $SAVE_SIZE, %esp + movl %edx, %eax + ret + + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax src low limb + C ebx size-1 + C ecx carry + C edx size-1 + C esi src + C edi dst + C ebp multiplier + +dnl overlapping with parameters no longer needed +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_MULTIPLIER') + + subl $2, %ebx C (size-2)-1 + decl %edx C size-2 + + shrl $UNROLL_LOG2, %ebx + negl %edx + + movl %ebx, VAR_COUNTER + andl $UNROLL_MASK, %edx + + movl %edx, %ebx + shll $4, %edx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%edx,%ebx,1), %edx +') + negl %ebx + movl %edx, VAR_JUMP + + mull %ebp + + addl %eax, %ecx C initial carry, becomes low carry + adcl $0, %edx + testb $1, %bl + + movl 4(%esi), %eax C src second limb + leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi + + movl %edx, %ebx C high carry + cmovnz( %ecx, %ebx) C high,low carry other way around + cmovnz( %edx, %ecx) + + jmp *VAR_JUMP + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%edx,%ebx,1), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ----------------------------------------------------------------------------- +C This code uses a "two carry limbs" scheme. At the top of the loop the +C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For +C the computed jump an odd size means they start one way around, an even +C size the other. Either way one limb is handled separately at the start of +C the loop. +C +C The positioning of the movl to load the next source limb is important. +C Moving it after the adcl with a view to avoiding a separate mul at the end +C of the loop slows the code down. 
+ + ALIGN(32) +L(top): + C eax src limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src+8 + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 17 bytes each limb + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + + mull %ebp + +Zdisp( M4_inst,%ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ecx + + + mull %ebp + + M4_inst %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx + + movl disp1(%esi), %eax + adcl %edx, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jns L(top) + + + C eax src limb + C ebx carry high + C ecx carry low + C edx + C esi + C edi dst (points at second last limb) + C ebp multiplier +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + mull %ebp + + M4_inst %ecx, disp0(%edi) + movl SAVE_EBP, %ebp + + adcl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + adcl $0, %edx + M4_inst %eax, disp1(%edi) + movl SAVE_EDI, %edi + + adcl $0, %edx + addl $SAVE_SIZE, %esp + + movl %edx, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm new file mode 100644 index 0000000000..57684958a5 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/diveby3.asm @@ -0,0 +1,131 @@ +dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl K7: 8.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(32) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + pushl %ebx defframe_pushl(SAVE_EBX) + + movl PARAM_CARRY, %ebx + pushl %ebp defframe_pushl(SAVE_EBP) + + movl PARAM_SIZE, %ebp + pushl %edi defframe_pushl(SAVE_EDI) + + movl (%ecx), %eax C src low limb + pushl %esi defframe_pushl(SAVE_ESI) + + movl PARAM_DST, %edi + movl $TWO_THIRDS_FLOOR, %esi + leal -4(%ecx,%ebp,4), %ecx C &src[size-1] + + subl %ebx, %eax + + setc %bl + decl %ebp + jz L(last) + + leal (%edi,%ebp,4), %edi C &dst[size-1] + negl %ebp + + + ALIGN(16) +L(top): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp counter, limbs, negative + + imull $INVERSE_3, %eax, %edx + + movl 4(%ecx,%ebp,4), %eax C next src limb + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %ebx C +1 if result>=ceil(b/3) + cmpl %edx, %esi + + sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3) + movl %edx, (%edi,%ebp,4) + incl %ebp + + setc %bl C new carry + jnz L(top) + + + +L(last): + C eax src limb, carry subtracted + C ebx carry limb (0 or 1) + C ecx &src[size-1] + C edx scratch + C esi multiplier + C edi &dst[size-1] + C ebp + + imull $INVERSE_3, %eax + + cmpl $ONE_THIRD_CEIL, %eax + movl %eax, (%edi) + movl SAVE_EBP, %ebp + + sbbl $-1, %ebx C +1 if eax>=ceil(b/3) + cmpl %eax, %esi + movl $0, %eax + + adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3) + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h new file mode 100644 index 0000000000..c3bba0afc4 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/gmp-mparam.h @@ -0,0 +1,100 @@ +/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. 
*/ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +/* the low limb is ready after 4 cycles, but normally it's the high limb + which is of interest, and that comes out after 6 cycles */ +#ifndef UMUL_TIME +#define UMUL_TIME 6 /* cycles */ +#endif + +/* AMD doco says 40, but it measures 39 back-to-back */ +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +/* using bsf */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 26 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 177 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 173 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 76 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 114 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 34 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 736 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 6912 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 800 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 8448 +#endif diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm new file mode 100644 index 0000000000..33214daa1f --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/copyd.asm @@ -0,0 +1,136 @@ +dnl AMD K7 mpn_copyd -- copy limb vector, decrementing. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The various comments in mpn/x86/k7/copyi.asm apply here too. 
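+C
+C The operation is simply a limb-by-limb copy done from the top limb
+C downwards, i.e. in C terms (a sketch; the code below uses 64-bit MMX
+C moves and handles the alignment cases):
+C
+C	for (i = size-1; i >= 0; i--)
+C	  dst[i] = src[i];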
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') +define(SAVE_ESI,`PARAM_SRC') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyd) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src + C ebx scratch + C ecx counter + C edx dst + C + C this loop is 2 cycles/limb + + movl -4(%eax,%ecx,4), %ebx + movl %ebx, -4(%edx,%ecx,4) + decl %ecx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %esi, SAVE_ESI + leal (%eax,%ecx,4), %ebx + leal (%edx,%ecx,4), %esi + + andl %esi, %ebx + movl SAVE_ESI, %esi + subl $4, %ecx C size-4 + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl 12(%eax,%ecx,4), %ebx + movl %ebx, 12(%edx,%ecx,4) + decl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, limbs + C edx dst + + movq 8(%eax,%ecx,4), %mm0 + movq (%eax,%ecx,4), %mm1 + subl $4, %ecx + movq %mm0, 16+8(%edx,%ecx,4) + movq %mm1, 16(%edx,%ecx,4) + jns L(top) + + + C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %cl + jz L(finish_not_two) + + movq 8(%eax,%ecx,4), %mm0 + movq %mm0, 8(%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jz L(done) + + movl (%eax), %ebx + movl %ebx, (%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm new file mode 100644 index 0000000000..b234a1628c --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/copyi.asm @@ -0,0 +1,147 @@ +dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. +dnl +dnl alignment dst/src, A=0mod8 N=4mod8 +dnl A/A A/N N/A N/N +dnl K7 0.75 1.0 1.0 0.75 + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Copy src,size to dst,size. +C +C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at +C 1.33 c/l. +C +C The K7 can do two loads, or two stores, or a load and a store, in one +C cycle, so if those are 64-bit operations then 0.5 c/l should be possible, +C however nothing under 0.7 c/l is known. 
+C +C If both source and destination are unaligned then one limb is processed at +C the start to make them aligned and so get 0.75 c/l, whereas if they'd been +C used unaligned it would be 1.5 c/l. + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl parameter space reused +define(SAVE_EBX,`PARAM_SIZE') + +dnl minimum 5 since the unrolled code can't handle less than 5 +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(32) +PROLOGUE(mpn_copyi) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl %ebx, SAVE_EBX + + movl PARAM_SRC, %eax + movl PARAM_DST, %edx + + cmpl $UNROLL_THRESHOLD, %ecx + jae L(unroll) + + orl %ecx, %ecx + jz L(simple_done) + +L(simple): + C eax src, incrementing + C ebx scratch + C ecx counter + C edx dst, incrementing + C + C this loop is 2 cycles/limb + + movl (%eax), %ebx + movl %ebx, (%edx) + decl %ecx + leal 4(%eax), %eax + leal 4(%edx), %edx + jnz L(simple) + +L(simple_done): + movl SAVE_EBX, %ebx + ret + + +L(unroll): + movl %eax, %ebx + leal -12(%eax,%ecx,4), %eax C src end - 12 + subl $3, %ecx C size-3 + + andl %edx, %ebx + leal (%edx,%ecx,4), %edx C dst end - 12 + negl %ecx + + testl $4, %ebx C testl to pad code closer to 16 bytes for L(top) + jz L(aligned) + + C both src and dst unaligned, process one limb to align them + movl (%eax,%ecx,4), %ebx + movl %ebx, (%edx,%ecx,4) + incl %ecx +L(aligned): + + + ALIGN(16) +L(top): + C eax src end - 12 + C ebx + C ecx counter, negative, limbs + C edx dst end - 12 + + movq (%eax,%ecx,4), %mm0 + movq 8(%eax,%ecx,4), %mm1 + addl $4, %ecx + movq %mm0, -16(%edx,%ecx,4) + movq %mm1, -16+8(%edx,%ecx,4) + ja L(top) C jump no carry and not zero + + + C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %cl + jnz L(finish_not_two) + + movq (%eax,%ecx,4), %mm0 + movq %mm0, (%edx,%ecx,4) +L(finish_not_two): + + testb $1, %cl + jnz L(done) + + movl 8(%eax), %ebx + movl %ebx, 8(%edx) + +L(done): + movl SAVE_EBX, %ebx + emms + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm new file mode 100644 index 0000000000..483ad6a9a1 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm @@ -0,0 +1,718 @@ +dnl AMD K7 mpn_divrem_1 -- mpn by limb division. +dnl +dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C The "and"s shown in the paper are done here with "cmov"s. "m" is written +C for m', and "d" for d_norm, which won't cause any confusion since it's +C only the normalized divisor that's of any use in the code. "b" is written +C for 2^N, the size of a limb, N being 32 here. +C +C mpn_divrem_1 avoids one division if the src high limb is less than the +C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal +C circumstances that will be a very rare event. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 42 c/l. +dnl +dnl At 3 limbs the mul is a touch faster than div on the integer part, and +dnl even more so on the fractional part. + +deflit(MUL_THRESHOLD, 3) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(32) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + + jz L(no_skip_div) + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C one less div if high<divisor + jnb L(no_skip_div) + + movl $0, (%edi,%ecx,4) C dst high limb + decl %ecx C size-1 + movl %eax, %edx C src high limb as initial carry +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + +C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs. +C It'd be possible to write them out without the looping, but no speedup +C would be expected. 
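As a picture of what both the plain divl path here and the multiply-by-inverse path further down compute: divide the size source limbs, followed by xsize zero "fraction" limbs, by the divisor, storing quotient limbs from the most significant end down and returning the final remainder. A hedged C sketch, assuming 32-bit limbs and using a 64-bit temporary where the assembler uses the edx:eax pair (ref_divrem_1 is an illustrative name, not the generic GMP routine):

    #include <stdint.h>

    static uint32_t
    ref_divrem_1 (uint32_t *dst, long xsize,
                  const uint32_t *src, long size, uint32_t divisor)
    {
      uint32_t rem = 0;
      long i;

      for (i = size - 1; i >= 0; i--)       /* integer part */
        {
          uint64_t n = ((uint64_t) rem << 32) | src[i];
          dst[xsize + i] = (uint32_t) (n / divisor);
          rem = (uint32_t) (n % divisor);
        }
      for (i = xsize - 1; i >= 0; i--)      /* fraction part, source limbs are 0 */
        {
          uint64_t n = (uint64_t) rem << 32;
          dst[i] = (uint32_t) (n / divisor);
          rem = (uint32_t) (n % divisor);
        }
      return rem;
    }

mpn_divrem_1c differs only in seeding rem with the given carry instead of 0.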
+C +C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the +C integer part, but curiously not on the fractional part, where %ebp is a +C (fixed) couple of cycles faster. + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + movl %edx, %eax + + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + bsrl %ebp, %eax C 31-l + + leal 12(%edi), %ebx + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ebx, VAR_DST_STOP + + movl %ecx, %ebx C size + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + orl %ebx, %ebx C size + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + jz L(start_zero) + movl %eax, VAR_SRC + cmpl $1, %ebx + + movl 8(%eax), %esi C src high limb + jz L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + movl %eax, VAR_SRC + jmp L(integer_one_left) + + +L(start_zero): + shll %cl, %edi C n2 = carry << l + movl $0, %esi C n10 = 0 + + C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C The multiply by inverse loop is 17 cycles, and relies on some out-of-order +C execution. The instruction scheduling is important, with various +C apparently equivalent forms running 1 to 5 cycles slower. +C +C A lower bound for the time would seem to be 16 cycles, based on the +C following successive dependencies. +C +C cycles +C n2+n1 1 +C mul 6 +C q1+1 1 +C mul 6 +C sub 1 +C addback 1 +C --- +C 16 +C +C This chain is what the loop has already, but 16 cycles isn't achieved. +C K7 has enough decode, and probably enough execute (depending maybe on what +C a mul actually consumes), but nothing running under 17 has been found. +C +C In theory n2+n1 could be done in the sub and addback stages (by +C calculating both n2 and n2+n1 there), but lack of registers makes this an +C unlikely proposition. +C +C The jz in the loop keeps the q1+1 stage to 1 cycle. 
Handling an overflow +C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent +C chain, and nothing better than 18 cycles has been found when using it. +C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will +C be an extremely rare event. +C +C Branch mispredictions will hit random occurrances of q1==0xFFFFFFFF, but +C if some special data is coming out with this always, the q1_ff special +C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to +C induce the q1_ff case, for speed measurements or testing. Note that +C 0xFFF...FFF divided by 1 or 2 doesn't induce it. +C +C The instruction groupings and empty comments show the cycles for a naive +C in-order view of the code (conveniently ignoring the load latency on +C VAR_INVERSE). This shows some of where the time is going, but is nonsense +C to the extent that out-of-order execution rearranges it. In this case +C there's 19 cycles shown, but it executes at 17. + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl VAR_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next limb and the one below it + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + movl VAR_DST, %ecx + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + + leal -4(%ecx), %ecx + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %eax, %ecx + + movl %ebx, (%ecx) + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 1 cycle (each). 
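Stripped of the scheduling, one trip around the loop above is a single invariant-integer division step. Written out in portable C it looks roughly as follows; this is a hedged sketch of the method as the comments describe it, not the GMP source. Here d is the normalized divisor (high bit set), inv is the precomputed floor((b*(b-d)-1)/d) with b=2^32, and n2:n10 is the current two-limb numerator with n2 < d.

    #include <stdint.h>

    /* One quotient limb by the multiply-by-inverse method; returns q and
       leaves the new remainder (the next n2) in *rem. */
    static uint32_t
    udiv_preinv_step (uint32_t *rem, uint32_t n2, uint32_t n10,
                      uint32_t d, uint32_t inv)
    {
      uint32_t n1 = n10 >> 31;                      /* top bit of n10 */
      uint32_t nadj = n10 + (n1 ? d : 0);           /* mod 2^32, as in the code */
      uint64_t prod = (uint64_t) inv * (n2 + n1) + nadj;
      uint32_t q1 = n2 + (uint32_t) (prod >> 32);   /* q1 is q or q-1 */

      uint64_t n = ((uint64_t) n2 << 32) | n10;
      uint64_t r = n - (uint64_t) d * (q1 + 1ULL);  /* try q1+1 first */

      if (r > 0xFFFFFFFFULL)                        /* borrow: q1+1 was one too big */
        {
          *rem = (uint32_t) (r + d);
          return q1;
        }
      *rem = (uint32_t) r;
      return q1 + 1;
    }

Because the trial product is formed in 64 bits, the q1+1 overflow that the jz/q1_ff special case handles in the assembler needs no separate branch here.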
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SRC, %ecx + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx dst + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl VAR_DST_STOP, %ecx + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + +L(integer_none): + cmpl $0, PARAM_XSIZE + jne L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl %ecx, VAR_DST + + movd %mm0, %esi C next n10 + + movl $-1, (%ecx) + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C Being the fractional part, the "source" limbs are all zero, meaning +C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated. +C +C The loop runs at 15 cycles. The dependent chain is the same as the +C general case above, but without the n2+n1 stage (due to n1==0), so 15 +C would seem to be the lower bound. +C +C A not entirely obvious simplification is that q1+1 never overflows a limb, +C and so there's no need for the sbbl $0 or jz q1_ff from the general case. 
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always. +C rnd() means rounding down to a multiple of d. +C +C m*n2 + b*n2 <= m*(d-1) + b*(d-1) +C = m*d + b*d - m - b +C = floor((b(b-d)-1)/d)*d + b*d - m - b +C = rnd(b(b-d)-1) + b*d - m - b +C = rnd(b(b-d)-1 + b*d) - m - b +C = rnd(b*b-1) - m - b +C <= (b-2)*b +C +C Unchanged from the general case is that the final quotient limb q can be +C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from +C equation 8.4 of the paper which simplifies as follows when n1==0 and +C n0==0. +C +C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b +C +C As before, the instruction groupings and empty comments show a naive +C in-order view of the code, which is made a nonsense by out of order +C execution. There's 17 cycles shown, but it executes at 15. +C +C Rotating the store q and remainder->n2 instructions up to the top of the +C loop gets the run time down from 16 to 15. + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + jmp L(fraction_entry) + + + ALIGN(16) +L(fraction_top): + C eax n2 carry, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi (will be n2) + C ebp divisor + + movl %ebx, (%ecx) C previous q + movl %eax, %edi C remainder->n2 + +L(fraction_entry): + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + C + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + sbbl $0, %ebx C q + cmpl %esi, %ecx + + jne L(fraction_top) + + + movl %ebx, (%ecx) + jmp L(fraction_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm new file mode 100644 index 0000000000..4d17c881ec --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/lshift.asm @@ -0,0 +1,472 @@ +dnl AMD K7 mpn_lshift -- mpn left shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. The bits shifted out at the left are +C the return value. +C +C The comments in mpn_rshift apply here too. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_lshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx + + shldl( %cl, %edx, %eax) C eax was decremented to zero + + shll %cl, %edx + + movl %edx, (%edi) + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 + movd (%edx,%eax,4), %mm5 C src high limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + negl %ecx + movd (%edx), %mm4 C src low limb + + addl $32, %ecx + + movd %ecx, %mm7 + +L(simple_top): + C eax loop counter, limbs + C ebx + C ecx + C edx src + C esi + C edi dst + C ebp + C + C mm0 scratch + C mm4 src low limb + C mm5 src high limb + C mm6 shift + C mm7 32-shift + + movq -4(%edx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + movd %mm0, 4(%edi,%eax,4) + jnz L(simple_top) + + + psllq %mm6, %mm5 + psllq %mm6, %mm4 + + psrlq $32, %mm5 + movd %mm4, (%edi) C dst low limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx (saved) + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src high limb, for return value + C mm6 lshift + + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + leal -4(%edx,%eax,4), %edx C &src[size-2] + + testb $4, %dl + movq (%edx), %mm1 C src high qword + + jz L(start_src_aligned) + + + C src isn't aligned, process high limb (marked xxx) separately to + C make it so + C + C source -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest -4(edi,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + psllq %mm6, %mm1 + subl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + psrlq $32, %mm1 + decl %eax C size-2 is new size-1 + + movd %mm1, 4(%edi,%eax,4) + movq (%edx), %mm1 C new src high qword +L(start_src_aligned): + + + leal -4(%edi,%eax,4), %edi C &dst[size-2] + psllq %mm6, %mm5 + + testl $4, %edi + psrlq $32, %mm5 C return value + + jz L(start_dst_aligned) + + + C dst isn't aligned, subtract 4 bytes to make it so, and pretend the + C shift is 32 bits extra. High limb of dst (marked xxx) handled + C here separately. 
+ C + C source %edx + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest %edi + C +-------+-------+-------+-- + C | xxx | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + psllq %mm6, %mm1 + addl $32, %ecx C shift+32 + + psrlq $32, %mm1 + + movd %mm1, 4(%edi) + movq %mm0, %mm1 + subl $4, %edi + + movd %ecx, %mm6 C new lshift +L(start_dst_aligned): + + decl %eax C size-2, two last limbs handled at end + movq %mm1, %mm2 C copy of src high qword + negl %ecx + + andl $-2, %eax C round size down to even + addl $64, %ecx + + movl %eax, %ebx + negl %eax + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C rshift = 64-lshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax size (for use at end) + C ebx loop counter + C ecx rshift + C edx src + C esi computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating, mm2 first) + C mm2 / + C mm6 lshift + C mm7 rshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 - 8)) + + movq disp0(%edx), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psllq %mm6, %mm1 + + movq %mm0, %mm2 + psrlq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + subl $UNROLL_BYTES, %edx + subl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + + +define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))') + +L(end): + testb $1, %al + movl SAVE_EBX, %ebx + psllq %mm6, %mm2 C wanted left shifted in all cases below + + movd %mm5, %eax + + movl SAVE_ESI, %esi + jz L(end_even) + + +L(end_odd): + + C Size odd, destination was aligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+-------+ + C | written | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. + C + C source edx+8 edx+4 + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at (%edi), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. 
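All the alignment cases being juggled here implement one simple operation; only the MMX packaging differs. For orientation, a behavioural C sketch, assuming 32-bit limbs and 1 <= shift <= 31 (ref_lshift is an illustrative name, and mpn_rshift later in this directory is the mirror image working from the low end up):

    #include <stdint.h>

    /* Shift src[0..size-1] left by shift bits into dst and return the bits
       shifted out of the top limb.  Working from the high limb down keeps it
       safe when dst overlaps src from above. */
    static uint32_t
    ref_lshift (uint32_t *dst, const uint32_t *src, long size, unsigned shift)
    {
      uint32_t retval = src[size - 1] >> (32 - shift);
      long i;

      for (i = size - 1; i > 0; i--)
        dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
      dst[0] = src[0] << shift;
      return retval;
    }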
+ + movd disp(4) (%edx), %mm0 + testb $32, %cl + + movq %mm0, %mm1 + psllq $32, %mm0 + + psrlq %mm7, %mm0 + psllq %mm6, %mm1 + + por %mm2, %mm0 + + movq %mm0, disp(0) (%edi) + jz L(end_odd_unaligned) + movd %mm1, disp(-4) (%edi) +L(end_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi + C --+---------------+---------------+ + C | written | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source edx+8 + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edi+4 + C --+---------------+-------+ + C | written | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movq for the aligned case overwrites the movd for the + C unaligned case. + + movq %mm2, %mm0 + psrlq $32, %mm2 + + testb $32, %cl + movd %mm2, disp(4) (%edi) + + jz L(end_even_unaligned) + movq %mm0, disp(0) (%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm new file mode 100644 index 0000000000..545ca56ddf --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm @@ -0,0 +1,457 @@ +dnl AMD K7 mpn_mod_1 -- mpn by limb remainder. +dnl +dnl K7: 17.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here is the same as mpn_divrem_1, but with the quotient +C discarded. See mpn/x86/k7/mmx/divrem_1.c for some comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. +dnl +dnl The inverse takes about 50 cycles to calculate, but after that the +dnl multiply is 17 c/l versus division at 41 c/l. +dnl +dnl Using mul or div is about the same speed at 3 limbs, so the threshold +dnl is set to 4 to get the smaller div code used at 3. 
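Since mpn_mod_1 is mpn_divrem_1 with the quotient discarded, the reference picture is correspondingly simpler. A hedged C sketch of the plain divl path, assuming 32-bit limbs (ref_mod_1 is an illustrative name, not the generic routine):

    #include <stdint.h>

    static uint32_t
    ref_mod_1 (const uint32_t *src, long size, uint32_t divisor)
    {
      uint32_t rem = 0;
      long i;

      for (i = size - 1; i >= 0; i--)
        {
          uint64_t n = ((uint64_t) rem << 32) | src[i];
          rem = (uint32_t) (n % divisor);
        }
      return rem;
    }

mpn_mod_1c starts rem at the given carry instead of 0.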
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(32) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(32) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + +C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations, +C but it's already fast and compact, and there's nothing to gain by +C expanding it out. +C +C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp. + + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl PARAM_DIVISOR + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + movl %edx, %eax + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + bsrl %ebp, %eax C 31-l + + movl %ebx, SAVE_EBX + leal -4(%esi), %ebx + + movl %ebx, VAR_SRC_STOP + movl %edi, SAVE_EDI + + movl %ecx, %ebx C size + movl $31, %ecx + + movl %edx, %edi C carry + movl $-1, %edx + + C + + xorl %eax, %ecx C l + incl %eax C 32-l + + shll %cl, %ebp C d normalized + movl %ecx, VAR_NORM + + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. +C Unfortunately this doesn't get the code down to the theoretical 16 c/l. +C +C There's four dummy instructions in the loop, all of which are necessary +C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed, +C or changed from load to store or vice versa. 
They're not completely +C random, since they correspond to what mpn_divrem_1 has, but there's no +C obvious reason why they're necessary. Presumably they induce something +C good in the out of order execution, perhaps through some load/store +C ordering and/or decoding effects. +C +C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On +C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at +C about 13.5 c/l. + + ALIGN(32) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + movl PARAM_SIZE, %ebx C dummy + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + movl %ecx, PARAM_SIZE C dummy + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + nop C dummy + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + leal 0(%ecx), %ecx C dummy + + C + + C + + subl %eax, %esi + movl VAR_SRC_STOP, %eax + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + cmpl %eax, %ecx + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + subl %eax, %esi + + C + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + movd %mm0, %esi + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + cmpl $0x80000000, %esi C n1 as 0=c, 1=nc + movl %edi, %eax C n2 + + leal (%ebp,%esi), %ebx + cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow + sbbl $-1, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + subl %eax, %esi + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%eax), %edx + movl 
SAVE_EBP, %ebp + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C ebp divisor + + movl VAR_SRC_STOP, %edx + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(inverse_top) + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm new file mode 100644 index 0000000000..fa7c8c04a5 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/popham.asm @@ -0,0 +1,239 @@ +dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on +dnl FreeBSD 3.3 and 3.4 doesn't recognise it. + +define(psadbw_mm4_mm0, +`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon', + `HAVE_TARGET_CPU_pentium3'),1, + `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0', + +`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only +') C this works enough for the sum of bytes done below, making it + C possible to test on an older cpu + leal -8(%esp), %esp + movq %mm4, (%esp) + movq %mm0, %mm4 +forloop(i,1,7, +` psrlq $ 8, %mm4 + paddb %mm4, %mm0 +') + pushl $ 0 + pushl $ 0xFF + pand (%esp), %mm0 + movq 8(%esp), %mm4 + leal 16(%esp), %esp +')') + + +C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); +C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); +C +C The code here is almost certainly not optimal, but is already a 3x speedup +C over the generic C code. The main improvement would be to interleave +C processing of two qwords in the loop so as to fully exploit the available +C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). +C +C The loop is based on the example "Efficient 64-bit population count using +C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, +C page 158 of rev E (reference in mpn/x86/k7/README). 
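The per-qword reduction the loop performs is the classic bit-slicing population count, with psadbw standing in for the final sum of bytes. In portable C the same reduction is (a hedged sketch; the closing multiply is just one convenient way to add the eight byte counts):

    #include <stdint.h>

    static unsigned
    popcount64 (uint64_t x)
    {
      x -= (x >> 1) & 0x5555555555555555ULL;           /* 2-bit counts */
      x = (x & 0x3333333333333333ULL)
        + ((x >> 2) & 0x3333333333333333ULL);          /* 4-bit counts */
      x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;      /* byte counts */
      return (unsigned) ((x * 0x0101010101010101ULL) >> 56);  /* sum of bytes */
    }

mpn_hamdist applies the same reduction to src[i] XOR src2[i], which is exactly what the pxor ahead of L(loaded) below arranges.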
+ +ifdef(`OPERATION_popcount',, +`ifdef(`OPERATION_hamdist',, +`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined +')')') + +define(HAM, +m4_assert_numargs(1) +`ifdef(`OPERATION_hamdist',`$1')') + +define(POP, +m4_assert_numargs(1) +`ifdef(`OPERATION_popcount',`$1')') + +HAM(` +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC2, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_hamdist) +') +POP(` +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) +define(M4_function,mpn_popcount) +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + + +ifdef(`PIC',,` + dnl non-PIC + + DATA + ALIGN(8) + +define(LS, +m4_assert_numargs(1) +`LF(M4_function,`$1')') + +LS(rodata_AAAAAAAAAAAAAAAA): + .long 0xAAAAAAAA + .long 0xAAAAAAAA + +LS(rodata_3333333333333333): + .long 0x33333333 + .long 0x33333333 + +LS(rodata_0F0F0F0F0F0F0F0F): + .long 0x0F0F0F0F + .long 0x0F0F0F0F +') + + .text + ALIGN(32) + +PROLOGUE(M4_function) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + orl %ecx, %ecx + jz L(zero) + +ifdef(`PIC',` + movl $0xAAAAAAAA, %eax + movl $0x33333333, %edx + + movd %eax, %mm7 + movd %edx, %mm6 + + movl $0x0F0F0F0F, %eax + + punpckldq %mm7, %mm7 + punpckldq %mm6, %mm6 + + movd %eax, %mm5 + movd %edx, %mm4 + + punpckldq %mm5, %mm5 + +',` + movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7 + movq LS(rodata_3333333333333333), %mm6 + movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5 +') + pxor %mm4, %mm4 + +define(REG_AAAAAAAAAAAAAAAA,%mm7) +define(REG_3333333333333333,%mm6) +define(REG_0F0F0F0F0F0F0F0F,%mm5) +define(REG_0000000000000000,%mm4) + + + movl PARAM_SRC, %eax +HAM(` movl PARAM_SRC2, %edx') + + pxor %mm2, %mm2 C total + + shrl %ecx + jnc L(top) + + movd (%eax,%ecx,8), %mm1 + +HAM(` movd 0(%edx,%ecx,8), %mm0 + pxor %mm0, %mm1 +') + orl %ecx, %ecx + jmp L(loaded) + + + ALIGN(16) +L(top): + C eax src + C ebx + C ecx counter, qwords, decrementing + C edx [hamdist] src2 + C + C mm0 (scratch) + C mm1 (scratch) + C mm2 total (low dword) + C mm3 + C mm4 \ + C mm5 | special constants + C mm6 | + C mm7 / + + movq -8(%eax,%ecx,8), %mm1 + +HAM(` pxor -8(%edx,%ecx,8), %mm1') + decl %ecx + +L(loaded): + movq %mm1, %mm0 + pand REG_AAAAAAAAAAAAAAAA, %mm1 + + psrlq $1, %mm1 + + psubd %mm1, %mm0 C bit pairs + + + movq %mm0, %mm1 + psrlq $2, %mm0 + + pand REG_3333333333333333, %mm0 + pand REG_3333333333333333, %mm1 + + paddd %mm1, %mm0 C nibbles + + + movq %mm0, %mm1 + psrlq $4, %mm0 + + pand REG_0F0F0F0F0F0F0F0F, %mm0 + pand REG_0F0F0F0F0F0F0F0F, %mm1 + + paddd %mm1, %mm0 C bytes + + + psadbw_mm4_mm0 + + paddd %mm0, %mm2 C add to total + jnz L(top) + + + movd %mm2, %eax + emms + ret + + +L(zero): + movl $0, %eax + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm new file mode 100644 index 0000000000..abb546cd5b --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mmx/rshift.asm @@ -0,0 +1,471 @@ +dnl AMD K7 mpn_rshift -- mpn right shift. +dnl +dnl K7: 1.21 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 4 1.51 +dnl 8 1.26 +dnl 16 1.21 +dnl 32 1.2 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. The bits shifted out at the right are +C the return value. +C +C This code uses 64-bit MMX operations, which makes it possible to handle +C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer +C code, on the other hand, suffers from shrd being a vector path decode and +C running at 3 cycles back-to-back. +C +C Full speed depends on source and destination being aligned, and some hairy +C setups and finish-ups are done to arrange this for the loop. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 10) +',` +deflit(UNROLL_THRESHOLD, 10) +') + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EDI, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EBX, -12) +deflit(SAVE_SIZE, 12) + + .text + ALIGN(32) + +PROLOGUE(mpn_rshift) +deflit(`FRAME',0) + + movl PARAM_SIZE, %eax + movl PARAM_SRC, %edx + subl $SAVE_SIZE, %esp +deflit(`FRAME',SAVE_SIZE) + + movl PARAM_SHIFT, %ecx + movl %edi, SAVE_EDI + + movl PARAM_DST, %edi + decl %eax + jnz L(more_than_one_limb) + + movl (%edx), %edx C src limb + + shrdl( %cl, %edx, %eax) C eax was decremented to zero + + shrl %cl, %edx + + movl %edx, (%edi) C dst limb + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(more_than_one_limb): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + + movd PARAM_SHIFT, %mm6 C rshift + movd (%edx), %mm5 C src low limb + cmp $UNROLL_THRESHOLD-1, %eax + + jae L(unroll) + leal (%edx,%eax,4), %edx C &src[size-1] + leal -4(%edi,%eax,4), %edi C &dst[size-2] + + movd (%edx), %mm4 C src high limb + negl %eax + + +L(simple_top): + C eax loop counter, limbs, negative + C ebx + C ecx shift + C edx carry + C edx &src[size-1] + C edi &dst[size-2] + C ebp + C + C mm0 scratch + C mm4 src high limb + C mm5 src low limb + C mm6 shift + + movq (%edx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edi,%eax,4) + jnz L(simple_top) + + + psllq $32, %mm5 + psrlq %mm6, %mm4 + + psrlq %mm6, %mm5 + movd %mm4, 4(%edi) C dst high limb + + movd %mm5, %eax C return value + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll): + C eax size-1 + C ebx + C ecx shift + C edx src + C esi + C edi dst + C ebp + C + C mm5 src low limb + C mm6 rshift + + testb $4, %dl + movl %esi, SAVE_ESI + movl %ebx, SAVE_EBX + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. 
+ C + C source edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edi + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%edx), %mm0 C src low two limbs + addl $4, %edx + movl %eax, PARAM_SIZE C size-1 + + addl $4, %edi + decl %eax C size-2 is new size-1 + + psrlq %mm6, %mm0 + movl %edi, PARAM_DST C new dst + + movd %mm0, -4(%edi) +L(start_src_aligned): + + + movq (%edx), %mm1 C src low two limbs + decl %eax C size-2, two last limbs handled at end + testl $4, %edi + + psrlq %mm6, %mm5 + jz L(start_dst_aligned) + + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here separately. + C + C source edx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edi + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + psrlq %mm6, %mm1 + addl $32, %ecx C shift+32 + + movd %mm1, (%edi) + movq %mm0, %mm1 + addl $4, %edi C new dst + + movd %ecx, %mm6 +L(start_dst_aligned): + + + movq %mm1, %mm2 C copy of src low two limbs + negl %ecx + andl $-2, %eax C round size down to even + + movl %eax, %ebx + negl %eax + addl $64, %ecx + + andl $UNROLL_MASK, %eax + decl %ebx + + shll %eax + + movd %ecx, %mm7 C lshift = 64-rshift + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(entry) (%eax,%eax,4), %esi + negl %eax +') + shrl $UNROLL_LOG2, %ebx C loop counter + + leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx + leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi + movl PARAM_SIZE, %eax C for use at end + + jmp *%esi + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%eax,%eax,4), %esi + addl $L(entry)-L(here), %esi + addl (%esp), %esi + negl %eax + + ret +') + + +C ----------------------------------------------------------------------------- + ALIGN(64) +L(top): + C eax size, for use at end + C ebx loop counter + C ecx lshift + C edx src + C esi was computed jump + C edi dst + C ebp + C + C mm0 scratch + C mm1 \ carry (alternating) + C mm2 / + C mm6 rshift + C mm7 lshift + C + C 10 code bytes/limb + C + C The two chunks differ in whether mm1 or mm2 hold the carry. + C The computed jump puts the initial carry in both mm1 and mm2. + +L(entry): +deflit(CHUNK_COUNT, 4) +forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 8)) + + movq disp0(%edx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + por %mm2, %mm0 + movq %mm0, disp0(%edi) + + + movq disp1(%edx), %mm0 + psrlq %mm6, %mm1 + + movq %mm0, %mm2 + psllq %mm7, %mm0 + + por %mm1, %mm0 + movq %mm0, disp1(%edi) +') + + addl $UNROLL_BYTES, %edx + addl $UNROLL_BYTES, %edi + decl %ebx + + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 8)) + + testb $1, %al + psrlq %mm6, %mm2 C wanted rshifted in all cases below + movl SAVE_ESI, %esi + + movd %mm5, %eax C return value + + movl SAVE_EBX, %ebx + jz L(end_even) + + + C Size odd, destination was aligned. + C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +-------+---------------+---------------+-- + C | | | written | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size odd, destination was unaligned. 
+ C + C source + C edx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | written | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword to store, and in the aligned case there's + C a further extra limb of dst to be formed. + + + movd disp0(%edx), %mm0 + movq %mm0, %mm1 + + psllq %mm7, %mm0 + testb $32, %cl + + por %mm2, %mm0 + psrlq %mm6, %mm1 + + movq %mm0, disp0(%edi) + jz L(finish_odd_unaligned) + + movd %mm1, disp1(%edi) +L(finish_odd_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + + +L(end_even): + + C Size even, destination was aligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C Size even, destination was unaligned. + C + C source + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edi + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is the same data as the movq for + C the aligned case, it's just a choice between whether one or two + C limbs should be written. + + + testb $32, %cl + movd %mm2, disp0(%edi) + + jz L(end_even_unaligned) + + movq %mm2, disp0(%edi) +L(end_even_unaligned): + + movl SAVE_EDI, %edi + addl $SAVE_SIZE, %esp + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm new file mode 100644 index 0000000000..07f7085b10 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mul_1.asm @@ -0,0 +1,265 @@ +dnl AMD K7 mpn_mul_1 -- mpn by limb multiply. +dnl +dnl K7: 3.4 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7: UNROLL_COUNT cycles/limb +dnl 8 3.9 +dnl 16 3.4 +dnl 32 3.4 +dnl 64 3.35 +dnl Maximum possible with the current code is 64. + +deflit(UNROLL_COUNT, 16) + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); +C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier, mp_limb_t carry); +C +C Multiply src,size by mult and store the result in dst,size. +C Return the carry limb from the top of the result. +C +C mpn_mul_1c() accepts an initial carry for the calculation, it's added into +C the low limb of the destination. 
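In C the whole operation is one carry-propagating loop; everything below is about making that loop issue well on the K7. A hedged sketch, assuming 32-bit limbs (ref_mul_1 is an illustrative name):

    #include <stdint.h>

    static uint32_t
    ref_mul_1 (uint32_t *dst, const uint32_t *src, long size, uint32_t multiplier)
    {
      uint32_t carry = 0;      /* mpn_mul_1c would start from the given carry */
      long i;

      for (i = 0; i < size; i++)
        {
          uint64_t p = (uint64_t) src[i] * multiplier + carry;
          dst[i] = (uint32_t) p;              /* low limb of the product */
          carry = (uint32_t) (p >> 32);       /* high limb becomes the carry */
        }
      return carry;
    }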
+C +C Variations on the unrolled loop have been tried, with the current +C registers or with the counter on the stack to free up ecx. The current +C code is the fastest found. +C +C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)" +C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code +C with this change can be tested on sizes of the form UNROLL_COUNT*n+1 +C without having to change the computed jump. There's obviously something +C fishy going on, perhaps with what execution units the mul needs. + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBP, -4) +defframe(SAVE_EDI, -8) +defframe(SAVE_ESI, -12) +defframe(SAVE_EBX, -16) +deflit(STACK_SPACE, 16) + +dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 7) +',` +deflit(UNROLL_THRESHOLD, 5) +') + + .text + ALIGN(32) +PROLOGUE(mpn_mul_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + jmp LF(mpn_mul_1,start_nc) +EPILOGUE() + + +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + xorl %edx, %edx C initial carry +L(start_nc): + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME', STACK_SPACE) + + movl %edi, SAVE_EDI + movl %ebx, SAVE_EBX + movl %edx, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_DST, %edi + movl %ebp, SAVE_EBP + jae L(unroll) + + leal (%esi,%ecx,4), %esi + leal (%edi,%ecx,4), %edi + negl %ecx + + movl PARAM_MULTIPLIER, %ebp + +L(simple): + C eax scratch + C ebx carry + C ecx counter (negative) + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(simple) + + movl %ebx, %eax + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +C ----------------------------------------------------------------------------- +C The mov to load the next source limb is done well ahead of the mul, this +C is necessary for full speed. It leads to one limb handled separately +C after the loop. +C +C When unrolling to 32 or more, an offset of +4 is used on the src pointer, +C to avoid having an 0x80 displacement in the code for the last limb in the +C unrolled loop. This is for a fair comparison between 16 and 32 unrolling. 
+ +ifelse(eval(UNROLL_COUNT >= 32),1,` +deflit(SRC_OFFSET,4) +',` +deflit(SRC_OFFSET,) +') + + C this is offset 0x62, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp +deflit(`FRAME', STACK_SPACE) + + leal -1(%ecx), %edx C one limb handled at end + leal -2(%ecx), %ecx C and ecx is one less than edx + movl %ebp, SAVE_EBP + + negl %edx + shrl $UNROLL_LOG2, %ecx C unrolled loop counter + movl (%esi), %eax C src low limb + + andl $UNROLL_MASK, %edx + movl PARAM_DST, %edi + + movl %edx, %ebp + shll $4, %edx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(add_eip_to_edx) +L(here): +',` + leal L(entry) (%edx,%ebp), %edx +') + negl %ebp + + leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi + movl PARAM_MULTIPLIER, %ebp + + jmp *%edx + + +ifdef(`PIC',` +L(add_eip_to_edx): + C See README.family about old gas bugs + leal (%edx,%ebp), %edx + addl $L(entry)-L(here), %edx + addl (%esp), %edx + ret +') + + +C ---------------------------------------------------------------------------- + ALIGN(32) +L(top): + C eax next src limb + C ebx carry + C ecx counter + C edx scratch + C esi src+4 + C edi dst + C ebp multiplier + C + C 17 code bytes per limb processed + +L(entry): +forloop(i, 0, UNROLL_COUNT-1, ` + deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0))) + + mull %ebp + + addl %eax, %ebx +Zdisp( movl, disp_src,(%esi), %eax) +Zdisp( movl, %ebx, disp_dst,(%edi)) + + movl $0, %ebx + adcl %edx, %ebx +') + + decl %ecx + + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + jns L(top) + + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) + + mull %ebp + + addl %eax, %ebx + movl $0, %eax + movl SAVE_ESI, %esi + + movl %ebx, disp0(%edi) + movl SAVE_EBX, %ebx + movl SAVE_EDI, %edi + + adcl %edx, %eax + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm new file mode 100644 index 0000000000..c4be62e633 --- /dev/null +++ b/rts/gmp/mpn/x86/k7/mul_basecase.asm @@ -0,0 +1,593 @@ +dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers. +dnl +dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16 +dnl limbs/loop unrolling). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl K7 UNROLL_COUNT cycles/product (at around 20x20) +dnl 8 4.67 +dnl 16 4.59 +dnl 32 4.42 +dnl Maximum possible with the current code is 32. 
+dnl +dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get +dnl done with a straight run through a block of code, no inner loop. Using +dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache. + +deflit(UNROLL_COUNT, 32) + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C Calculate xp,xsize multiplied by yp,ysize, storing the result in +C wp,xsize+ysize. +C +C This routine is essentially the same as mpn/generic/mul_basecase.c, but +C it's faster because it does most of the mpn_addmul_1() startup +C calculations only once. The saving is 15-25% on typical sizes coming from +C the Karatsuba multiply code. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + movl PARAM_XSIZE, %ecx + movl PARAM_YP, %eax + + movl PARAM_XP, %edx + movl (%eax), %eax C yp low limb + + cmpl $2, %ecx + ja L(xsize_more_than_two) + je L(two_by_something) + + + C one limb by one limb + + mull (%edx) + + movl PARAM_WP, %ecx + movl %eax, (%ecx) + movl %edx, 4(%ecx) + ret + + +C ----------------------------------------------------------------------------- +L(two_by_something): +deflit(`FRAME',0) + decl PARAM_YSIZE + pushl %ebx defframe_pushl(`SAVE_EBX') + movl %eax, %ecx C yp low limb + + movl PARAM_WP, %ebx + pushl %esi defframe_pushl(`SAVE_ESI') + movl %edx, %esi C xp + + movl (%edx), %eax C xp low limb + jnz L(two_by_two) + + + C two limbs by one limb + + mull %ecx + + movl %eax, (%ebx) + movl 4(%esi), %eax + movl %edx, %esi C carry + + mull %ecx + + addl %eax, %esi + + movl %esi, 4(%ebx) + movl SAVE_ESI, %esi + + adcl $0, %edx + + movl %edx, 8(%ebx) + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C Could load yp earlier into another register. + + ALIGN(16) +L(two_by_two): + C eax xp low limb + C ebx wp + C ecx yp low limb + C edx + C esi xp + C edi + C ebp + +dnl FRAME carries on from previous + + mull %ecx C xp[0] * yp[0] + + push %edi defframe_pushl(`SAVE_EDI') + movl %edx, %edi C carry, for wp[1] + + movl %eax, (%ebx) + movl 4(%esi), %eax + + mull %ecx C xp[1] * yp[0] + + addl %eax, %edi + movl PARAM_YP, %ecx + + adcl $0, %edx + movl 4(%ecx), %ecx C yp[1] + movl %edi, 4(%ebx) + + movl 4(%esi), %eax C xp[1] + movl %edx, %edi C carry, for wp[2] + + mull %ecx C xp[1] * yp[1] + + addl %eax, %edi + + adcl $0, %edx + movl (%esi), %eax C xp[0] + + movl %edx, %esi C carry, for wp[3] + + mull %ecx C xp[0] * yp[1] + + addl %eax, 4(%ebx) + adcl %edx, %edi + movl %edi, 8(%ebx) + + adcl $0, %esi + movl SAVE_EDI, %edi + movl %esi, 12(%ebx) + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(xsize_more_than_two): + +C The first limb of yp is processed with a simple mpn_mul_1 style loop +C inline. Unrolling this doesn't seem worthwhile since it's only run once +C (whereas the addmul below is run ysize-1 many times). A call to the +C actual mpn_mul_1 will be slowed down by the call and parameter pushing and +C popping, and doesn't seem likely to be worthwhile on the typical 13-26 +C limb operations the Karatsuba code calls here with. 
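Structurally this is the schoolbook product: one mul_1-style pass for yp[0], then an addmul_1-style pass per remaining yp limb, each accumulating into wp at an increasing offset. A hedged C sketch of that structure, assuming 32-bit limbs (the ref_* names are illustrative; zeroing wp first lets one helper stand in for both passes, whereas the code below writes the first pass directly):

    #include <stdint.h>

    static uint32_t
    ref_addmul_1 (uint32_t *dst, const uint32_t *src, long size, uint32_t multiplier)
    {
      uint32_t carry = 0;
      long i;

      for (i = 0; i < size; i++)
        {
          uint64_t p = (uint64_t) src[i] * multiplier + dst[i] + carry;
          dst[i] = (uint32_t) p;
          carry = (uint32_t) (p >> 32);
        }
      return carry;
    }

    static void
    ref_mul_basecase (uint32_t *wp, const uint32_t *xp, long xsize,
                      const uint32_t *yp, long ysize)
    {
      long i, j;

      for (i = 0; i < xsize; i++)     /* clear the low half of the product area */
        wp[i] = 0;
      for (j = 0; j < ysize; j++)     /* one cross-product pass per yp limb */
        wp[xsize + j] = ref_addmul_1 (wp + j, xp, xsize, yp[j]);
    }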
+ + C eax yp[0] + C ebx + C ecx xsize + C edx xp + C esi + C edi + C ebp + +dnl FRAME doesn't carry on from previous, no pushes yet here +defframe(`SAVE_EBX',-4) +defframe(`SAVE_ESI',-8) +defframe(`SAVE_EDI',-12) +defframe(`SAVE_EBP',-16) +deflit(`FRAME',0) + + subl $16, %esp +deflit(`FRAME',16) + + movl %edi, SAVE_EDI + movl PARAM_WP, %edi + + movl %ebx, SAVE_EBX + movl %ebp, SAVE_EBP + movl %eax, %ebp + + movl %esi, SAVE_ESI + xorl %ebx, %ebx + leal (%edx,%ecx,4), %esi C xp end + + leal (%edi,%ecx,4), %edi C wp end of mul1 + negl %ecx + + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi xp end + C edi wp end of mul1 + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul1) + + + movl PARAM_YSIZE, %edx + movl PARAM_XSIZE, %ecx + + movl %ebx, (%edi) C final carry + decl %edx + + jnz L(ysize_more_than_one) + + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + + movl SAVE_EBP, %ebp + movl SAVE_ESI, %esi + addl $FRAME, %esp + + ret + + +L(ysize_more_than_one): + cmpl $UNROLL_THRESHOLD, %ecx + movl PARAM_YP, %eax + + jae L(unroll) + + +C ----------------------------------------------------------------------------- + C simple addmul looping + C + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + leal 4(%eax,%edx,4), %ebp C yp end + negl %ecx + negl %edx + + movl (%esi,%ecx,4), %eax C xp low limb + movl %edx, PARAM_YSIZE C -(ysize-1) + incl %ecx + + xorl %ebx, %ebx C initial carry + movl %ecx, PARAM_XSIZE C -(xsize-1) + movl %ebp, PARAM_YP + + movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier + jmp L(simple_outer_entry) + + + C this is offset 0x121 so close enough to aligned +L(simple_outer_top): + C ebp ysize counter, negative + + movl PARAM_YP, %edx + movl PARAM_XSIZE, %ecx C -(xsize-1) + xorl %ebx, %ebx C carry + + movl %ebp, PARAM_YSIZE + addl $4, %edi C next position in wp + + movl (%edx,%ebp,4), %ebp C yp limb - multiplier + movl -4(%esi,%ecx,4), %eax C xp low limb + + +L(simple_outer_entry): + +L(simple_inner): + C eax xp limb + C ebx carry limb + C ecx loop counter (negative) + C edx scratch + C esi xp end + C edi wp end + C ebp multiplier + + mull %ebp + + addl %eax, %ebx + adcl $0, %edx + + addl %ebx, (%edi,%ecx,4) + movl (%esi,%ecx,4), %eax + adcl $0, %edx + + incl %ecx + movl %edx, %ebx + jnz L(simple_inner) + + + mull %ebp + + movl PARAM_YSIZE, %ebp + addl %eax, %ebx + + adcl $0, %edx + addl %ebx, (%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, 4(%edi) + jnz L(simple_outer_top) + + + movl SAVE_EBX, %ebx + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +C +C The unrolled loop is the same as in mpn_addmul_1(), see that code for some +C comments. +C +C VAR_ADJUST is the negative of how many limbs the leals in the inner loop +C increment xp and wp. This is used to adjust back xp and wp, and rshifted +C to given an initial VAR_COUNTER at the top of the outer loop. +C +C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT +C up to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled loop. +C +C VAR_XP_LOW is the least significant limb of xp, which is needed at the +C start of the unrolled loop. +C +C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, +C inclusive. 
+C +C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be +C added to give the location of the next limb of yp, which is the multiplier +C in the unrolled loop. +C +C The trick with VAR_ADJUST means it's only necessary to do one fetch in the +C outer loop to take care of xp, wp and the inner loop counter. + +defframe(VAR_COUNTER, -20) +defframe(VAR_ADJUST, -24) +defframe(VAR_JMP, -28) +defframe(VAR_XP_LOW, -32) +deflit(VAR_EXTRA_SPACE, 16) + + +L(unroll): + C eax yp + C ebx + C ecx xsize + C edx ysize-1 + C esi xp end + C edi wp end of mul1 + C ebp + + movl PARAM_XP, %esi + movl 4(%eax), %ebp C multiplier (yp second limb) + leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing + + movl PARAM_WP, %edi + movl %eax, PARAM_YP + negl %edx + + movl %edx, PARAM_YSIZE + leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1 + decl %ecx C xsize-1 + + movl (%esi), %eax C xp low limb + andl $-UNROLL_MASK-1, %ebx + negl %ecx + + subl $VAR_EXTRA_SPACE, %esp +deflit(`FRAME',16+VAR_EXTRA_SPACE) + negl %ebx + andl $UNROLL_MASK, %ecx + + movl %ebx, VAR_ADJUST + movl %ecx, %edx + shll $4, %ecx + + sarl $UNROLL_LOG2, %ebx + + C 17 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(unroll_here): +',` + leal L(unroll_entry) (%ecx,%edx,1), %ecx +') + negl %edx + + movl %eax, VAR_XP_LOW + movl %ecx, VAR_JMP + leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling, + leal 4(%esi,%edx,4), %esi C and start at second limb + jmp L(unroll_outer_entry) + + +ifdef(`PIC',` +L(pic_calc): + C See README.family about old gas bugs + leal (%ecx,%edx,1), %ecx + addl $L(unroll_entry)-L(unroll_here), %ecx + addl (%esp), %ecx + ret +') + + +C -------------------------------------------------------------------------- + ALIGN(32) +L(unroll_outer_top): + C ebp ysize counter, negative + + movl VAR_ADJUST, %ebx + movl PARAM_YP, %edx + + movl VAR_XP_LOW, %eax + movl %ebp, PARAM_YSIZE C store incremented ysize counter + + leal 4(%edi,%ebx,4), %edi + leal (%esi,%ebx,4), %esi + sarl $UNROLL_LOG2, %ebx + + movl (%edx,%ebp,4), %ebp C yp next multiplier + movl VAR_JMP, %ecx + +L(unroll_outer_entry): + mull %ebp + + testb $1, %cl C and clear carry bit + movl %ebx, VAR_COUNTER + movl $0, %ebx + + movl $0, %ecx + cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb + cmovnz( %eax, %ebx) + + C Extra fetch of VAR_JMP is bad, but registers are tight + jmp *VAR_JMP + + +C ----------------------------------------------------------------------------- + ALIGN(32) +L(unroll_top): + C eax xp limb + C ebx carry high + C ecx carry low + C edx scratch + C esi xp+8 + C edi wp + C ebp yp multiplier limb + C + C VAR_COUNTER loop counter, negative + C + C 17 bytes each limb + +L(unroll_entry): + +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp0,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + + + movl disp1(%esi), %eax + adcl %edx, %ecx + + mull %ebp + + addl %ebx, disp1(%edi) + movl $0, %ebx + + adcl %eax, %ecx +') + + + incl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + leal UNROLL_BYTES(%edi), %edi + + jnz L(unroll_top) + + + C eax + C ebx zero + C ecx low + C edx high + C esi + C edi wp, pointing at second last limb) + C ebp + C + C carry flag to be added to high + +deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) +deflit(`disp1', eval(disp0-0 + 4)) + + movl PARAM_YSIZE, %ebp + adcl $0, %edx 
+ addl %ecx, disp0(%edi) + + adcl $0, %edx + incl %ebp + + movl %edx, disp1(%edi) + jnz L(unroll_outer_top) + + + movl SAVE_ESI, %esi + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + movl SAVE_EBX, %ebx + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm new file mode 100644 index 0000000000..84861ea66b --- /dev/null +++ b/rts/gmp/mpn/x86/k7/sqr_basecase.asm @@ -0,0 +1,627 @@ +dnl AMD K7 mpn_sqr_basecase -- square an mpn number. +dnl +dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product +dnl (measured on the speed difference between 25 and 50 limbs, which is +dnl roughly the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for +dnl some comments. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes, +C which is quite a bit, but is considered good value since squares big +C enough to use most of the code will be spending quite a few cycles in it. + + +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl PARAM_SRC, %eax + cmpl $2, %ecx + + movl PARAM_DST, %edx + je L(two_limbs) + ja L(three_or_more) + + +C------------------------------------------------------------------------------ +C one limb only + C eax src + C ecx size + C edx dst + + movl (%eax), %eax + movl %edx, %ecx + + mull %eax + + movl %edx, 4(%ecx) + movl %eax, (%ecx) + ret + + +C------------------------------------------------------------------------------ +C +C Using the read/modify/write "add"s seems to be faster than saving and +C restoring registers. Perhaps the loads for the first set hide under the +C mul latency and the second gets store to load forwarding. 
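C
C (Sketch for orientation, not in the original source.)  The 2x2 case
C below forms src[0]^2 and src[1]^2 in place, then adds the single cross
C product src[0]*src[1] in twice, which is how its doubling is achieved:
C
C	umul_ppmm (dst[1], dst[0], src[0], src[0]);
C	umul_ppmm (dst[3], dst[2], src[1], src[1]);
C	umul_ppmm (hi, lo, src[0], src[1]);
C	/* add hi:lo into dst[1],dst[2],dst[3] twice, with carries */
C
C where umul_ppmm(h,l,a,b) is the usual longlong.h double-limb multiply
C and hi,lo are illustrative temporaries.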
+ + ALIGN(16) +L(two_limbs): + C eax src + C ebx + C ecx size + C edx dst +deflit(`FRAME',0) + + pushl %ebx FRAME_pushl() + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%ebx), %eax + + movl %edx, 4(%ecx) C dst[1] + + mull %eax C src[1]^2 + + movl %eax, 8(%ecx) C dst[2] + movl (%ebx), %eax + + movl %edx, 12(%ecx) C dst[3] + + mull 4(%ebx) C src[0]*src[1] + + popl %ebx + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + addl %eax, 4(%ecx) + adcl %edx, 8(%ecx) + adcl $0, 12(%ecx) + ASSERT(nc) + + ret + + +C------------------------------------------------------------------------------ +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +L(three_or_more): + subl $STACK_SPACE, %esp + cmpl $4, %ecx + jae L(four_or_more) +deflit(`FRAME',STACK_SPACE) + + +C------------------------------------------------------------------------------ +C Three limbs +C +C Writing out the loads and stores separately at the end of this code comes +C out about 10 cycles faster than using adcls to memory. + + C eax src + C ecx size + C edx dst + + movl %ebx, SAVE_EBX + movl %eax, %ebx C src + movl (%eax), %eax + + movl %edx, %ecx C dst + movl %esi, SAVE_ESI + movl %edi, SAVE_EDI + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl 4(%ebx), %eax + movl %edx, 4(%ecx) + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl 8(%ebx), %eax + movl %edx, 12(%ecx) + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl (%ebx), %eax + movl %edx, 20(%ecx) + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl (%ebx), %eax + movl %edx, %edi + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %ebp, SAVE_EBP + movl $0, %ebp + + movl 4(%ebx), %eax + adcl %edx, %ebp + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + adcl $0, %edx + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + movl 4(%ecx), %eax + + adcl %ebp, %ebp + + adcl %edx, %edx + + adcl $0, %ebx + addl %eax, %esi + movl 8(%ecx), %eax + + adcl %eax, %edi + movl 12(%ecx), %eax + movl %esi, 4(%ecx) + + adcl %eax, %ebp + movl 16(%ecx), %eax + movl %edi, 8(%ecx) + + movl SAVE_ESI, %esi + movl SAVE_EDI, %edi + + adcl %eax, %edx + movl 20(%ecx), %eax + movl %ebp, 12(%ecx) + + adcl %ebx, %eax + ASSERT(nc) + movl SAVE_EBX, %ebx + movl SAVE_EBP, %ebp + + movl %edx, 16(%ecx) + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + +C------------------------------------------------------------------------------ +L(four_or_more): + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. +C Further products are added in rather than stored. 
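C
C For orientation (a sketch, not part of the original source), the whole
C routine follows the usual three basecase squaring phases:
C
C	/* 1: off-diagonal products into dst[1..2*size-2] */
C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	for (i = 1; i < size-1; i++)
C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
C
C	/* 2: double them with a one bit left shift           */
C	/* 3: add the squares src[i]^2 along the diagonal     */
C
C The code below is phase 1 (with the bottom-right corner products done
C separately at L(corner)), and the lshift and diag loops at the end of
C the routine are phases 2 and 3.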
+ + C eax src + C ebx + C ecx size + C edx dst + C esi + C edi + C ebp + +defframe(`VAR_COUNTER',-20) +defframe(`VAR_JMP', -24) +deflit(EXTRA_STACK_SPACE, 8) + + movl %ebx, SAVE_EBX + movl %edi, SAVE_EDI + leal (%edx,%ecx,4), %edi C &dst[size] + + movl %esi, SAVE_ESI + movl %ebp, SAVE_EBP + leal (%eax,%ecx,4), %esi C &src[size] + + movl (%eax), %ebp C multiplier + movl $0, %ebx + decl %ecx + + negl %ecx + subl $EXTRA_STACK_SPACE, %esp +FRAME_subl_esp(EXTRA_STACK_SPACE) + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp multiplier + + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl %eax, (%edi,%ecx,4) + movl $0, %ebx + + adcl %edx, %ebx + incl %ecx + jnz L(mul_1) + + +C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two products, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as in mpn_addmul_1, see that routine for +C some comments. +C +C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. +C +C K7 does branch prediction on indirect jumps, which is bad since it's a +C different target each time. There seems no way to avoid this. + +dnl This value also hard coded in some shifts and adds +deflit(CODE_BYTES_PER_LIMB, 17) + +dnl With the unmodified &src[size] and &dst[size] pointers, the +dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT +dnl values up to 31, but above that an offset must be added to them. + +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>31),1, +eval((UNROLL_COUNT-31)*4), +0)) + +dnl Because the last chunk of code is generated differently, a label placed +dnl at the end doesn't work. Instead calculate the implied end using the +dnl start and how many chunks of code there are. + +deflit(UNROLL_INNER_END, +`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)') + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + movl PARAM_SIZE, %ecx + movl %ebx, (%edi) + + subl $4, %ecx + jz L(corner) + + negl %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %edi') +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + + movl %ecx, %edx + shll $4, %ecx + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + + + C The calculated jump mustn't come out to before the start of the + C code available. This is the limit UNROLL_COUNT puts on the src + C operand size, but checked here directly using the jump address. 
+ ASSERT(ae, + `movl_text_address(L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C------------------------------------------------------------------------------ + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, high of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier + movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand + + movl %edx, VAR_COUNTER + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + movl %edx, %ebx C high carry + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + + leal CODE_BYTES_PER_LIMB(%edx), %eax + xorl %edx, %edx + leal 4(%edi), %edi + + movl %eax, VAR_JMP + + jmp *%eax + + +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx + addl %edx, %ecx + ret +') + + + C Must be an even address to preserve the significance of the low + C bit of the jump address indicating which way around ecx/ebx should + C start. + ALIGN(2) + +L(unroll_inner_start): + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src - 4)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ebx + + mull %ebp + +Zdisp( addl, %ecx, disp_dst,(%edi)) + movl $0, %ecx + + adcl %eax, %ebx + +',` + dnl this bit comes out last +Zdisp( movl, disp_src,(%esi), %eax) + adcl %edx, %ecx + + mull %ebp + +dnl Zdisp( addl %ebx, disp_src,(%edi)) + addl %ebx, disp_dst(%edi) +ifelse(forloop_last,0, +` movl $0, %ebx') + + adcl %eax, %ecx +') +') + + C eax next limb + C ebx carry high + C ecx carry low + C edx scratch + C esi src + C edi dst + C ebp multiplier + + adcl $0, %edx + addl %ecx, -4+OFFSET(%edi) + movl VAR_JMP, %ecx + + adcl $0, %edx + + movl %edx, m4_empty_if_zero(OFFSET) (%edi) + movl VAR_COUNTER, %edx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C------------------------------------------------------------------------------ +L(corner): + C esi &src[size] + C edi &dst[2*size-5] + + movl -12(%esi), %ebp + movl -8(%esi), %eax + movl %eax, %ecx + + mull %ebp + + addl %eax, -4(%edi) + movl -4(%esi), %eax + + adcl $0, %edx + movl %edx, %ebx + movl %eax, %esi + + mull %ebp + + addl %ebx, %eax + + adcl $0, %edx + addl %eax, (%edi) + movl %esi, %eax + + adcl $0, %edx + movl %edx, %ebx + + mull %ecx + + addl %ebx, %eax + movl %eax, 4(%edi) + + adcl $0, %edx + movl %edx, 8(%edi) + + + +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
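C
C (Sketch, not in the original source.)  This shift is the doubling phase;
C with 32-bit limbs it is equivalent to
C
C	carry = 0;
C	for (i = 1; i <= 2*size-2; i++)
C	  { t = dst[i];  dst[i] = (t << 1) | carry;  carry = t >> 31; }
C	dst[2*size-1] = carry;
C
C done two limbs per iteration with rcll so the carry stays in the flags.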
+ +L(lshift_start): + movl PARAM_SIZE, %eax + movl PARAM_DST, %edi + xorl %ecx, %ecx C clear carry + + leal (%edi,%eax,8), %edi + notl %eax C -size-1, preserve carry + + leal 2(%eax), %eax C -(size-1) + +L(lshift): + C eax counter, negative + C ebx + C ecx + C edx + C esi + C edi dst, pointing just after last limb + C ebp + + rcll -4(%edi,%eax,8) + rcll (%edi,%eax,8) + incl %eax + jnz L(lshift) + + setc %al + + movl PARAM_SRC, %esi + movl %eax, -4(%edi) C dst most significant limb + + movl PARAM_SIZE, %ecx + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + + mull %eax + + leal (%esi,%ecx,4), %esi C src point just after last limb + negl %ecx + + movl %eax, (%edi,%ecx,8) C dst[0] + incl %ecx + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi src just after last limb + C edi dst just after last limb + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, -4(%edi,%ecx,8) + adcl %eax, (%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, -4(%edi) C dst most significant limb + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + addl $FRAME, %esp + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm new file mode 100644 index 0000000000..4735335cbe --- /dev/null +++ b/rts/gmp/mpn/x86/lshift.asm @@ -0,0 +1,90 @@ +dnl x86 mpn_lshift -- mpn left shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + subl $4,%esi C adjust src + + movl (%esi,%edx,4),%ebx C read most significant limb + xorl %eax,%eax + shldl( %cl, %ebx, %eax) C compute carry limb + decl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next lower limb + shldl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + decl %edx +L(1): movl (%esi,%edx,4),%eax + shldl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + decl %edx + jnz L(oop) + + shll %cl,%eax C compute least significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shll %cl,%ebx C compute least significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm new file mode 100644 index 0000000000..3908161b3e --- /dev/null +++ b/rts/gmp/mpn/x86/mod_1.asm @@ -0,0 +1,141 @@ +dnl x86 mpn_mod_1 -- mpn by limb remainder. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl cycles/limb +dnl K6 20 +dnl P5 44 +dnl P6 39 +dnl 486 approx 42 maybe +dnl +dnl The following have their own optimized mod_1 implementations, but for +dnl reference the code here runs as follows. +dnl +dnl P6MMX 39 +dnl K7 41 + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C Divide src,size by divisor and return the remainder. The quotient is +C discarded. +C +C See mpn/x86/divrem_1.asm for some comments. 
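C
C A minimal C equivalent of the plain divl scheme used below (a sketch,
C not part of the original source):
C
C	mp_limb_t r = 0;   /* or the carry parameter for mpn_mod_1c */
C	for (i = size-1; i >= 0; i--)
C	  udiv_qrnnd (q, r, r, src[i], divisor);   /* one divl per limb */
C	return r;
C
C where udiv_qrnnd(q,r,nh,nl,d) is the longlong.h double-limb division and
C q is discarded.  The entry code skips the first division when the high
C source limb is already less than the divisor.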
+ +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + movl PARAM_DIVISOR, %esi + orl %ecx, %ecx + + movl PARAM_CARRY, %edx + jnz LF(mpn_mod_1,top) + + popl %esi + movl %edx, %eax + + popl %ebx + + ret + +EPILOGUE() + + +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + pushl %ebx FRAME_pushl() + + movl PARAM_SRC, %ebx + pushl %esi FRAME_pushl() + + orl %ecx, %ecx + jz L(done_zero) + + movl PARAM_DIVISOR, %esi + movl -4(%ebx,%ecx,4), %eax C src high limb + + cmpl %esi, %eax + + sbbl %edx, %edx C -1 if high<divisor + + addl %edx, %ecx C skip one division if high<divisor + jz L(done_eax) + + andl %eax, %edx C carry if high<divisor + + +L(top): + C eax scratch (quotient) + C ebx src + C ecx counter + C edx carry (remainder) + C esi divisor + C edi + C ebp + + movl -4(%ebx,%ecx,4), %eax + + divl %esi + + loop_or_decljnz L(top) + + + movl %edx, %eax +L(done_eax): + popl %esi + + popl %ebx + + ret + + +L(done_zero): + popl %esi + xorl %eax, %eax + + popl %ebx + + ret + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm new file mode 100644 index 0000000000..8817f291bc --- /dev/null +++ b/rts/gmp/mpn/x86/mul_1.asm @@ -0,0 +1,130 @@ +dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector +dnl with a limb and store the result in a second limb vector. +dnl +dnl cycles/limb +dnl P6: 5.5 +dnl +dnl The following CPUs have their own optimized code, but for reference the +dnl code here runs as follows. +dnl +dnl cycles/limb +dnl P5: 12.5 +dnl K6: 10.5 +dnl K7: 4.5 + + +dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_mul_1) +deflit(`FRAME',0) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull PARAM_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + movl %eax,(%edi) + movl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_SIZE,%ecx + shrl $2,%ecx + jz L(end) + + + ALIGN(8) +L(oop): movl (%esi),%eax + mull PARAM_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebp,4(%edi) + addl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull PARAM_MULTIPLIER + movl %ebx,8(%edi) + addl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl %ebp,12(%edi) + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oop) + +L(end): movl %ebx,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm new file mode 100644 index 0000000000..3a9b73895b --- /dev/null +++ b/rts/gmp/mpn/x86/mul_basecase.asm @@ -0,0 +1,209 @@ +dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result +dnl in a third limb vector. + + +dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); +C +C This was written in a haste since the Pentium optimized code that was used +C for all x86 machines was slow for the Pentium II. This code would benefit +C from some cleanup. +C +C To shave off some percentage of the run-time, one should make 4 variants +C of the Louter loop, for the four different outcomes of un mod 4. That +C would avoid Loop0 altogether. Code expansion would be > 4-fold for that +C part of the function, but since it is not very large, that would be +C acceptable. +C +C The mul loop (at L(oopM)) might need some tweaking. It's current speed is +C unknown. 
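C
C (Orientation only, not in the original source.)  mpn_mul_1 above and the
C inner loops below share one unrolling pattern: a simple loop handles
C size mod 4 limbs, then the remaining groups of four go through the
C unrolled body.  Schematically,
C
C	for (i = 0; i < size % 4; i++)      /* L(oop0) */
C	  limb_step (i);
C	for (; i < size; i += 4)            /* L(oop) / L(oopX) */
C	  { limb_step (i); limb_step (i+1); limb_step (i+2); limb_step (i+3); }
C
C where limb_step stands in for the mul/accumulate work on one limb.  The
C four outer loop variants suggested above would instead enter the unrolled
C body at the point matching xsize mod 4, avoiding L(oop0) altogether.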
+ +defframe(PARAM_YSIZE,20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE,12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_MULTIPLIER, -4) +defframe(VAR_COUNTER, -8) +deflit(VAR_STACK_SPACE, 8) + + .text + ALIGN(8) + +PROLOGUE(mpn_mul_basecase) +deflit(`FRAME',0) + + subl $VAR_STACK_SPACE,%esp + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',eval(VAR_STACK_SPACE+12)) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + + leal 4(%esi),%esi + leal 4(%edi),%edi + +L(oopM): + movl (%esi),%eax C load next limb at xp[j] + leal 4(%esi),%esi + mull (%ebp) + addl %ebx,%eax + movl %edx,%ebx + adcl $0,%ebx + movl %eax,(%edi) + leal 4(%edi),%edi + decl %ecx + jnz L(oopM) + + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl PARAM_YSIZE,%eax C ysize + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + movl PARAM_YP,%ebp C yp + addl $4,%ebp C make ebp point to next v limb + movl %ebp,PARAM_YP + movl (%ebp),%eax C copy y limb ... + movl %eax,VAR_MULTIPLIER C ... to stack slot + movl PARAM_XSIZE,%ecx + + xorl %ebx,%ebx + andl $3,%ecx + jz L(end0) + +L(oop0): + movl (%esi),%eax + mull VAR_MULTIPLIER + leal 4(%esi),%esi + addl %ebx,%eax + movl $0,%ebx + adcl %ebx,%edx + addl %eax,(%edi) + adcl %edx,%ebx C propagate carry into cylimb + + leal 4(%edi),%edi + decl %ecx + jnz L(oop0) + +L(end0): + movl PARAM_XSIZE,%ecx + shrl $2,%ecx + jz L(endX) + + ALIGN(8) +L(oopX): + movl (%esi),%eax + mull VAR_MULTIPLIER + addl %eax,%ebx + movl $0,%ebp + adcl %edx,%ebp + + movl 4(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + movl 8(%esi),%eax + mull VAR_MULTIPLIER + addl %ebp,4(%edi) + adcl %eax,%ebx C new lo + cylimb + movl $0,%ebp + adcl %edx,%ebp + + movl 12(%esi),%eax + mull VAR_MULTIPLIER + addl %ebx,8(%edi) + adcl %eax,%ebp C new lo + cylimb + movl $0,%ebx + adcl %edx,%ebx + + addl %ebp,12(%edi) + adcl $0,%ebx C propagate carry into cylimb + + leal 16(%esi),%esi + leal 16(%edi),%edi + decl %ecx + jnz L(oopX) + +L(endX): + movl %ebx,(%edi) + addl $4,%edi + + C we incremented wp and xp in the loop above; compensate + movl PARAM_XSIZE,%eax + shll $2,%eax + subl %eax,%edi + subl %eax,%esi + + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + addl $8,%esp + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README new file mode 100644 index 0000000000..7dbc905a0d --- /dev/null +++ b/rts/gmp/mpn/x86/p6/README @@ -0,0 +1,95 @@ + + INTEL P6 MPN SUBROUTINES + + + +This directory contains code optimized for Intel P6 class CPUs, meaning +PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories +have routines using MMX instructions. + + + +STATUS + +Times for the loops, with all code and data in L1 cache, are as follows. +Some of these might be able to be improved. 
+ + cycles/limb + + mpn_add_n/sub_n 3.7 + + mpn_copyi 0.75 + mpn_copyd 2.4 + + mpn_divrem_1 39.0 + mpn_mod_1 39.0 + mpn_divexact_by3 8.5 + + mpn_mul_1 5.5 + mpn_addmul/submul_1 6.35 + + mpn_l/rshift 2.5 + + mpn_mul_basecase 8.2 cycles/crossproduct (approx) + mpn_sqr_basecase 4.0 cycles/crossproduct (approx) + or 7.75 cycles/triangleproduct (approx) + +Pentium II and III have MMX and get the following improvements. + + mpn_divrem_1 25.0 integer part, 17.5 fractional part + mpn_mod_1 24.0 + + mpn_l/rshift 1.75 + + + + +NOTES + +Write-allocate L1 data cache means prefetching of destinations is unnecessary. + +Mispredicted branches have a penalty of between 9 and 15 cycles, and even up +to 26 cycles depending how far speculative execution has gone. The 9 cycle +minimum penalty comes from the issue pipeline being 9 stages. + +A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4, +5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3 +cycles per 16 byte block. + + + + +CODING + +Instructions in general code have been shown grouped if they can execute +together, which means up to three instructions with no successive +dependencies, and with only the first being a multiple micro-op. + +P6 has out-of-order execution, so the groupings are really only showing +dependent paths where some shuffling might allow some latencies to be +hidden. + + + + +REFERENCES + +"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated +02/99, order number 245127 (order number 730795-001 is in the document too). +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/245127.htm + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is an older document mostly about P5 and not as good as the above. +Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm new file mode 100644 index 0000000000..feb364ec0b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm @@ -0,0 +1,300 @@ +dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. +dnl +dnl P6: 6.35 cycles/limb (at 16 limbs/loop). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl P6 UNROLL_COUNT cycles/limb +dnl 8 6.7 +dnl 16 6.35 +dnl 32 6.3 +dnl 64 6.3 +dnl Maximum possible with the current code is 64. 
+ +deflit(UNROLL_COUNT, 16) + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + define(M4_function_1c, mpn_addmul_1c) + define(M4_description, add it to) + define(M4_desc_retval, carry) +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + define(M4_function_1c, mpn_submul_1c) + define(M4_description, subtract it from) + define(M4_desc_retval, borrow) +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); +C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult, mp_limb_t carry); +C +C Calculate src,size multiplied by mult and M4_description dst,size. +C Return the M4_desc_retval limb from the top of the result. +C +C This code is pretty much the same as the K6 code. The unrolled loop is +C the same, but there's just a few scheduling tweaks in the setups and the +C simple loop. +C +C A number of variations have been tried for the unrolled loop, with one or +C two carries, and with loads scheduled earlier, but nothing faster than 6 +C cycles/limb has been found. + +ifdef(`PIC',` +deflit(UNROLL_THRESHOLD, 5) +',` +deflit(UNROLL_THRESHOLD, 5) +') + +defframe(PARAM_CARRY, 20) +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) + +PROLOGUE(M4_function_1c) + pushl %ebx +deflit(`FRAME',4) + movl PARAM_CARRY, %ebx + jmp LF(M4_function_1,start_nc) +EPILOGUE() + +PROLOGUE(M4_function_1) + push %ebx +deflit(`FRAME',4) + xorl %ebx, %ebx C initial carry + +L(start_nc): + movl PARAM_SIZE, %ecx + pushl %esi +deflit(`FRAME',8) + + movl PARAM_SRC, %esi + pushl %edi +deflit(`FRAME',12) + + movl PARAM_DST, %edi + pushl %ebp +deflit(`FRAME',16) + cmpl $UNROLL_THRESHOLD, %ecx + + movl PARAM_MULTIPLIER, %ebp + jae L(unroll) + + + C simple loop + C this is offset 0x22, so close enough to aligned +L(simple): + C eax scratch + C ebx carry + C ecx counter + C edx scratch + C esi src + C edi dst + C ebp multiplier + + movl (%esi), %eax + addl $4, %edi + + mull %ebp + + addl %ebx, %eax + adcl $0, %edx + + M4_inst %eax, -4(%edi) + movl %edx, %ebx + + adcl $0, %ebx + decl %ecx + + leal 4(%esi), %esi + jnz L(simple) + + + popl %ebp + popl %edi + + popl %esi + movl %ebx, %eax + + popl %ebx + ret + + + +C------------------------------------------------------------------------------ +C VAR_JUMP holds the computed jump temporarily because there's not enough +C registers when doing the mul for the initial two carry limbs. +C +C The add/adc for the initial carry in %ebx is necessary only for the +C mpn_add/submul_1c entry points. Duplicating the startup code to +C eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good +C idea. 
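C
C For reference (a sketch, not part of the original source), the operation
C being unrolled is simply
C
C	mp_limb_t carry = 0;   /* or the carry parameter for the _1c forms */
C	for (i = 0; i < size; i++)
C	  {
C	    umul_ppmm (hi, lo, src[i], mult);
C	    lo += carry;  hi += (lo < carry);
C	    /* addmul: dst[i] += lo;  submul: dst[i] -= lo;
C	       the carry or borrow out of that folds into hi */
C	    carry = hi;
C	  }
C	return carry;
C
C with umul_ppmm from longlong.h and hi,lo illustrative temporaries.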
+ +dnl overlapping with parameters already fetched +define(VAR_COUNTER,`PARAM_SIZE') +define(VAR_JUMP, `PARAM_DST') + + C this is offset 0x43, so close enough to aligned +L(unroll): + C eax + C ebx initial carry + C ecx size + C edx + C esi src + C edi dst + C ebp + + movl %ecx, %edx + decl %ecx + + subl $2, %edx + negl %ecx + + shrl $UNROLL_LOG2, %edx + andl $UNROLL_MASK, %ecx + + movl %edx, VAR_COUNTER + movl %ecx, %edx + + C 15 code bytes per limb +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + shll $4, %edx + negl %ecx + + leal L(entry) (%edx,%ecx,1), %edx +') + movl (%esi), %eax C src low limb + + movl %edx, VAR_JUMP + leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi + + mull %ebp + + addl %ebx, %eax C initial carry (from _1c) + adcl $0, %edx + + movl %edx, %ebx C high carry + leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi + + movl VAR_JUMP, %edx + testl $1, %ecx + movl %eax, %ecx C low carry + + cmovnz( %ebx, %ecx) C high,low carry other way around + cmovnz( %eax, %ebx) + + jmp *%edx + + +ifdef(`PIC',` +L(pic_calc): + shll $4, %edx + negl %ecx + + C See README.family about old gas bugs + leal (%edx,%ecx,1), %edx + addl $L(entry)-L(here), %edx + + addl (%esp), %edx + + ret +') + + +C ----------------------------------------------------------- + ALIGN(32) +L(top): +deflit(`FRAME',16) + C eax scratch + C ebx carry hi + C ecx carry lo + C edx scratch + C esi src + C edi dst + C ebp multiplier + C + C VAR_COUNTER loop counter + C + C 15 code bytes per limb + + addl $UNROLL_BYTES, %edi + +L(entry): +deflit(CHUNK_COUNT,2) +forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, ` + deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128))) + deflit(`disp1', eval(disp0 + 4)) + +Zdisp( movl, disp0,(%esi), %eax) + mull %ebp +Zdisp( M4_inst,%ecx, disp0,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx + + movl disp1(%esi), %eax + mull %ebp + M4_inst %ebx, disp1(%edi) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +') + + decl VAR_COUNTER + leal UNROLL_BYTES(%esi), %esi + + jns L(top) + + +deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))) + + M4_inst %ecx, disp0(%edi) + movl %ebx, %eax + + popl %ebp + popl %edi + + popl %esi + popl %ebx + adcl $0, %eax + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm new file mode 100644 index 0000000000..a77703ea89 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/diveby3.asm @@ -0,0 +1,37 @@ +dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P6: 8.5 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +dnl The P5 code runs well on P6, in fact better than anything else found so +dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the +dnl dependent path are taking 4.5 cycles. +dnl +dnl The destination cache line prefetching is unnecessary on P6, but +dnl removing it is a 2 cycle slowdown (approx), so it must be inducing +dnl something good in the out of order execution. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_divexact_by3c) +include_mpn(`x86/pentium/diveby3.asm') diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h new file mode 100644 index 0000000000..d7bfb6d60c --- /dev/null +++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h @@ -0,0 +1,96 @@ +/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 5 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 39 /* cycles */ +#endif + +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 23 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 139 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 52 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 166 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 116 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 66 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 20 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 54 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 608 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 5888 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 672 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 5888 +#endif diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm new file mode 100644 index 0000000000..f1b011b623 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm @@ -0,0 +1,677 @@ +dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division. +dnl +dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part. 
+ + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C +C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm, +C see that file for some comments. It's likely what's here can be improved. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The different speeds of the integer and fraction parts means that using +dnl xsize+size isn't quite right. The threshold wants to be a bit higher +dnl for the integer part and a bit lower for the fraction part. (Or what's +dnl really wanted is to speed up the integer part!) +dnl +dnl The threshold is set to make the integer part right. At 4 limbs the +dnl div and mul are about the same there, but on the fractional part the +dnl mul is much faster. 
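dnl
dnl  (Sketch for orientation, not in the original source.)  In the plain
dnl  divl scheme the "integer part" divides the size source limbs and the
dnl  "fraction part" develops xsize further quotient limbs from zeros:
dnl
dnl	for (i = size-1; i >= 0; i--)      /* integer part */
dnl	  udiv_qrnnd (dst[xsize+i], r, r, src[i], divisor);
dnl	for (i = xsize-1; i >= 0; i--)     /* fraction part */
dnl	  udiv_qrnnd (dst[i], r, r, 0, divisor);
dnl	return r;
dnl
dnl  with r starting at zero (or the carry parameter) and udiv_qrnnd from
dnl  longlong.h.  The threshold below trades this against the
dnl  multiply-by-inverse loops that follow.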
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 24) +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC, -28) +defframe(VAR_DST, -32) +defframe(VAR_DST_STOP,-36) + +deflit(STACK_SPACE, 36) + + .text + ALIGN(16) + +PROLOGUE(mpn_divrem_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + leal -4(%edi,%ebx,4), %edi + jmp LF(mpn_divrem_1,start_1c) + +EPILOGUE() + + + C offset 0x31, close enough to aligned +PROLOGUE(mpn_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + movl $0, %edx C initial carry (if can't skip a div) + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + orl %ecx, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal -4(%edi,%ebx,4), %edi C &dst[xsize-1] + jz L(no_skip_div) + + movl -4(%esi,%ecx,4), %eax C src high limb + cmpl %ebp, %eax C one less div if high<divisor + jnb L(no_skip_div) + + movl $0, (%edi,%ecx,4) C dst high limb + decl %ecx C size-1 + movl %eax, %edx C src high limb as initial carry +L(no_skip_div): + + +L(start_1c): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal (%ebx,%ecx), %eax C size+xsize + cmpl $MUL_THRESHOLD, %eax + jae L(mul_by_inverse) + + orl %ecx, %ecx + jz L(divide_no_integer) + +L(divide_integer): + C eax scratch (quotient) + C ebx xsize + C ecx counter + C edx scratch (remainder) + C esi src + C edi &dst[xsize-1] + C ebp divisor + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + movl %eax, (%edi,%ecx,4) + decl %ecx + jnz L(divide_integer) + + +L(divide_no_integer): + movl PARAM_DST, %edi + orl %ebx, %ebx + jnz L(divide_fraction) + +L(divide_done): + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBX, %ebx + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + +L(divide_fraction): + C eax scratch (quotient) + C ebx counter + C ecx + C edx scratch (remainder) + C esi + C edi dst + C ebp divisor + + movl $0, %eax + + divl %ebp + + movl %eax, -4(%edi,%ebx,4) + decl %ebx + jnz L(divide_fraction) + + jmp L(divide_done) + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx xsize + C ecx size + C edx carry + C esi src + C edi &dst[xsize-1] + C ebp divisor + + leal 12(%edi), %ebx + + movl %ebx, VAR_DST_STOP + leal 4(%edi,%ecx,4), %edi C &dst[xsize+size] + + movl %edi, VAR_DST + movl %ecx, %ebx C size + + bsrl %ebp, %ecx C 31-l + movl %edx, %edi C carry + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + movl $-1, %edx + + shll %cl, %ebp C d normalized + movd %eax, %mm7 + + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + movl %eax, VAR_INVERSE + orl %ebx, %ebx C size + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl %eax, VAR_SRC + jz L(start_zero) + + movl 8(%eax), %esi C src high limb + cmpl $1, %ebx + jz 
L(start_one) + +L(start_two_or_more): + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + cmpl $2, %ebx + je L(integer_two_left) + jmp L(integer_top) + + +L(start_one): + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shll %cl, %esi C n10 = high << l + jmp L(integer_one_left) + + +L(start_zero): + shll %cl, %edi C n2 = carry << l + movl $0, %esi C n10 = 0 + + C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then + C must have xsize!=0 + jmp L(fraction_some) + + + +C ----------------------------------------------------------------------------- +C +C This loop runs at about 25 cycles, which is probably sub-optimal, and +C certainly more than the dependent chain would suggest. A better loop, or +C a better rough analysis of what's possible, would be welcomed. +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C n2+n1 1 (addl) +C mul 5 +C q1+1 3 (addl/adcl) +C mul 5 +C sub 3 (subl/sbbl) +C addback 2 (cmov) +C --- +C 19 +C +C Lack of registers hinders explicit scheduling and it might be that the +C normal out of order execution isn't able to hide enough under the mul +C latencies. +C +C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than +C cmov (and takes one uop off the dependent chain). A sarl/andl/addl +C combination was tried for the addback (despite the fact it would lengthen +C the dependent chain) but found to be no faster. + + + ALIGN(16) +L(integer_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp d + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + movq (%ecx), %mm0 C next src limb and the one below it + + mull VAR_INVERSE C m*(n2+n1) + + subl $4, %ecx + + movl %ecx, VAR_SRC + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + movl %ebp, %eax C d + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + movl VAR_DST, %ecx + psrlq %mm7, %mm0 + + C + + C + + C + + subl %eax, %esi + movl VAR_DST_STOP, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + subl $4, %ecx + + movl %ebx, (%ecx) + cmpl %eax, %ecx + + movl %ecx, VAR_DST + jne L(integer_top) + + +L(integer_loop_done): + + +C ----------------------------------------------------------------------------- +C +C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz +C q1_ff special case. This make the code a bit smaller and simpler, and +C costs only 2 cycles (each). 
+ +L(integer_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (src, dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl PARAM_SRC, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd (%ecx), %mm0 C src low limb + + movl VAR_DST_STOP, %ecx + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + sbbl $0, %ebx C q + + movl %ebx, -4(%ecx) + + +C ----------------------------------------------------------------------------- +L(integer_one_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx scratch (dst) + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + movl VAR_DST_STOP, %ecx + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + C + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx C q1 if q1+1 overflowed + + mull %ebx + + C + + C + + C + + C + + subl %eax, %esi + movl PARAM_XSIZE, %eax + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + + movl %ebx, -8(%ecx) + subl $8, %ecx + + + + orl %eax, %eax C xsize + jnz L(fraction_some) + + movl %edi, %eax +L(fraction_done): + movl VAR_NORM, %ecx + movl SAVE_EBP, %ebp + + movl SAVE_EDI, %edi + + movl SAVE_ESI, %esi + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + shrl %cl, %eax + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx + C edx + C esi n10 + C edi n2 + C ebp divisor + + movl VAR_DST, %ecx + movl VAR_DST_STOP, %edx + subl $4, %ecx + + movl %ecx, VAR_DST + psrlq %mm7, %mm0 + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + + movl $-1, (%ecx) + movd %mm0, %esi C next n10 + + cmpl %ecx, %edx + jne L(integer_top) + + jmp L(integer_loop_done) + + + +C ----------------------------------------------------------------------------- +C +C In the current implementation, the following successively dependent +C micro-ops seem to exist. +C +C uops +C mul 5 +C q1+1 1 (addl) +C mul 5 +C sub 3 (negl/sbbl) +C addback 2 (cmov) +C --- +C 16 +C +C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for +C the addback was found to be a touch slower. 
+ + + ALIGN(16) +L(fraction_some): + C eax + C ebx + C ecx + C edx + C esi + C edi carry + C ebp divisor + + movl PARAM_DST, %esi + movl VAR_DST_STOP, %ecx + movl %edi, %eax + + subl $8, %ecx + + + ALIGN(16) +L(fraction_top): + C eax n2, then scratch + C ebx scratch (nadj, q1) + C ecx dst, decrementing + C edx scratch + C esi dst stop point + C edi n2 + C ebp divisor + + mull VAR_INVERSE C m*n2 + + movl %ebp, %eax C d + subl $4, %ecx C dst + leal 1(%edi), %ebx + + C + + C + + C + + addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1 + + mull %ebx C (q1+1)*d + + C + + C + + C + + C + + negl %eax C low of n - (q1+1)*d + + sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry + leal (%ebp,%eax), %edx + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + + sbbl $0, %ebx C q + movl %eax, %edi C remainder->n2 + cmpl %esi, %ecx + + movl %ebx, (%ecx) C previous q + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm new file mode 100644 index 0000000000..e7d8d94d33 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm @@ -0,0 +1,444 @@ +dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder. +dnl +dnl P6MMX: 24.0 cycles/limb. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor); +C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t carry); +C +C The code here very similar to mpn_divrem_1, but with the quotient +C discarded. What's here probably isn't optimal. +C +C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some +C comments. + + +dnl MUL_THRESHOLD is the size at which the multiply by inverse method is +dnl used, rather than plain "divl"s. Minimum value 2. 
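Below the threshold the remainder is formed with one divl per limb, working down from the most significant end.  A C sketch of that path (udiv_qrnnd is the longlong.h macro; the helper name and the initial-carry handling are illustrative):

/* carry < divisor on entry; for mpn_mod_1 it starts at 0 and a division
   is skipped when the high limb is already smaller than the divisor.  */
static mp_limb_t
plain_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t carry)
{
  mp_limb_t q;
  if (carry == 0 && size != 0 && src[size-1] < d)
    carry = src[--size];
  while (size != 0)
    {
      size--;
      udiv_qrnnd (q, carry, carry, src[size], d);   /* q is discarded */
    }
  return carry;
}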
+ +deflit(MUL_THRESHOLD, 4) + + +defframe(PARAM_CARRY, 16) +defframe(PARAM_DIVISOR,12) +defframe(PARAM_SIZE, 8) +defframe(PARAM_SRC, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) + +defframe(VAR_NORM, -20) +defframe(VAR_INVERSE, -24) +defframe(VAR_SRC_STOP,-28) + +deflit(STACK_SPACE, 28) + + .text + ALIGN(16) + +PROLOGUE(mpn_mod_1c) +deflit(`FRAME',0) + movl PARAM_CARRY, %edx + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + jmp LF(mpn_mod_1,start_1c) + +EPILOGUE() + + + ALIGN(16) +PROLOGUE(mpn_mod_1) +deflit(`FRAME',0) + + movl $0, %edx C initial carry (if can't skip a div) + movl PARAM_SIZE, %ecx + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + orl %ecx, %ecx + jz L(divide_done) + + movl -4(%esi,%ecx,4), %eax C src high limb + + cmpl %ebp, %eax C carry flag if high<divisor + + cmovc( %eax, %edx) C src high limb as initial carry + sbbl $0, %ecx C size-1 to skip one div + jz L(divide_done) + + + ALIGN(16) +L(start_1c): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + cmpl $MUL_THRESHOLD, %ecx + jae L(mul_by_inverse) + + + orl %ecx, %ecx + jz L(divide_done) + + +L(divide_top): + C eax scratch (quotient) + C ebx + C ecx counter, limbs, decrementing + C edx scratch (remainder) + C esi src + C edi + C ebp + + movl -4(%esi,%ecx,4), %eax + + divl %ebp + + decl %ecx + jnz L(divide_top) + + +L(divide_done): + movl SAVE_ESI, %esi + movl %edx, %eax + + movl SAVE_EBP, %ebp + addl $STACK_SPACE, %esp + + ret + + + +C ----------------------------------------------------------------------------- + +L(mul_by_inverse): + C eax + C ebx + C ecx size + C edx carry + C esi src + C edi + C ebp divisor + + movl %ebx, SAVE_EBX + leal -4(%esi), %ebx + + movl %ebx, VAR_SRC_STOP + movl %ecx, %ebx C size + + movl %edi, SAVE_EDI + movl %edx, %edi C carry + + bsrl %ebp, %ecx C 31-l + movl $-1, %edx + + leal 1(%ecx), %eax C 32-l + xorl $31, %ecx C l + + movl %ecx, VAR_NORM + shll %cl, %ebp C d normalized + + movd %eax, %mm7 + movl $-1, %eax + subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1 + + divl %ebp C floor (b*(b-d)-1) / d + + C + + movl %eax, VAR_INVERSE + leal -12(%esi,%ebx,4), %eax C &src[size-3] + + movl 8(%eax), %esi C src high limb + movl 4(%eax), %edx C src second highest limb + + shldl( %cl, %esi, %edi) C n2 = carry,high << l + + shldl( %cl, %edx, %esi) C n10 = high,second << l + + movl %eax, %ecx C &src[size-3] + + +ifelse(MUL_THRESHOLD,2,` + cmpl $2, %ebx + je L(inverse_two_left) +') + + +C The dependent chain here is the same as in mpn_divrem_1, but a few +C instructions are saved by not needing to store the quotient limbs. This +C gets it down to 24 c/l, which is still a bit away from a theoretical 19 +C c/l. 
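The setup just above gets the inverse with a single division.  In C terms (a sketch; count_leading_zeros and udiv_qrnnd are the longlong.h macros, b = 2^32, and the variable names are illustrative):

/* Normalize d so its high bit is set, then m = floor((b*(b-d)-1)/d).
   Note ~d = (b-d)-1 and ~0 = b-1, so the two-limb numerator b*(b-d)-1
   is simply ~d:~0, which is what the movl $-1 / subl above build.  */
count_leading_zeros (norm, d);                   /* VAR_NORM */
d <<= norm;
udiv_qrnnd (m, dummy, ~d, ~(mp_limb_t) 0, d);    /* VAR_INVERSE */

The loop that follows then repeats the per-limb step described for mpn_divrem_1, with the quotient limb simply not stored.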
+ + ALIGN(16) +L(inverse_top): + C eax scratch + C ebx scratch (nadj, q1) + C ecx src pointer, decrementing + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src qword) + C mm7 rshift for normalization + + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movq (%ecx), %mm0 C next src limb and the one below it + subl $4, %ecx + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + movl %ebp, %eax C d + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + jz L(q1_ff) + + mull %ebx C (q1+1)*d + + psrlq %mm7, %mm0 + movl VAR_SRC_STOP, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + cmpl %ebx, %ecx + + jne L(inverse_top) + + +L(inverse_loop_done): + + +C ----------------------------------------------------------------------------- + +L(inverse_two_left): + C eax scratch + C ebx scratch (nadj, q1) + C ecx &src[-1] + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 scratch (src dword) + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movd 4(%ecx), %mm0 C src low limb + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + psllq $32, %mm0 + + psrlq %mm7, %mm0 + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + movl %esi, %edi C remainder -> n2 + leal (%ebp,%esi), %edx + + cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 + movd %mm0, %esi + + +C One limb left + + C eax scratch + C ebx scratch (nadj, q1) + C ecx + C edx scratch + C esi n10 + C edi n2 + C ebp divisor + C + C mm0 src limb, shifted + C mm7 rshift + + movl %esi, %eax + movl %ebp, %ebx + + sarl $31, %eax C -n1 + + andl %eax, %ebx C -n1 & d + negl %eax C n1 + + addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow + addl %edi, %eax C n2+n1 + + mull VAR_INVERSE C m*(n2+n1) + + movl VAR_NORM, %ecx C for final denorm + + C + + C + + C + + addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag + leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)) + + adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1 + + sbbl $0, %ebx + movl %ebp, %eax C d + + mull %ebx C (q1+1)*d + + movl SAVE_EBX, %ebx + + C + + C + + C + + subl %eax, %esi + + sbbl %edx, %edi C n - (q1+1)*d + leal (%ebp,%esi), %edx + movl SAVE_EBP, %ebp + + movl %esi, %eax C remainder + movl SAVE_ESI, %esi + + cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1 + movl SAVE_EDI, %edi + + shrl %cl, %eax C denorm remainder + addl $STACK_SPACE, %esp + emms + + ret + + +C ----------------------------------------------------------------------------- +C +C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword +C of q*d is simply -d and the remainder n-q*d = n10+d + +L(q1_ff): + C eax (divisor) + C ebx (q1+1 == 0) + C ecx src pointer + C edx + C esi n10 + C edi (n2) + C 
ebp divisor + + leal (%ebp,%esi), %edi C n-q*d remainder -> next n2 + movl VAR_SRC_STOP, %edx + psrlq %mm7, %mm0 + + movd %mm0, %esi C next n10 + cmpl %ecx, %edx + jne L(inverse_top) + + jmp L(inverse_loop_done) + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm new file mode 100644 index 0000000000..50f9a11218 --- /dev/null +++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm @@ -0,0 +1,31 @@ +dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. +dnl +dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb +dnl (approx) + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm new file mode 100644 index 0000000000..e63fbf334b --- /dev/null +++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and +dnl hamming distance. + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Haven't actually measured it, but the K7 code with the psadbw should be +dnl good on P-III. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k7/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm new file mode 100644 index 0000000000..174c78406a --- /dev/null +++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm @@ -0,0 +1,641 @@ +dnl Intel P6 mpn_sqr_basecase -- square an mpn number. 
+dnl +dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular +dnl product (measured on the speed difference between 20 and 40 limbs, +dnl which is the Karatsuba recursing range). + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for +dnl a description. The only difference here is that UNROLL_COUNT can go up +dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67. + +deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67) + +ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE', +`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)') + +m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD') +deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3)) + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the given size +C is small. +C +C The code size might look a bit excessive, but not all of it is executed so +C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases +C clearly apply only to those sizes; mid sizes like 10x10 only need part of +C the unrolled addmul; and big sizes like 40x40 that do use the full +C unrolling will least be making good use of it, because 40x40 will take +C something like 7000 cycles. 
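For sizes beyond the special cases (size >= 2 assumed here), the shape of the computation is the triangle of cross products, a doubling, then the diagonal squares, as in the generic C routine mentioned above.  A rough model using the mpn entry points follows; it is illustrative only, the real code below inlines and unrolls all of this.

static void
sqr_basecase_model (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t i;
  mp_limb_t cy;

  /* cross products src[i]*src[j], i<j, summed into dst[1..2*size-2] */
  dst[size] = mpn_mul_1 (dst + 1, src + 1, size - 1, src[0]);
  for (i = 1; i < size - 1; i++)
    dst[size + i] = mpn_addmul_1 (dst + 2*i + 1, src + i + 1,
                                  size - i - 1, src[i]);

  /* each cross product appears twice in the square: shift left one bit,
     the bit shifted out becoming dst[2*size-1] */
  dst[2*size - 1] = mpn_lshift (dst + 1, dst + 1, 2*size - 2, 1);

  /* add the diagonal squares src[i]^2 at dst[2*i], dst[2*i+1];
     dst[0] just gets the low limb of src[0]^2 */
  dst[0] = 0;
  cy = 0;
  for (i = 0; i < size; i++)
    {
      mp_limb_t hi, lo;
      umul_ppmm (hi, lo, src[i], src[i]);
      lo += cy;            cy  = (lo < cy);
      lo += dst[2*i];      cy += (lo < dst[2*i]);
      hi += cy;            cy  = (hi < cy);
      hi += dst[2*i + 1];  cy += (hi < dst[2*i + 1]);
      dst[2*i] = lo;
      dst[2*i + 1] = hi;
    }
}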
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(32) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + + +C ----------------------------------------------------------------------------- +C one limb only + C eax src limb + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + + +C ----------------------------------------------------------------------------- +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx + +defframe(SAVE_ESI, -4) +defframe(SAVE_EBX, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(`STACK_SPACE',16) + + subl $STACK_SPACE, %esp +deflit(`FRAME',STACK_SPACE) + + movl %esi, SAVE_ESI + movl %eax, %esi + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl 4(%esi), %eax + + movl %ebx, SAVE_EBX + movl %edx, %ebx C dst[1] + + mull %eax C src[1]^2 + + movl %edi, SAVE_EDI + movl %eax, %edi C dst[2] + movl (%esi), %eax + + movl %ebp, SAVE_EBP + movl %edx, %ebp C dst[3] + + mull 4(%esi) C src[0]*src[1] + + addl %eax, %ebx + movl SAVE_ESI, %esi + + adcl %edx, %edi + + adcl $0, %ebp + addl %ebx, %eax + movl SAVE_EBX, %ebx + + adcl %edi, %edx + movl SAVE_EDI, %edi + + adcl $0, %ebp + + movl %eax, 4(%ecx) + + movl %ebp, 12(%ecx) + movl SAVE_EBP, %ebp + + movl %edx, 8(%ecx) + addl $FRAME, %esp + + ret + + +C ----------------------------------------------------------------------------- +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size +deflit(`FRAME',0) + + pushl %esi defframe_pushl(`SAVE_ESI') + cmpl $4, %edx + + movl PARAM_SRC, %esi + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + + C eax src low limb + C ebx + C ecx dst + C edx + C esi src + C edi + C ebp + + pushl %ebp defframe_pushl(`SAVE_EBP') + pushl %edi defframe_pushl(`SAVE_EDI') + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%esi), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + movl 8(%esi), %eax + + pushl %ebx defframe_pushl(`SAVE_EBX') + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%esi), %eax + + mull 4(%esi) C src[0] * src[1] + + movl %eax, %ebx + movl %edx, %edi + + movl (%esi), %eax + + mull 8(%esi) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%esi), %eax + + mull 8(%esi) C src[1] * src[2] + + xorl %esi, %esi + addl %eax, %ebp + + C eax + C ebx dst[1] + C ecx dst + C edx dst[4] + C esi zero, will be dst[5] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %ebx, %ebx + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %esi + addl %ebx, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %ebx + + adcl %ebp, %ebx + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl SAVE_EBP, %ebp + + movl %ebx, 12(%ecx) + movl SAVE_EBX, %ebx + + adcl %edx, %edi + movl 20(%ecx), %eax + + movl %edi, 16(%ecx) + movl SAVE_EDI, %edi + + adcl %esi, %eax C no carry out of this + movl SAVE_ESI, %esi + + movl %eax, 20(%ecx) + addl $FRAME, %esp + + ret + + + +C ----------------------------------------------------------------------------- +defframe(VAR_COUNTER,-20) +defframe(VAR_JMP, -24) +deflit(`STACK_SPACE',24) + +L(four_or_more): + C eax src low 
limb + C ebx + C ecx + C edx size + C esi src + C edi + C ebp +deflit(`FRAME',4) dnl %esi already pushed + +C First multiply src[0]*src[1..size-1] and store at dst[1..size]. + + subl $STACK_SPACE-FRAME, %esp +deflit(`FRAME',STACK_SPACE) + movl $1, %ecx + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl %ebx, SAVE_EBX + subl %edx, %ecx C -(size-1) + + movl %ebp, SAVE_EBP + movl $0, %ebx C initial carry + + leal (%esi,%edx,4), %esi C &src[size] + movl %eax, %ebp C multiplier + + leal -4(%edi,%edx,4), %edi C &dst[size-1] + + +C This loop runs at just over 6 c/l. + +L(mul_1): + C eax scratch + C ebx carry + C ecx counter, limbs, negative, -(size-1) to -1 + C edx scratch + C esi &src[size] + C edi &dst[size-1] + C ebp multiplier + + movl %ebp, %eax + + mull (%esi,%ecx,4) + + addl %ebx, %eax + movl $0, %ebx + + adcl %edx, %ebx + movl %eax, 4(%edi,%ecx,4) + + incl %ecx + jnz L(mul_1) + + + movl %ebx, 4(%edi) + + +C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2. +C +C The last two addmuls, which are the bottom right corner of the product +C triangle, are left to the end. These are src[size-3]*src[size-2,size-1] +C and src[size-2]*src[size-1]. If size is 4 then it's only these corner +C cases that need to be done. +C +C The unrolled code is the same as mpn_addmul_1(), see that routine for some +C comments. +C +C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive. +C +C VAR_JMP is the computed jump into the unrolled code, stepped by one code +C chunk each outer loop. + +dnl This is also hard-coded in the address calculation below. +deflit(CODE_BYTES_PER_LIMB, 15) + +dnl With &src[size] and &dst[size-1] pointers, the displacements in the +dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above +dnl that an offset must be added to them. +deflit(OFFSET, +ifelse(eval(UNROLL_COUNT>32),1, +eval((UNROLL_COUNT-32)*4), +0)) + + C eax + C ebx carry + C ecx + C edx + C esi &src[size] + C edi &dst[size-1] + C ebp + + movl PARAM_SIZE, %ecx + + subl $4, %ecx + jz L(corner) + + movl %ecx, %edx + negl %ecx + + shll $4, %ecx +ifelse(OFFSET,0,,`subl $OFFSET, %esi') + +ifdef(`PIC',` + call L(pic_calc) +L(here): +',` + leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx +') + negl %edx + +ifelse(OFFSET,0,,`subl $OFFSET, %edi') + + C The calculated jump mustn't be before the start of the available + C code. This is the limit that UNROLL_COUNT puts on the src operand + C size, but checked here using the jump address directly. 
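For reference, the arithmetic behind that calculation: with k = size-4, the negl/shll $4 and the two-register leal work out to L(unroll_inner_end) - 2*CODE_BYTES_PER_LIMB - (16*k - k), the multiply by CODE_BYTES_PER_LIMB = 15 being done as 16*k - k.  The outer loop adds a further CODE_BYTES_PER_LIMB before jumping, so the first pass appears to enter 15*(size-3) bytes before L(unroll_inner_end), executing size-3 unrolled chunks (the first multiplicand limb having been handled by the mull in the outer loop header), and each later pass enters one chunk further in.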
+ + ASSERT(ae, + `movl_text_address( L(unroll_inner_start), %eax) + cmpl %eax, %ecx') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(unroll_outer_top): + C eax + C ebx high limb to store + C ecx VAR_JMP + C edx VAR_COUNTER, limbs, negative + C esi &src[size], constant + C edi dst ptr, second highest limb of last addmul + C ebp + + movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier + movl %edx, VAR_COUNTER + + movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand + + mull %ebp + +define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')') + + testb $1, %cl + + movl %edx, %ebx C high carry + leal 4(%edi), %edi + + movl %ecx, %edx C jump + + movl %eax, %ecx C low carry + leal CODE_BYTES_PER_LIMB(%edx), %edx + + cmovX( %ebx, %ecx) C high carry reverse + cmovX( %eax, %ebx) C low carry reverse + movl %edx, VAR_JMP + jmp *%edx + + + C Must be on an even address here so the low bit of the jump address + C will indicate which way around ecx/ebx should start. + + ALIGN(2) + +L(unroll_inner_start): + C eax scratch + C ebx carry high + C ecx carry low + C edx scratch + C esi src pointer + C edi dst pointer + C ebp multiplier + C + C 15 code bytes each limb + C ecx/ebx reversed on each chunk + +forloop(`i', UNROLL_COUNT, 1, ` + deflit(`disp_src', eval(-i*4 + OFFSET)) + deflit(`disp_dst', eval(disp_src)) + + m4_assert(`disp_src>=-128 && disp_src<128') + m4_assert(`disp_dst>=-128 && disp_dst<128') + +ifelse(eval(i%2),0,` +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ebx, disp_dst,(%edi)) + adcl %eax, %ecx + movl %edx, %ebx + adcl $0, %ebx +',` + dnl this one comes out last +Zdisp( movl, disp_src,(%esi), %eax) + mull %ebp +Zdisp( addl, %ecx, disp_dst,(%edi)) + adcl %eax, %ebx + movl %edx, %ecx + adcl $0, %ecx +') +') +L(unroll_inner_end): + + addl %ebx, m4_empty_if_zero(OFFSET)(%edi) + + movl VAR_COUNTER, %edx + adcl $0, %ecx + + movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi) + movl VAR_JMP, %ecx + + incl %edx + jnz L(unroll_outer_top) + + +ifelse(OFFSET,0,,` + addl $OFFSET, %esi + addl $OFFSET, %edi +') + + +C ----------------------------------------------------------------------------- + ALIGN(16) +L(corner): + C eax + C ebx + C ecx + C edx + C esi &src[size] + C edi &dst[2*size-5] + C ebp + + movl -12(%esi), %eax + + mull -8(%esi) + + addl %eax, (%edi) + movl -12(%esi), %eax + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + addl %eax, %ebx + movl -8(%esi), %eax + + adcl $0, %edx + + addl %ebx, 4(%edi) + movl $0, %ebx + + adcl %edx, %ebx + + mull -4(%esi) + + movl PARAM_SIZE, %ecx + addl %ebx, %eax + + adcl $0, %edx + + movl %eax, 8(%edi) + + movl %edx, 12(%edi) + movl PARAM_DST, %edi + + +C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1]. + + subl $1, %ecx C size-1 + xorl %eax, %eax C ready for final adcl, and clear carry + + movl %ecx, %edx + movl PARAM_SRC, %esi + + +L(lshift): + C eax + C ebx + C ecx counter, size-1 to 1 + C edx size-1 (for later use) + C esi src (for later use) + C edi dst, incrementing + C ebp + + rcll 4(%edi) + rcll 8(%edi) + + leal 8(%edi), %edi + decl %ecx + jnz L(lshift) + + + adcl %eax, %eax + + movl %eax, 4(%edi) C dst most significant limb + movl (%esi), %eax C src[0] + + leal 4(%esi,%edx,4), %esi C &src[size] + subl %edx, %ecx C -(size-1) + + +C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. 
+ + + mull %eax + + movl %eax, (%edi,%ecx,8) C dst[0] + + +L(diag): + C eax scratch + C ebx scratch + C ecx counter, negative + C edx carry + C esi &src[size] + C edi dst[2*size-2] + C ebp + + movl (%esi,%ecx,4), %eax + movl %edx, %ebx + + mull %eax + + addl %ebx, 4(%edi,%ecx,8) + adcl %eax, 8(%edi,%ecx,8) + adcl $0, %edx + + incl %ecx + jnz L(diag) + + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + + addl %edx, 4(%edi) C dst most significant limb + + movl SAVE_EDI, %edi + movl SAVE_EBP, %ebp + addl $FRAME, %esp + ret + + + +C ----------------------------------------------------------------------------- +ifdef(`PIC',` +L(pic_calc): + addl (%esp), %ecx + addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx + addl %edx, %ecx + ret +') + + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README new file mode 100644 index 0000000000..3b9ec8ac6f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/README @@ -0,0 +1,77 @@ + + INTEL PENTIUM P5 MPN SUBROUTINES + + +This directory contains mpn functions optimized for Intel Pentium (P5,P54) +processors. The mmx subdirectory has code for Pentium with MMX (P55). + + +STATUS + + cycles/limb + + mpn_add_n/sub_n 2.375 + + mpn_copyi/copyd 1.0 + + mpn_divrem_1 44.0 + mpn_mod_1 44.0 + mpn_divexact_by3 15.0 + + mpn_l/rshift 5.375 normal (6.0 on P54) + 1.875 special shift by 1 bit + + mpn_mul_1 13.0 + mpn_add/submul_1 14.0 + + mpn_mul_basecase 14.2 cycles/crossproduct (approx) + + mpn_sqr_basecase 8 cycles/crossproduct (approx) + or 15.5 cycles/triangleproduct (approx) + +Pentium MMX gets the following improvements + + mpn_l/rshift 1.75 + + +1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the +documentation indicates that they should take only 43/8 = 5.375 cycles/limb, +or 5 cycles/limb asymptotically. The P55 runs them at the expected speed. + +2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop +overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. + +3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they +should. Intel documentation says a mul instruction is 10 cycles, but it +measures 9 and the routines using it run with it as 9. + + + +RELEVANT OPTIMIZATION ISSUES + +1. Pentium doesn't allocate cache lines on writes, unlike most other modern +processors. Since the functions in the mpn class do array writes, we have to +handle allocating the destination cache lines by reading a word from it in the +loops, to achieve the best performance. + +2. Pairing of memory operations requires that the two issued operations refer +to different cache banks. The simplest way to insure this is to read/write +two words from the same object. If we make operations on different objects, +they might or might not be to the same cache bank. + + + +REFERENCES + +"Intel Architecture Optimization Manual", 1997, order number 242816. This +is mostly about P5, the parts about P6 aren't relevant. Available on-line: + + http://download.intel.com/design/PentiumII/manuals/242816.htm + + + +---------------- +Local variables: +mode: text +fill-column: 76 +End: diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm new file mode 100644 index 0000000000..a61082a456 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aors_n.asm @@ -0,0 +1,196 @@ +dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction. +dnl +dnl P5: 2.375 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. 
+dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +ifdef(`OPERATION_add_n',` + define(M4_inst, adcl) + define(M4_function_n, mpn_add_n) + define(M4_function_nc, mpn_add_nc) + +',`ifdef(`OPERATION_sub_n',` + define(M4_inst, sbbl) + define(M4_function_n, mpn_sub_n) + define(M4_function_nc, mpn_sub_nc) + +',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n +')')') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + + +C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size); +C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, +C mp_size_t size, mp_limb_t carry); + +defframe(PARAM_CARRY,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC2, 12) +defframe(PARAM_SRC1, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(M4_function_nc) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(endgo) + + pushl %edx +FRAME_pushl() + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,oop) + +L(endgo): +deflit(`FRAME',16) + movl PARAM_CARRY,%eax + shrl $1,%eax C shift bit 0 into carry + jmp LF(M4_function_n,end) + +EPILOGUE() + + + ALIGN(8) +PROLOGUE(M4_function_n) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC1,%esi + movl PARAM_SRC2,%ebp + movl PARAM_SIZE,%ecx + + movl (%ebp),%ebx + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx C zero carry flag + jz L(end) + pushl %edx +FRAME_pushl() + + ALIGN(8) +L(oop): movl 28(%edi),%eax C fetch destination cache line + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + M4_inst %ebx,%edx + movl 8(%ebp),%ebx + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + M4_inst %ebx,%eax + movl 12(%ebp),%ebx + M4_inst %ebx,%edx + movl 16(%ebp),%ebx + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + M4_inst %ebx,%eax + movl 20(%ebp),%ebx + M4_inst %ebx,%edx + movl 24(%ebp),%ebx + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + M4_inst %ebx,%eax + movl 28(%ebp),%ebx + M4_inst %ebx,%edx + movl 32(%ebp),%ebx + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebp),%ebp + decl %ecx + jnz L(oop) + + popl %edx +FRAME_popl() +L(end): + decl %edx C test %edx w/o clobbering carry + js L(end2) + incl %edx 
+L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + M4_inst %ebx,%eax + movl 4(%ebp),%ebx + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebp),%ebp + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + M4_inst %ebx,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm new file mode 100644 index 0000000000..147b55610f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm @@ -0,0 +1,99 @@ +dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication. +dnl +dnl P5: 14.0 cycles/limb + + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. */ + + +include(`../config.m4') + + +ifdef(`OPERATION_addmul_1', ` + define(M4_inst, addl) + define(M4_function_1, mpn_addmul_1) + +',`ifdef(`OPERATION_submul_1', ` + define(M4_inst, subl) + define(M4_function_1, mpn_submul_1) + +',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 +')')') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + + +C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t mult); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) + +PROLOGUE(M4_function_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + M4_inst %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm new file mode 100644 index 0000000000..dbac81642f --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/diveby3.asm @@ -0,0 +1,183 @@ +dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder. +dnl +dnl P5: 15.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t carry); + +defframe(PARAM_CARRY,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +dnl multiplicative inverse of 3, modulo 2^32 +deflit(INVERSE_3, 0xAAAAAAAB) + +dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32 +deflit(ONE_THIRD_CEIL, 0x55555556) +deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB) +deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA) + + .text + ALIGN(8) + +PROLOGUE(mpn_divexact_by3c) +deflit(`FRAME',0) + + movl PARAM_SRC, %ecx + movl PARAM_SIZE, %edx + + decl %edx + jnz L(two_or_more) + + movl (%ecx), %edx + movl PARAM_CARRY, %eax C risk of cache bank clash here + + movl PARAM_DST, %ecx + subl %eax, %edx + + sbbl %eax, %eax C 0 or -1 + + imull $INVERSE_3, %edx, %edx + + negl %eax C 0 or 1 + cmpl $ONE_THIRD_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %eax C +1 if edx>=ceil(b*2/3) + movl %edx, (%ecx) + + ret + + +L(two_or_more): + C eax + C ebx + C ecx src + C edx size-1 + C esi + C edi + C ebp + + pushl %ebx FRAME_pushl() + pushl %esi FRAME_pushl() + + pushl %edi FRAME_pushl() + pushl %ebp FRAME_pushl() + + movl PARAM_DST, %edi + movl PARAM_CARRY, %esi + + movl (%ecx), %eax C src low limb + xorl %ebx, %ebx + + sub %esi, %eax + movl $TWO_THIRDS_FLOOR, %esi + + leal (%ecx,%edx,4), %ecx C &src[size-1] + leal (%edi,%edx,4), %edi C &dst[size-1] + + adcl $0, %ebx C carry, 0 or 1 + negl %edx C -(size-1) + + +C The loop needs a source limb ready at the top, which leads to one limb +C handled separately at the end, and the special case above for size==1. +C There doesn't seem to be any scheduling that would keep the speed but move +C the source load and carry subtract up to the top. +C +C The destination cache line prefetching adds 1 cycle to the loop but is +C considered worthwhile. The slowdown is a factor of 1.07, but will prevent +C repeated write-throughs if the destination isn't in L1. A version using +C an outer loop to prefetch only every 8 limbs (a cache line) proved to be +C no faster, due to unavoidable branch mispreditions in the inner loop. +C +C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx +C could be avoided then the src limb fetch could pair up and save a cycle. +C This would probably mean going to a two limb loop with the carry limb +C alternately positive or negative, since an sbbl %ebx,%ebx will leave a +C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax. +C +C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as +C "cmpl %edx, $n" with the immediate as the second operand. +C +C The "4" source displacement is in the loop rather than the setup because +C this gets L(top) aligned to 8 bytes at no cost. 
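The method itself, in plain C (a sketch of what the generic diveby3 code amounts to; the loop below restructures it for pairing but computes the same values, and 32-bit limbs are assumed):

/* Exact division by 3: q = (s - c)*INVERSE_3 mod 2^32 is the quotient
   limb, and the outgoing carry is the borrow from the subtract plus one
   for each of ONE_THIRD_CEIL, TWO_THIRDS_CEIL that q reaches.  The
   function name is illustrative.  */
static mp_limb_t
divexact_by3c_model (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
{
  mp_size_t i;
  for (i = 0; i < size; i++)
    {
      mp_limb_t s = src[i];
      mp_limb_t q = s - c;
      c = (s < c);                     /* borrow */
      q *= 0xAAAAAAAB;                 /* INVERSE_3 */
      dst[i] = q;
      c += (q >= 0x55555556);          /* ONE_THIRD_CEIL */
      c += (q >= 0xAAAAAAAB);          /* TWO_THIRDS_CEIL */
    }
  return c;
}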
+ + ALIGN(8) +L(top): + C eax source limb, carry subtracted + C ebx carry (0 or 1) + C ecx &src[size-1] + C edx counter, limbs, negative + C esi TWO_THIRDS_FLOOR + C edi &dst[size-1] + C ebp scratch (result limb) + + imull $INVERSE_3, %eax, %ebp + + cmpl $ONE_THIRD_CEIL, %ebp + movl (%edi,%edx,4), %eax C dst cache line prefetch + + sbbl $-1, %ebx C +1 if ebp>=ceil(b/3) + cmpl %ebp, %esi + + movl 4(%ecx,%edx,4), %eax C next src limb + + sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3) + movl $0, %ebx + + adcl $0, %ebx C new carry + movl %ebp, (%edi,%edx,4) + + incl %edx + jnz L(top) + + + + imull $INVERSE_3, %eax, %edx + + cmpl $ONE_THIRD_CEIL, %edx + movl %edx, (%edi) + + sbbl $-1, %ebx C +1 if edx>=ceil(b/3) + cmpl $TWO_THIRDS_CEIL, %edx + + sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3) + popl %ebp + + movl %ebx, %eax + popl %edi + + popl %esi + popl %ebx + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h new file mode 100644 index 0000000000..d3ed3d73ce --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. 
*/ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 179 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 153 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 46 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 110 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 4 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 3840 +#endif diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm new file mode 100644 index 0000000000..e1e35d4c57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/lshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_lshift -- mpn left shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_lshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. 
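The shift-by-1 idea, overlap conditions aside, is to double the limbs from the least significant end with add-with-carry instead of shldl.  As a C fragment (illustrative only; each dst[i] = 2*src[i] + carry below becomes an addl/adcl pair):

cy = 0;
for (i = 0; i < size; i++)
  {
    mp_limb_t s = src[i];
    dst[i] = (s << 1) | cy;
    cy = s >> 31;
  }
return cy;                             /* the bit shifted out at the top */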
+ cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) C jump if s_ptr + 1 >= res_ptr + leal (%esi,%ebp,4),%eax + cmpl %eax,%edi + jnc L(special) C jump if res_ptr >= s_ptr + size + +L(normal): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl( %cl, %ebx, %edx) + shldl( %cl, %eax, %ebx) + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl( %cl, %edx, %eax) + shldl( %cl, %ebx, %edx) + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl( %cl, %eax, %ebx) + shldl( %cl, %edx, %eax) + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl( %cl,%eax,%edx) + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shll %cl,%edx C compute least significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. + +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi C use leal not to clobber carry + leal 32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi C use leal not to clobber carry + leal 4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h new file mode 100644 index 0000000000..2379077d0c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h @@ -0,0 +1,97 @@ +/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file. 
+ +Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + + +#ifndef UMUL_TIME +#define UMUL_TIME 9 /* cycles */ +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 41 /* cycles */ +#endif + +/* bsf takes 18-42 cycles, put an average for uniform random numbers */ +#ifndef COUNT_TRAILING_ZEROS_TIME +#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */ +#endif + + +/* Generated by tuneup.c, 2000-07-06. */ + +#ifndef KARATSUBA_MUL_THRESHOLD +#define KARATSUBA_MUL_THRESHOLD 14 +#endif +#ifndef TOOM3_MUL_THRESHOLD +#define TOOM3_MUL_THRESHOLD 99 +#endif + +#ifndef KARATSUBA_SQR_THRESHOLD +#define KARATSUBA_SQR_THRESHOLD 22 +#endif +#ifndef TOOM3_SQR_THRESHOLD +#define TOOM3_SQR_THRESHOLD 89 +#endif + +#ifndef BZ_THRESHOLD +#define BZ_THRESHOLD 40 +#endif + +#ifndef FIB_THRESHOLD +#define FIB_THRESHOLD 98 +#endif + +#ifndef POWM_THRESHOLD +#define POWM_THRESHOLD 13 +#endif + +#ifndef GCD_ACCEL_THRESHOLD +#define GCD_ACCEL_THRESHOLD 5 +#endif +#ifndef GCDEXT_THRESHOLD +#define GCDEXT_THRESHOLD 25 +#endif + +#ifndef FFT_MUL_TABLE +#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_MUL_THRESHOLD +#define FFT_MODF_MUL_THRESHOLD 512 +#endif +#ifndef FFT_MUL_THRESHOLD +#define FFT_MUL_THRESHOLD 3840 +#endif + +#ifndef FFT_SQR_TABLE +#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 } +#endif +#ifndef FFT_MODF_SQR_THRESHOLD +#define FFT_MODF_SQR_THRESHOLD 512 +#endif +#ifndef FFT_SQR_THRESHOLD +#define FFT_SQR_THRESHOLD 4352 +#endif diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm new file mode 100644 index 0000000000..2225438658 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm @@ -0,0 +1,455 @@ +dnl Intel P5 mpn_lshift -- mpn left shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. 
+dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size left by shift many bits and store the result in dst,size. +C Zeros are shifted in at the right. Return the bits shifted out at the +C left. +C +C The comments in mpn_rshift apply here too. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl minimum 5, because the unrolled loop can't handle less +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_lshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + movl -4(%ebx,%eax,4), %edi C src high limb + decl %eax + + jnz L(simple) + + shldl( %cl, %edi, %eax) C eax was decremented to zero + + shll %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx,%eax,4), %mm5 C src high limb + + movd %ecx, %mm6 C lshift + negl %ecx + + psllq %mm6, %mm5 + addl $32, %ecx + + movd %ecx, %mm7 + psrlq $32, %mm5 C retval + + +L(simple_top): + C eax counter, limbs, negative + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 scratch + C mm5 return value + C mm6 shift + C mm7 32-shift + + movq -4(%ebx,%eax,4), %mm0 + decl %eax + + psrlq %mm7, %mm0 + + C + + movd %mm0, 4(%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + + movd %mm5, %eax + psllq %mm6, %mm0 + + popl %edi + popl %ebx + + movd %mm0, (%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd -4(%ebx,%eax,4), %mm5 C src high limb + leal (%ebx,%eax,4), %edi + + movd %ecx, %mm6 C lshift + andl $4, %edi + + psllq %mm6, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process high limb separately (marked xxx) to + C make it so. + C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-------+-- + C | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-- + C | xxx | | + C +-------+-------+-- + + movq -8(%ebx,%eax,4), %mm0 C unaligned load + + psllq %mm6, %mm0 + decl %eax + + psrlq $32, %mm0 + + C + + movd %mm0, (%edx,%eax,4) +L(start_src_aligned): + + movq -8(%ebx,%eax,4), %mm1 C src high qword + leal (%edx,%eax,4), %edi + + andl $4, %edi + psrlq $32, %mm5 C return value + + movq -16(%ebx,%eax,4), %mm3 C src second highest qword + jz L(start_dst_aligned) + + C dst isn't aligned, subtract 4 to make it so, and pretend the shift + C is 32 bits extra. High limb of dst (marked xxx) handled here + C separately. 
+ C + C source -8(ebx,%eax,4) + C | + C +-------+-------+-- + C | mm1 | + C +-------+-------+-- + C 0mod8 4mod8 + C + C dest + C -4(edx,%eax,4) + C | + C +-------+-------+-------+-- + C | xxx | | + C +-------+-------+-------+-- + C 0mod8 4mod8 0mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psllq %mm6, %mm0 + + movd %ecx, %mm6 + psrlq $32, %mm0 + + C wasted cycle here waiting for %mm0 + + movd %mm0, -4(%edx,%eax,4) + subl $4, %edx +L(start_dst_aligned): + + + psllq %mm6, %mm1 + negl %ecx C -shift + + addl $64, %ecx C 64-shift + movq %mm3, %mm2 + + movd %ecx, %mm7 + subl $8, %eax C size-8 + + psrlq %mm7, %mm3 + + por %mm1, %mm3 C mm3 ready to store + jc L(finish) + + + C The comments in mpn_rshift apply here too. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs + C ebx src + C ecx + C edx dst + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from 48(%ebx,%eax,4) + C mm3 dst qword ready to store to 56(%edx,%eax,4) + C + C mm5 return value + C mm6 lshift + C mm7 rshift + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq (%ebx,%eax,4), %mm3 C + psllq %mm6, %mm1 C + + movq %mm0, 16(%edx,%eax,4) + movq %mm3, %mm2 C + + psrlq %mm7, %mm3 C + subl $4, %eax + + por %mm1, %mm3 C + jnc L(unroll_loop) + + + +L(finish): + C eax -4 to -1 representing respectively 0 to 3 limbs remaining + + testb $2, %al + + jz L(finish_no_two) + + movq 8(%ebx,%eax,4), %mm0 + psllq %mm6, %mm2 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + movq %mm3, 24(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + subl $2, %eax +L(finish_no_two): + + + C eax -4 or -3 representing respectively 0 or 1 limbs remaining + C + C mm2 src prev qword, from 48(%ebx,%eax,4) + C mm3 dst qword, for 56(%edx,%eax,4) + + testb $1, %al + movd %mm5, %eax C retval + + popl %edi + jz L(finish_zero) + + + C One extra src limb, destination was aligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 edx + C --+---------------+---------------+-------+ + C | mm3 | | | + C --+---------------+---------------+-------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra src limb, destination was unaligned. + C + C source ebx + C --+---------------+-------+ + C | mm2 | | + C --+---------------+-------+ + C + C dest edx+12 edx+4 + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 4(%edx), and in the aligned case + C there's an extra limb of dst to be formed from that extra src limb + C left shifted. + + + movd (%ebx), %mm0 + psllq %mm6, %mm2 + + movq %mm3, 12(%edx) + psllq $32, %mm0 + + movq %mm0, %mm1 + psrlq %mm7, %mm0 + + por %mm2, %mm0 + psllq %mm6, %mm1 + + movq %mm0, 4(%edx) + psrlq $32, %mm1 + + andl $32, %ecx + popl %ebx + + jz L(finish_one_unaligned) + + movd %mm1, (%edx) +L(finish_one_unaligned): + + emms + + ret + + +L(finish_zero): + + C No extra src limbs, destination was aligned. + C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx + C --+---------------+---------------+ + C | mm3 | | + C --+---------------+---------------+ + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra src limbs, destination was unaligned. 
+ C + C source ebx + C --+---------------+ + C | mm2 | + C --+---------------+ + C + C dest edx+8 edx+4 + C --+---------------+-------+ + C | mm3 | | + C --+---------------+-------+ + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C The movd for the unaligned case writes the same data to 4(%edx) + C that the movq does for the aligned case. + + + movq %mm3, 8(%edx) + andl $32, %ecx + + psllq %mm6, %mm2 + jz L(finish_zero_unaligned) + + movq %mm2, (%edx) +L(finish_zero_unaligned): + + psrlq $32, %mm2 + popl %ebx + + movd %mm5, %eax C retval + + movd %mm2, 4(%edx) + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm new file mode 100644 index 0000000000..587a07ab3d --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm @@ -0,0 +1,30 @@ +dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming +dnl distance. +dnl +dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) +include_mpn(`x86/k6/mmx/popham.asm') diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm new file mode 100644 index 0000000000..7672630d57 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm @@ -0,0 +1,460 @@ +dnl Intel P5 mpn_rshift -- mpn right shift. +dnl +dnl P5: 1.75 cycles/limb. + + +dnl Copyright (C) 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C Shift src,size right by shift many bits and store the result in dst,size. +C Zeros are shifted in at the left. Return the bits shifted out at the +C right. 
+C +C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb, +C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l. +C +C Full speed depends on source and destination being aligned. Unaligned mmx +C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy +C setups and finish-ups are done to ensure alignment for the loop. +C +C MMX shifts work out a bit faster even for the simple loop. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) +deflit(`FRAME',0) + +dnl Minimum 5, because the unrolled loop can't handle less. +deflit(UNROLL_THRESHOLD, 5) + + .text + ALIGN(8) + +PROLOGUE(mpn_rshift) + + pushl %ebx + pushl %edi +deflit(`FRAME',8) + + movl PARAM_SIZE, %eax + movl PARAM_DST, %edx + + movl PARAM_SRC, %ebx + movl PARAM_SHIFT, %ecx + + cmp $UNROLL_THRESHOLD, %eax + jae L(unroll) + + decl %eax + movl (%ebx), %edi C src low limb + + jnz L(simple) + + shrdl( %cl, %edi, %eax) C eax was decremented to zero + + shrl %cl, %edi + + movl %edi, (%edx) C dst low limb + popl %edi C risk of data cache bank clash + + popl %ebx + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(simple): + C eax size-1 + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + leal (%ebx,%eax,4), %ebx C &src[size-1] + + movd %ecx, %mm6 C rshift + leal -4(%edx,%eax,4), %edx C &dst[size-2] + + psllq $32, %mm5 + negl %eax + + +C This loop is 5 or 8 cycles, with every second load unaligned and a wasted +C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4 +C cycles and would be 8 in a simple loop. Using mmx helps the return value +C and last limb calculations too. + +L(simple_top): + C eax counter, limbs, negative + C ebx &src[size-1] + C ecx return value + C edx &dst[size-2] + C + C mm0 scratch + C mm5 return value + C mm6 shift + + movq (%ebx,%eax,4), %mm0 + incl %eax + + psrlq %mm6, %mm0 + + movd %mm0, (%edx,%eax,4) + jnz L(simple_top) + + + movd (%ebx), %mm0 + psrlq %mm6, %mm5 C return value + + psrlq %mm6, %mm0 + popl %edi + + movd %mm5, %eax + popl %ebx + + movd %mm0, 4(%edx) + + emms + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(unroll): + C eax size + C ebx src + C ecx shift + C edx dst + C esi + C edi + C ebp +deflit(`FRAME',8) + + movd (%ebx), %mm5 C src[0] + movl $4, %edi + + movd %ecx, %mm6 C rshift + testl %edi, %ebx + + psllq $32, %mm5 + jz L(start_src_aligned) + + + C src isn't aligned, process low limb separately (marked xxx) and + C step src and dst by one limb, making src aligned. + C + C source ebx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + C + C dest edx + C --+-------+-------+ + C | | xxx | + C --+-------+-------+ + + movq (%ebx), %mm0 C unaligned load + + psrlq %mm6, %mm0 + addl $4, %ebx + + decl %eax + + movd %mm0, (%edx) + addl $4, %edx +L(start_src_aligned): + + + movq (%ebx), %mm1 + testl %edi, %edx + + psrlq %mm6, %mm5 C retval + jz L(start_dst_aligned) + + C dst isn't aligned, add 4 to make it so, and pretend the shift is + C 32 bits extra. Low limb of dst (marked xxx) handled here + C separately. 
+ C + C source ebx + C --+-------+-------+ + C | mm1 | + C --+-------+-------+ + C 4mod8 0mod8 + C + C dest edx + C --+-------+-------+-------+ + C | xxx | + C --+-------+-------+-------+ + C 4mod8 0mod8 4mod8 + + movq %mm1, %mm0 + addl $32, %ecx C new shift + + psrlq %mm6, %mm0 + + movd %ecx, %mm6 + + movd %mm0, (%edx) + addl $4, %edx +L(start_dst_aligned): + + + movq 8(%ebx), %mm3 + negl %ecx + + movq %mm3, %mm2 C mm2 src qword + addl $64, %ecx + + movd %ecx, %mm7 + psrlq %mm6, %mm1 + + leal -12(%ebx,%eax,4), %ebx + leal -20(%edx,%eax,4), %edx + + psllq %mm7, %mm3 + subl $7, %eax C size-7 + + por %mm1, %mm3 C mm3 ready to store + negl %eax C -(size-7) + + jns L(finish) + + + C This loop is the important bit, the rest is just support. Careful + C instruction scheduling achieves the claimed 1.75 c/l. The + C relevant parts of the pairing rules are: + C + C - mmx loads and stores execute only in the U pipe + C - only one mmx shift in a pair + C - wait one cycle before storing an mmx register result + C - the usual address generation interlock + C + C Two qword calculations are slightly interleaved. The instructions + C marked "C" belong to the second qword, and the "C prev" one is for + C the second qword from the previous iteration. + + ALIGN(8) +L(unroll_loop): + C eax counter, limbs, negative + C ebx &src[size-12] + C ecx + C edx &dst[size-12] + C esi + C edi + C + C mm0 + C mm1 + C mm2 src qword from -8(%ebx,%eax,4) + C mm3 dst qword ready to store to -8(%edx,%eax,4) + C + C mm5 return value + C mm6 rshift + C mm7 lshift + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq 8(%ebx,%eax,4), %mm3 C + psrlq %mm6, %mm1 C + + movq %mm0, (%edx,%eax,4) + movq %mm3, %mm2 C + + psllq %mm7, %mm3 C + addl $4, %eax + + por %mm1, %mm3 C + js L(unroll_loop) + + +L(finish): + C eax 0 to 3 representing respectively 3 to 0 limbs remaining + + testb $2, %al + + jnz L(finish_no_two) + + movq (%ebx,%eax,4), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, -8(%edx,%eax,4) C prev + por %mm2, %mm0 + + movq %mm1, %mm2 + movq %mm0, %mm3 + + addl $2, %eax +L(finish_no_two): + + + C eax 2 or 3 representing respectively 1 or 0 limbs remaining + C + C mm2 src prev qword, from -8(%ebx,%eax,4) + C mm3 dst qword, for -8(%edx,%eax,4) + + testb $1, %al + popl %edi + + movd %mm5, %eax C retval + jnz L(finish_zero) + + + C One extra limb, destination was aligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +-------+---------------+---------------+-- + C | | | mm3 | + C +-------+---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C One extra limb, destination was unaligned. + C + C source ebx + C +-------+---------------+-- + C | | mm2 | + C +-------+---------------+-- + C + C dest edx + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift+32 + C mm7 = ecx = 64-(shift+32) + + + C In both cases there's one extra limb of src to fetch and combine + C with mm2 to make a qword at 8(%edx), and in the aligned case + C there's a further extra limb of dst to be formed. 
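Expressed in C, the qword combine that both the unrolled loop above and these finish-ups are built from looks roughly like the sketch below. The helper name, the uint32_t/uint64_t types and the 32-bit limb size are my assumptions for illustration, not GMP's code.

    #include <stdint.h>

    /* Rough model of one MMX step of mpn_rshift, valid for 1 <= shift <= 31:
       two limbs are treated as a 64-bit qword, shifted right, and the bits
       vacated at the top are supplied from the next qword (psrlq/psllq/por).
       The loop guarantees that src[2..3], the next qword, are available. */
    static void rshift_qword_model(uint32_t *dst, const uint32_t *src,
                                   unsigned shift)
    {
        uint64_t lo  = ((uint64_t) src[1] << 32) | src[0];
        uint64_t hi  = ((uint64_t) src[3] << 32) | src[2];
        uint64_t out = (lo >> shift) | (hi << (64 - shift));
        dst[0] = (uint32_t) out;
        dst[1] = (uint32_t) (out >> 32);
    }

In this model the value handed back through %mm5 and %eax is src[0] << (32-shift), the bits shifted out of the least significant limb.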
+ + + movd 8(%ebx), %mm0 + psrlq %mm6, %mm2 + + movq %mm0, %mm1 + psllq %mm7, %mm0 + + movq %mm3, (%edx) + por %mm2, %mm0 + + psrlq %mm6, %mm1 + andl $32, %ecx + + popl %ebx + jz L(finish_one_unaligned) + + C dst was aligned, must store one extra limb + movd %mm1, 16(%edx) +L(finish_one_unaligned): + + movq %mm0, 8(%edx) + + emms + + ret + + +L(finish_zero): + + C No extra limbs, destination was aligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +---------------+---------------+-- + C | | mm3 | + C +---------------+---------------+-- + C + C mm6 = shift + C mm7 = ecx = 64-shift + + + C No extra limbs, destination was unaligned. + C + C source ebx + C +---------------+-- + C | mm2 | + C +---------------+-- + C + C dest edx+4 + C +-------+---------------+-- + C | | mm3 | + C +-------+---------------+-- + C + C mm6 = shift+32 + C mm7 = 64-(shift+32) + + + C The movd for the unaligned case is clearly the same data as the + C movq for the aligned case, it's just a choice between whether one + C or two limbs should be written. + + + movq %mm3, 4(%edx) + psrlq %mm6, %mm2 + + movd %mm2, 12(%edx) + andl $32, %ecx + + popl %ebx + jz L(finish_zero_unaligned) + + movq %mm2, 12(%edx) +L(finish_zero_unaligned): + + emms + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm new file mode 100644 index 0000000000..08639eca09 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_1.asm @@ -0,0 +1,79 @@ +dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication. +dnl +dnl P5: 13.0 cycles/limb + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
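mpn_mul_1 multiplies the source vector by a single limb, stores the low limbs of the products and returns the final carry limb. The following is a rough C model of that contract (my sketch, assuming GMP's 32-bit limbs on x86, not the shipped code):

    #include <stdint.h>

    /* Sketch of mpn_mul_1: dst[i] gets the low half of src[i]*multiplier
       plus the incoming carry; the high half carries into the next limb,
       and the last carry is the return value. */
    static uint32_t mul_1_model(uint32_t *dst, const uint32_t *src,
                                long size, uint32_t multiplier)
    {
        uint32_t carry = 0;
        for (long i = 0; i < size; i++) {
            uint64_t p = (uint64_t) src[i] * multiplier + carry;
            dst[i] = (uint32_t) p;
            carry  = (uint32_t) (p >> 32);
        }
        return carry;
    }

In the assembler below the 64-bit product comes from mull, the low-half store and the carry handoff are split between %ebx and %edx, and the carry flag left by the addl is folded back in by the adcl $0 at the top of the loop.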
*/ + + +include(`../config.m4') + + +C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t multiplier); + +defframe(PARAM_MULTIPLIER,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_1) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST, %edi + movl PARAM_SRC, %esi + movl PARAM_SIZE, %ecx + movl PARAM_MULTIPLIER, %ebp + + leal (%edi,%ecx,4), %edi + leal (%esi,%ecx,4), %esi + negl %ecx + xorl %ebx, %ebx + ALIGN(8) + +L(oop): adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull %ebp + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(oop) + + adcl $0, %ebx + movl %ebx, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm new file mode 100644 index 0000000000..d9f79a0831 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm @@ -0,0 +1,135 @@ +dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication. +dnl +dnl P5: 14.2 cycles/crossproduct (approx) + + +dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
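mpn_mul_basecase is the schoolbook product: one mul_1-style pass for yp[0] writes the first row, then one addmul-style pass per remaining yp limb accumulates into the result at the next offset, which is exactly the L(oop1) / L(outer) / L(oop2) structure below. A hedged C sketch of the same shape (function and variable names are mine; wp must hold xsize+ysize limbs):

    #include <stdint.h>

    /* Schoolbook multiplication sketch, 32-bit limbs assumed; not GMP's code. */
    static void mul_basecase_model(uint32_t *wp,
                                   const uint32_t *xp, long xsize,
                                   const uint32_t *yp, long ysize)
    {
        uint32_t carry = 0;

        /* First row: wp[0..xsize] = xp * yp[0]. */
        for (long j = 0; j < xsize; j++) {
            uint64_t p = (uint64_t) xp[j] * yp[0] + carry;
            wp[j] = (uint32_t) p;
            carry = (uint32_t) (p >> 32);
        }
        wp[xsize] = carry;

        /* Remaining rows: add xp * yp[i] into wp at offset i. */
        for (long i = 1; i < ysize; i++) {
            carry = 0;
            for (long j = 0; j < xsize; j++) {
                uint64_t p = (uint64_t) xp[j] * yp[i] + wp[i + j] + carry;
                wp[i + j] = (uint32_t) p;
                carry = (uint32_t) (p >> 32);
            }
            wp[i + xsize] = carry;
        }
    }

The assembler relies on xsize >= ysize >= 1, as the "If xsize = 1, ysize = 1 too" comment at the single-limb early exit indicates.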
+ + +include(`../config.m4') + + +C void mpn_mul_basecase (mp_ptr wp, +C mp_srcptr xp, mp_size_t xsize, +C mp_srcptr yp, mp_size_t ysize); + +defframe(PARAM_YSIZE, 20) +defframe(PARAM_YP, 16) +defframe(PARAM_XSIZE, 12) +defframe(PARAM_XP, 8) +defframe(PARAM_WP, 4) + +defframe(VAR_COUNTER, -4) + + .text + ALIGN(8) +PROLOGUE(mpn_mul_basecase) + + pushl %eax C dummy push for allocating stack slot + pushl %esi + pushl %ebp + pushl %edi +deflit(`FRAME',16) + + movl PARAM_XP,%esi + movl PARAM_WP,%edi + movl PARAM_YP,%ebp + + movl (%esi),%eax C load xp[0] + mull (%ebp) C multiply by yp[0] + movl %eax,(%edi) C store to wp[0] + movl PARAM_XSIZE,%ecx C xsize + decl %ecx C If xsize = 1, ysize = 1 too + jz L(done) + + movl PARAM_XSIZE,%eax + pushl %ebx +FRAME_pushl() + movl %edx,%ebx + leal (%esi,%eax,4),%esi C make xp point at end + leal (%edi,%eax,4),%edi C offset wp by xsize + negl %ecx C negate j size/index for inner loop + xorl %eax,%eax C clear carry + + ALIGN(8) +L(oop1): adcl $0,%ebx + movl (%esi,%ecx,4),%eax C load next limb at xp[j] + mull (%ebp) + addl %ebx,%eax + movl %eax,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop1) + + adcl $0,%ebx + movl PARAM_YSIZE,%eax + movl %ebx,(%edi) C most significant limb of product + addl $4,%edi C increment wp + decl %eax + jz L(skip) + movl %eax,VAR_COUNTER C set index i to ysize + +L(outer): + addl $4,%ebp C make ebp point to next y limb + movl PARAM_XSIZE,%ecx + negl %ecx + xorl %ebx,%ebx + + C code at 0x61 here, close enough to aligned +L(oop2): + adcl $0,%ebx + movl (%esi,%ecx,4),%eax + mull (%ebp) + addl %ebx,%eax + movl (%edi,%ecx,4),%ebx + adcl $0,%edx + addl %eax,%ebx + movl %ebx,(%edi,%ecx,4) + incl %ecx + movl %edx,%ebx + jnz L(oop2) + + adcl $0,%ebx + + movl %ebx,(%edi) + addl $4,%edi + movl VAR_COUNTER,%eax + decl %eax + movl %eax,VAR_COUNTER + jnz L(outer) + +L(skip): + popl %ebx + popl %edi + popl %ebp + popl %esi + addl $4,%esp + ret + +L(done): + movl %edx,4(%edi) C store to wp[1] + popl %edi + popl %ebp + popl %esi + popl %eax C dummy pop for deallocating stack slot + ret + +EPILOGUE() + diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm new file mode 100644 index 0000000000..e8f5ae8ec8 --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/rshift.asm @@ -0,0 +1,236 @@ +dnl Intel Pentium mpn_rshift -- mpn right shift. +dnl +dnl cycles/limb +dnl P5,P54: 6.0 +dnl P55: 5.375 + + +dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software +dnl Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
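Whatever the implementation, mpn_rshift has the same limb-level contract: each destination limb combines a source limb with bits from the next higher limb (one shrdl in the loops below), zeros enter at the top, and the bits dropped from the bottom limb are returned left-aligned in a limb. A C sketch of that contract (mine, 32-bit limbs assumed):

    #include <stdint.h>

    /* Sketch of the mpn_rshift contract, 1 <= shift <= 31, size >= 1. */
    static uint32_t rshift_model(uint32_t *dst, const uint32_t *src,
                                 long size, unsigned shift)
    {
        uint32_t retval = src[0] << (32 - shift);        /* bits shifted out */
        for (long i = 0; i < size - 1; i++)              /* one shrdl each   */
            dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
        dst[size - 1] = src[size - 1] >> shift;          /* zeros shifted in */
        return retval;
    }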
+ + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); +C +C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does, +C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere. + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp +deflit(`FRAME',16) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%ebp + movl PARAM_SHIFT,%ecx + +C We can use faster code for shift-by-1 under certain conditions. + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) C jump if res_ptr + 1 >= s_ptr + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc L(special) C jump if s_ptr >= res_ptr + size + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl( %cl, %edx, %eax) C compute carry limb + pushl %eax C push carry limb onto stack + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz L(end) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(oop): movl 28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl( %cl, %ebx, %edx) + shrdl( %cl, %eax, %ebx) + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl( %cl, %edx, %eax) + shrdl( %cl, %ebx, %edx) + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl( %cl, %eax, %ebx) + shrdl( %cl, %edx, %eax) + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz L(oop) + +L(end): popl %ebp + andl $7,%ebp + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl( %cl,%eax,%edx) C compute result limb + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz L(oop2) + +L(end2): + shrl %cl,%edx C compute most significant limb + movl %edx,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + +C We loop from least significant end of the arrays, which is only +C permissable if the source and destination don't overlap, since the +C function is documented to work for overlapping source and destination. 
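In C terms, the test at the top of the routine that chooses between L(normal) and this L(special) path is roughly the following; the names mirror the res_ptr/s_ptr of the comments, and this is only a sketch of the two jnc tests above, expressed as raw address comparisons the way the assembler does them:

    #include <stdint.h>

    /* Sketch of the shift-by-1 dispatch: the special path is taken only for
       shift == 1 and only when one of the two address comparisons coded with
       jnc above holds (see the "jump if ..." comments). */
    static int use_special_path(const uint32_t *res_ptr, const uint32_t *s_ptr,
                                long size, unsigned shift)
    {
        return shift == 1
            && (res_ptr + 1 >= s_ptr            /* res_ptr + 1 >= s_ptr    */
                || s_ptr >= res_ptr + size);    /* s_ptr >= res_ptr + size */
    }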
+ +L(special): + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl %edx + incl %ebp + decl %ebp + jz L(Lend) + + movl (%edi),%eax C fetch destination cache line + + ALIGN(4) +L(Loop): + movl -28(%edi),%eax C fetch destination cache line + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl %eax + movl %ebx,(%edi) + rcrl %edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl %ebx + movl %edx,-8(%edi) + rcrl %eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl %edx + movl %eax,-16(%edi) + rcrl %ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl %eax + movl %ebx,-24(%edi) + rcrl %edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi C use leal not to clobber carry + leal -32(%edi),%edi + decl %ebp + jnz L(Loop) + +L(Lend): + popl %ebp + sbbl %eax,%eax C save carry in %eax + andl $7,%ebp + jz L(Lend2) + addl %eax,%eax C restore carry from eax +L(Loop2): + movl %edx,%ebx + movl (%esi),%edx + rcrl %edx + movl %ebx,(%edi) + + leal -4(%esi),%esi C use leal not to clobber carry + leal -4(%edi),%edi + decl %ebp + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax C restore carry from eax +L(L1): movl %edx,(%edi) C store last limb + + movl $0,%eax + rcrl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm new file mode 100644 index 0000000000..c8584df13c --- /dev/null +++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm @@ -0,0 +1,520 @@ +dnl Intel P5 mpn_sqr_basecase -- square an mpn number. +dnl +dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular +dnl product at around 20x20 limbs. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); +C +C Calculate src,size squared, storing the result in dst,2*size. +C +C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a +C lot of function call overheads are avoided, especially when the size is +C small. 
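As the header comment says, the structure mirrors mpn/generic/sqr_basecase.c: form the off-diagonal products src[i]*src[j] (i < j) once, double them, then add the squares src[i]^2 along the diagonal. A compact C sketch of that shape (my code, 32-bit limbs assumed; the real routine writes the first row with a plain mul_1 pass rather than accumulating into zeroed memory, but the result is the same):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: dst must hold 2*n limbs, n >= 1.  Not GMP's code. */
    static void sqr_basecase_model(uint32_t *dst, const uint32_t *src, long n)
    {
        memset(dst, 0, 2 * n * sizeof *dst);

        /* Off-diagonal products, each pair counted once. */
        for (long i = 0; i < n - 1; i++) {
            uint32_t carry = 0;
            for (long j = i + 1; j < n; j++) {
                uint64_t p = (uint64_t) src[i] * src[j] + dst[i + j] + carry;
                dst[i + j] = (uint32_t) p;
                carry = (uint32_t) (p >> 32);
            }
            dst[i + n] = carry;
        }

        /* Double dst[1..2n-2]; the bit shifted out becomes dst[2n-1]. */
        uint32_t bit = 0;
        for (long k = 1; k <= 2 * n - 2; k++) {
            uint32_t next = dst[k] >> 31;
            dst[k] = (dst[k] << 1) | bit;
            bit = next;
        }
        dst[2 * n - 1] = bit;

        /* Diagonal: add the two-limb square src[i]^2 at position 2*i. */
        uint32_t carry = 0;
        for (long i = 0; i < n; i++) {
            uint64_t p = (uint64_t) src[i] * src[i] + dst[2 * i] + carry;
            dst[2 * i] = (uint32_t) p;
            uint64_t q = (uint64_t) dst[2 * i + 1] + (p >> 32);
            dst[2 * i + 1] = (uint32_t) q;
            carry = (uint32_t) (q >> 32);
        }
    }

The doubling pass corresponds to the rcll chain at L(lshift) further down, and the diagonal pass to the L(diag) loop.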
+ +defframe(PARAM_SIZE,12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_sqr_basecase) +deflit(`FRAME',0) + + movl PARAM_SIZE, %edx + movl PARAM_SRC, %eax + + cmpl $2, %edx + movl PARAM_DST, %ecx + + je L(two_limbs) + + movl (%eax), %eax + ja L(three_or_more) + +C ----------------------------------------------------------------------------- +C one limb only + C eax src + C ebx + C ecx dst + C edx + + mull %eax + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + ret + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(two_limbs): + C eax src + C ebx + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + pushl %esi + pushl %ebx + + movl %eax, %ebx + movl (%eax), %eax + + mull %eax C src[0]^2 + + movl %eax, (%ecx) C dst[0] + movl %edx, %esi C dst[1] + + movl 4(%ebx), %eax + + mull %eax C src[1]^2 + + movl %eax, %edi C dst[2] + movl %edx, %ebp C dst[3] + + movl (%ebx), %eax + + mull 4(%ebx) C src[0]*src[1] + + addl %eax, %esi + popl %ebx + + adcl %edx, %edi + + adcl $0, %ebp + addl %esi, %eax + + adcl %edi, %edx + movl %eax, 4(%ecx) + + adcl $0, %ebp + popl %esi + + movl %edx, 8(%ecx) + movl %ebp, 12(%ecx) + + popl %edi + popl %ebp + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(three_or_more): + C eax src low limb + C ebx + C ecx dst + C edx size + + cmpl $4, %edx + pushl %ebx +deflit(`FRAME',4) + + movl PARAM_SRC, %ebx + jae L(four_or_more) + + +C ----------------------------------------------------------------------------- +C three limbs + C eax src low limb + C ebx src + C ecx dst + C edx size + + pushl %ebp + pushl %edi + + mull %eax C src[0] ^ 2 + + movl %eax, (%ecx) + movl %edx, 4(%ecx) + + movl 4(%ebx), %eax + xorl %ebp, %ebp + + mull %eax C src[1] ^ 2 + + movl %eax, 8(%ecx) + movl %edx, 12(%ecx) + + movl 8(%ebx), %eax + pushl %esi C risk of cache bank clash + + mull %eax C src[2] ^ 2 + + movl %eax, 16(%ecx) + movl %edx, 20(%ecx) + + movl (%ebx), %eax + + mull 4(%ebx) C src[0] * src[1] + + movl %eax, %esi + movl %edx, %edi + + movl (%ebx), %eax + + mull 8(%ebx) C src[0] * src[2] + + addl %eax, %edi + movl %edx, %ebp + + adcl $0, %ebp + movl 4(%ebx), %eax + + mull 8(%ebx) C src[1] * src[2] + + xorl %ebx, %ebx + addl %eax, %ebp + + C eax + C ebx zero, will be dst[5] + C ecx dst + C edx dst[4] + C esi dst[1] + C edi dst[2] + C ebp dst[3] + + adcl $0, %edx + addl %esi, %esi + + adcl %edi, %edi + + adcl %ebp, %ebp + + adcl %edx, %edx + movl 4(%ecx), %eax + + adcl $0, %ebx + addl %esi, %eax + + movl %eax, 4(%ecx) + movl 8(%ecx), %eax + + adcl %edi, %eax + movl 12(%ecx), %esi + + adcl %ebp, %esi + movl 16(%ecx), %edi + + movl %eax, 8(%ecx) + movl %esi, 12(%ecx) + + adcl %edx, %edi + popl %esi + + movl 20(%ecx), %eax + movl %edi, 16(%ecx) + + popl %edi + popl %ebp + + adcl %ebx, %eax C no carry out of this + popl %ebx + + movl %eax, 20(%ecx) + + ret + + +C ----------------------------------------------------------------------------- + ALIGN(8) +L(four_or_more): + C eax src low limb + C ebx src + C ecx dst + C edx size + C esi + C edi + C ebp + C + C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 
+ +deflit(`FRAME',4) + + pushl %edi +FRAME_pushl() + pushl %esi +FRAME_pushl() + + pushl %ebp +FRAME_pushl() + leal (%ecx,%edx,4), %edi C dst end of this mul1 + + leal (%ebx,%edx,4), %esi C src end + movl %ebx, %ebp C src + + negl %edx C -size + xorl %ebx, %ebx C clear carry limb and carry flag + + leal 1(%edx), %ecx C -(size-1) + +L(mul1): + C eax scratch + C ebx carry + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi &dst[size] + C ebp src + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(mul1) + + + C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for + C n=1..size-2. + C + C The last two products, which are the end corner of the product + C triangle, are handled separately to save looping overhead. These + C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. + C If size is 4 then it's only these that need to be done. + C + C In the outer loop %esi is a constant, and %edi just advances by 1 + C limb each time. The size of the operation decreases by 1 limb + C each time. + + C eax + C ebx carry (needing carry flag added) + C ecx + C edx + C esi &src[size] + C edi &dst[size] + C ebp + + adcl $0, %ebx + movl PARAM_SIZE, %edx + + movl %ebx, (%edi) + subl $4, %edx + + negl %edx + jz L(corner) + + +L(outer): + C ebx previous carry limb to store + C edx outer loop counter (negative) + C esi &src[size] + C edi dst, pointing at stored carry limb of previous loop + + pushl %edx C new outer loop counter + leal -2(%edx), %ecx + + movl %ebx, (%edi) + addl $4, %edi + + addl $4, %ebp + xorl %ebx, %ebx C initial carry limb, clear carry flag + +L(inner): + C eax scratch + C ebx carry (needing carry flag added) + C ecx counter, negative + C edx scratch + C esi &src[size] + C edi dst end of this addmul + C ebp &src[j] + + adcl $0, %ebx + movl (%esi,%ecx,4), %eax + + mull (%ebp) + + addl %ebx, %eax + movl (%edi,%ecx,4), %ebx + + adcl $0, %edx + addl %eax, %ebx + + movl %ebx, (%edi,%ecx,4) + incl %ecx + + movl %edx, %ebx + jnz L(inner) + + + adcl $0, %ebx + popl %edx C outer loop counter + + incl %edx + jnz L(outer) + + + movl %ebx, (%edi) + +L(corner): + C esi &src[size] + C edi &dst[2*size-4] + + movl -8(%esi), %eax + movl -4(%edi), %ebx C risk of data cache bank clash here + + mull -12(%esi) C src[size-2]*src[size-3] + + addl %eax, %ebx + movl %edx, %ecx + + adcl $0, %ecx + movl -4(%esi), %eax + + mull -12(%esi) C src[size-1]*src[size-3] + + addl %ecx, %eax + movl (%edi), %ecx + + adcl $0, %edx + movl %ebx, -4(%edi) + + addl %eax, %ecx + movl %edx, %ebx + + adcl $0, %ebx + movl -4(%esi), %eax + + mull -8(%esi) C src[size-1]*src[size-2] + + movl %ecx, 0(%edi) + addl %eax, %ebx + + adcl $0, %edx + movl PARAM_SIZE, %eax + + negl %eax + movl %ebx, 4(%edi) + + addl $1, %eax C -(size-1) and clear carry + movl %edx, 8(%edi) + + +C ----------------------------------------------------------------------------- +C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 
+ +L(lshift): + C eax counter, negative + C ebx next limb + C ecx + C edx + C esi + C edi &dst[2*size-4] + C ebp + + movl 12(%edi,%eax,8), %ebx + + rcll %ebx + movl 16(%edi,%eax,8), %ecx + + rcll %ecx + movl %ebx, 12(%edi,%eax,8) + + movl %ecx, 16(%edi,%eax,8) + incl %eax + + jnz L(lshift) + + + adcl %eax, %eax C high bit out + movl PARAM_SRC, %esi + + movl PARAM_SIZE, %ecx C risk of cache bank clash + movl %eax, 12(%edi) C dst most significant limb + + +C ----------------------------------------------------------------------------- +C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., +C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the +C low limb of src[0]^2. + + movl (%esi), %eax C src[0] + leal (%esi,%ecx,4), %esi C src end + + negl %ecx + + mull %eax + + movl %eax, 16(%edi,%ecx,8) C dst[0] + movl %edx, %ebx + + addl $1, %ecx C size-1 and clear carry + +L(diag): + C eax scratch (low product) + C ebx carry limb + C ecx counter, negative + C edx scratch (high product) + C esi &src[size] + C edi &dst[2*size-4] + C ebp scratch (fetched dst limbs) + + movl (%esi,%ecx,4), %eax + adcl $0, %ebx + + mull %eax + + movl 16-4(%edi,%ecx,8), %ebp + + addl %ebp, %ebx + movl 16(%edi,%ecx,8), %ebp + + adcl %eax, %ebp + movl %ebx, 16-4(%edi,%ecx,8) + + movl %ebp, 16(%edi,%ecx,8) + incl %ecx + + movl %edx, %ebx + jnz L(diag) + + + adcl $0, %edx + movl 16-4(%edi), %eax C dst most significant limb + + addl %eax, %edx + popl %ebp + + movl %edx, 16-4(%edi) + popl %esi C risk of cache bank clash + + popl %edi + popl %ebx + + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm new file mode 100644 index 0000000000..c9881fd966 --- /dev/null +++ b/rts/gmp/mpn/x86/rshift.asm @@ -0,0 +1,92 @@ +dnl x86 mpn_rshift -- mpn right shift. + +dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, +dnl Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. 
+ + +include(`../config.m4') + + +C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, +C unsigned shift); + +defframe(PARAM_SHIFT,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + + .text + ALIGN(8) +PROLOGUE(mpn_rshift) + + pushl %edi + pushl %esi + pushl %ebx +deflit(`FRAME',12) + + movl PARAM_DST,%edi + movl PARAM_SRC,%esi + movl PARAM_SIZE,%edx + movl PARAM_SHIFT,%ecx + + leal -4(%edi,%edx,4),%edi + leal (%esi,%edx,4),%esi + negl %edx + + movl (%esi,%edx,4),%ebx C read least significant limb + xorl %eax,%eax + shrdl( %cl, %ebx, %eax) C compute carry limb + incl %edx + jz L(end) + pushl %eax C push carry limb onto stack + testb $1,%dl + jnz L(1) C enter loop in the middle + movl %ebx,%eax + + ALIGN(8) +L(oop): movl (%esi,%edx,4),%ebx C load next higher limb + shrdl( %cl, %ebx, %eax) C compute result limb + movl %eax,(%edi,%edx,4) C store it + incl %edx +L(1): movl (%esi,%edx,4),%eax + shrdl( %cl, %eax, %ebx) + movl %ebx,(%edi,%edx,4) + incl %edx + jnz L(oop) + + shrl %cl,%eax C compute most significant limb + movl %eax,(%edi) C store it + + popl %eax C pop carry limb + + popl %ebx + popl %esi + popl %edi + ret + +L(end): shrl %cl,%ebx C compute most significant limb + movl %ebx,(%edi) C store it + + popl %ebx + popl %esi + popl %edi + ret + +EPILOGUE() diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm new file mode 100644 index 0000000000..9fe022b107 --- /dev/null +++ b/rts/gmp/mpn/x86/udiv.asm @@ -0,0 +1,44 @@ +dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low, +C mp_limb_t divisor); + +defframe(PARAM_DIVISOR, 16) +defframe(PARAM_LOW, 12) +defframe(PARAM_HIGH, 8) +defframe(PARAM_REMPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_udiv_qrnnd) +deflit(`FRAME',0) + movl PARAM_LOW, %eax + movl PARAM_HIGH, %edx + divl PARAM_DIVISOR + movl PARAM_REMPTR, %ecx + movl %edx, (%ecx) + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm new file mode 100644 index 0000000000..3d289d1784 --- /dev/null +++ b/rts/gmp/mpn/x86/umul.asm @@ -0,0 +1,43 @@ +dnl mpn_umul_ppmm -- 1x1->2 limb multiplication + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +include(`../config.m4') + + +C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); +C + +defframe(PARAM_M2, 12) +defframe(PARAM_M1, 8) +defframe(PARAM_LOWPTR, 4) + + TEXT + ALIGN(8) +PROLOGUE(mpn_umul_ppmm) +deflit(`FRAME',0) + movl PARAM_LOWPTR, %ecx + movl PARAM_M1, %eax + mull PARAM_M2 + movl %eax, (%ecx) + movl %edx, %eax + ret +EPILOGUE() diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4 new file mode 100644 index 0000000000..2dad698002 --- /dev/null +++ b/rts/gmp/mpn/x86/x86-defs.m4 @@ -0,0 +1,713 @@ +divert(-1) + +dnl m4 macros for x86 assembler. + + +dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 2.1 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public +dnl License along with the GNU MP Library; see the file COPYING.LIB. If +dnl not, write to the Free Software Foundation, Inc., 59 Temple Place - +dnl Suite 330, Boston, MA 02111-1307, USA. + + +dnl Notes: +dnl +dnl m4 isn't perfect for processing BSD style x86 assembler code, the main +dnl problems are, +dnl +dnl 1. Doing define(foo,123) and then using foo in an addressing mode like +dnl foo(%ebx) expands as a macro rather than a constant. This is worked +dnl around by using deflit() from asm-defs.m4, instead of define(). +dnl +dnl 2. Immediates in macro definitions need a space or `' to stop the $ +dnl looking like a macro parameter. For example, +dnl +dnl define(foo, `mov $ 123, %eax') +dnl +dnl This is only a problem in macro definitions, not in ordinary text, +dnl nor in macro parameters like text passed to forloop() or ifdef(). + + +deflit(BYTES_PER_MP_LIMB, 4) + + +dnl -------------------------------------------------------------------------- +dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking. +dnl Nesting and overlapping not allowed. +dnl + + +dnl Usage: PROLOGUE(functionname) +dnl +dnl Generate a function prologue. functionname gets GSYM_PREFIX added. 
+dnl Examples, +dnl +dnl PROLOGUE(mpn_add_n) +dnl PROLOGUE(somefun) + +define(`PROLOGUE', +m4_assert_numargs(1) +m4_assert_defined(`PROLOGUE_cpu') +`ifdef(`PROLOGUE_current_function', +`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1) +)')dnl +m4_file_seen()dnl +define(`PROLOGUE_current_function',`$1')dnl +PROLOGUE_cpu(GSYM_PREFIX`'$1)') + + +dnl Usage: EPILOGUE() +dnl +dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1 +dnl instead of the long PROLOGUE_current_function symbol. + +define(`EPILOGUE', +m4_assert_numargs(0) +m4_assert_defined(`EPILOGUE_cpu') +`ifdef(`PROLOGUE_current_function',, +`m4_error(`EPILOGUE'() with no `PROLOGUE'() +)')dnl +EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl +undefine(`PROLOGUE_current_function')') + +m4wrap_prepend( +`ifdef(`PROLOGUE_current_function', +`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen +')')') + + +dnl Usage: PROLOGUE_assert_inside() +dnl +dnl Use this unquoted on a line on its own at the start of a macro +dnl definition to add some code to check the macro is only used inside a +dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is +dnl defined. + +define(PROLOGUE_assert_inside, +m4_assert_numargs(0) +``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '') + +define(PROLOGUE_assert_inside_internal, +m4_assert_numargs(1) +`ifdef(`PROLOGUE_current_function',, +`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair +')')') + + +dnl Usage: L(labelname) +dnl LF(functionname,labelname) +dnl +dnl Generate a local label in the current or given function. For LF(), +dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE(). +dnl +dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn), +dnl +dnl L(bar) => L__gmpn_add_n__bar +dnl LF(somefun,bar) => Lsomefun__bar +dnl +dnl The funtion name and label name get two underscores between them rather +dnl than one to guard against clashing with a separate external symbol that +dnl happened to be called functionname_labelname. (Though this would only +dnl happen if the local label prefix is is empty.) Underscores are used so +dnl the whole label will still be a valid C identifier and so can be easily +dnl used in gdb. + +dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the +dnl L macro and making an infinite recursion. +define(LF, +m4_assert_numargs(2) +m4_assert_defined(`LSYM_PREFIX') +`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2') + +define(`L', +m4_assert_numargs(1) +PROLOGUE_assert_inside() +`LF(PROLOGUE_current_function,`$1')') + + +dnl Called: PROLOGUE_cpu(gsym) +dnl EPILOGUE_cpu(gsym) + +define(PROLOGUE_cpu, +m4_assert_numargs(1) + `GLOBL $1 + TYPE($1,`function') +$1:') + +define(EPILOGUE_cpu, +m4_assert_numargs(1) +` SIZE($1,.-$1)') + + + +dnl -------------------------------------------------------------------------- +dnl Various x86 macros. +dnl + + +dnl Usage: ALIGN_OFFSET(bytes,offset) +dnl +dnl Align to `offset' away from a multiple of `bytes'. +dnl +dnl This is useful for testing, for example align to something very strict +dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)". +dnl +dnl Generally you wouldn't execute across the padding, but it's done with +dnl nop's so it'll work. 
+ +define(ALIGN_OFFSET, +m4_assert_numargs(2) +`ALIGN($1) +forloop(`i',1,$2,` nop +')') + + +dnl Usage: defframe(name,offset) +dnl +dnl Make a definition like the following with which to access a parameter +dnl or variable on the stack. +dnl +dnl define(name,`FRAME+offset(%esp)') +dnl +dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one +dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp). +dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the +dnl zero offset is wanted. +dnl +dnl The new macro also gets a check that when it's used FRAME is actually +dnl defined, and that the final %esp offset isn't negative, which would +dnl mean an attempt to access something below the current %esp. +dnl +dnl deflit() is used rather than a plain define(), so the new macro won't +dnl delete any following parenthesized expression. name(%edi) will come +dnl out say as 16(%esp)(%edi). This isn't valid assembler and should +dnl provoke an error, which is better than silently giving just 16(%esp). +dnl +dnl See README.family for more on the suggested way to access the stack +dnl frame. + +define(defframe, +m4_assert_numargs(2) +`deflit(`$1', +m4_assert_defined(`FRAME') +`defframe_check_notbelow(`$1',$2,FRAME)dnl +defframe_empty_if_zero(FRAME+($2))(%esp)')') + +dnl Called: defframe_empty_if_zero(expression) +define(defframe_empty_if_zero, +`ifelse(defframe_empty_if_zero_disabled,1, +`eval($1)', +`m4_empty_if_zero($1)')') + +dnl Called: defframe_check_notbelow(`name',offset,FRAME) +define(defframe_check_notbelow, +m4_assert_numargs(3) +`ifelse(eval(($3)+($2)<0),1, +`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes +')')') + + +dnl Usage: FRAME_pushl() +dnl FRAME_popl() +dnl FRAME_addl_esp(n) +dnl FRAME_subl_esp(n) +dnl +dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl +dnl %esp of n bytes. +dnl +dnl Using these macros is completely optional. Sometimes it makes more +dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's +dnl jumps and different sequences of FRAME values need to be used in +dnl different places. + +define(FRAME_pushl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+4))') + +define(FRAME_popl, +m4_assert_numargs(0) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-4))') + +define(FRAME_addl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME-($1)))') + +define(FRAME_subl_esp, +m4_assert_numargs(1) +m4_assert_defined(`FRAME') +`deflit(`FRAME',eval(FRAME+($1)))') + + +dnl Usage: defframe_pushl(name) +dnl +dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack +dnl location just pushed. This should come after a pushl instruction. +dnl Putting it on the same line works and avoids lengthening the code. For +dnl example, +dnl +dnl pushl %eax defframe_pushl(VAR_COUNTER) +dnl +dnl Notice the defframe() is done with an unquoted -FRAME thus giving its +dnl current value without tracking future changes. + +define(defframe_pushl, +`FRAME_pushl()defframe(`$1',-FRAME)') + + +dnl -------------------------------------------------------------------------- +dnl Assembler instruction macros. +dnl + + +dnl Usage: emms_or_femms +dnl femms_available_p +dnl +dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow +dnl femms instruction is available. emms_or_femms expands to femms if +dnl available, or emms if not. 
+dnl +dnl emms_or_femms is meant for use in the K6 directory where plain K6 +dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are +dnl supported together. +dnl +dnl On K7 femms is no longer faster and is just an alias for emms, so plain +dnl emms may as well be used. + +define(femms_available_p, +m4_assert_numargs(-1) +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_athlon')') + +define(emms_or_femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1,`femms',`emms')') + + +dnl Usage: femms +dnl +dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the +dnl following is a replacement using .byte. +dnl +dnl If femms isn't available, an emms is generated instead, for convenience +dnl when testing on a machine without femms. + +define(femms, +m4_assert_numargs(-1) +`ifelse(femms_available_p,1, +`.byte 15,14 C AMD 3DNow femms', +`emms`'dnl +m4_warning(`warning, using emms in place of femms, use for testing only +')')') + + +dnl Usage: jadcl0(op) +dnl +dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact +dnl replacement, since it doesn't set the flags like adcl does. +dnl +dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and +dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch +dnl misprediction penalty is small, and the multiply algorithm used leads +dnl to a carry bit on average only 1/4 of the time. +dnl +dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for +dnl comparison. For example, +dnl +dnl define(`jadcl0_disabled',1) +dnl +dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is +dnl the same size as an adcl. This makes it possible to use the exact same +dnl computed jump code when testing the relative speed of jnc/incl and adcl +dnl with jadcl0_disabled. + +define(jadcl0, +m4_assert_numargs(1) +`ifelse(jadcl0_disabled,1, + `adcl $`'0, $1', + `jnc 1f + incl $1 +1:dnl')') + + +dnl Usage: cmov_available_p +dnl +dnl Expand to 1 if cmov is available, 0 if not. + +define(cmov_available_p, +`m4_ifdef_anyof_p( + `HAVE_TARGET_CPU_pentiumpro', + `HAVE_TARGET_CPU_pentium2', + `HAVE_TARGET_CPU_pentium3', + `HAVE_TARGET_CPU_athlon')') + + +dnl Usage: x86_lookup(target, key,value, key,value, ...) +dnl x86_lookup_p(target, key,value, key,value, ...) +dnl +dnl Look for `target' among the `key' parameters. +dnl +dnl x86_lookup expands to the corresponding `value', or generates an error +dnl if `target' isn't found. +dnl +dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not. + +define(x86_lookup, +`ifelse(eval($#<3),1, +`m4_error(`unrecognised part of x86 instruction: $1 +')', +`ifelse(`$1',`$2', `$3', +`x86_lookup(`$1',shift(shift(shift($@))))')')') + +define(x86_lookup_p, +`ifelse(eval($#<3),1, `0', +`ifelse(`$1',`$2', `1', +`x86_lookup_p(`$1',shift(shift(shift($@))))')')') + + +dnl Usage: x86_opcode_reg32(reg) +dnl x86_opcode_reg32_p(reg) +dnl +dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given +dnl 32-bit register, eg. `%ebp' turns into 5. +dnl +dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0 +dnl if not. 
+ +define(x86_opcode_reg32, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_p, +m4_assert_onearg() +`x86_lookup_p(`$1',x86_opcode_reg32_list)') + +define(x86_opcode_reg32_list, +``%eax',0, +`%ecx',1, +`%edx',2, +`%ebx',3, +`%esp',4, +`%ebp',5, +`%esi',6, +`%edi',7') + + +dnl Usage: x86_opcode_tttn(cond) +dnl +dnl Expand to the 4-bit "tttn" field value for the given x86 branch +dnl condition (like `c', `ae', etc). + +define(x86_opcode_tttn, +m4_assert_numargs(1) +`x86_lookup(`$1',x86_opcode_ttn_list)') + +define(x86_opcode_tttn_list, +``o', 0, +`no', 1, +`b', 2, `c', 2, `nae',2, +`nb', 3, `nc', 3, `ae', 3, +`e', 4, `z', 4, +`ne', 5, `nz', 5, +`be', 6, `na', 6, +`nbe', 7, `a', 7, +`s', 8, +`ns', 9, +`p', 10, `pe', 10, `npo',10, +`np', 11, `npe',11, `po', 11, +`l', 12, `nge',12, +`nl', 13, `ge', 13, +`le', 14, `ng', 14, +`nle',15, `g', 15') + + +dnl Usage: cmovCC(srcreg,dstreg) +dnl +dnl Generate a cmov instruction if the target supports cmov, or simulate it +dnl with a conditional jump if not (the latter being meant only for +dnl testing). For example, +dnl +dnl cmovz( %eax, %ebx) +dnl +dnl cmov instructions are generated using .byte sequences, since only +dnl recent versions of gas know cmov. +dnl +dnl The source operand can only be a plain register. (m4 code implementing +dnl full memory addressing modes exists, believe it or not, but isn't +dnl currently needed and isn't included.) +dnl +dnl All the standard conditions are defined. Attempting to use one without +dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke +dnl an error. This ensures the necessary .byte sequences aren't +dnl accidentally missed. + +dnl Called: define_cmov_many(cond,tttn,cond,tttn,...) +define(define_cmov_many, +`ifelse(m4_length(`$1'),0,, +`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')') + +dnl Called: define_cmov(cond,tttn) +define(define_cmov, +m4_assert_numargs(2) +`define(`cmov$1', +m4_instruction_wrapper() +m4_assert_numargs(2) +`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl +m4_doublequote($`'1),m4_doublequote($`'2)))') + +define_cmov_many(x86_opcode_tttn_list) + + +dnl Called: cmov_internal(name,cond,tttn,src,dst) +define(cmov_internal, +m4_assert_numargs(5) +`ifelse(cmov_available_p,1, +`cmov_bytes_tttn(`$1',`$3',`$4',`$5')', +`m4_warning(`warning, simulating cmov with jump, use for testing only +')cmov_simulate(`$2',`$4',`$5')')') + +dnl Called: cmov_simulate(cond,src,dst) +dnl If this is going to be used with memory operands for the source it will +dnl need to be changed to do a fetch even if the condition is false, so as +dnl to trigger exceptions the same way a real cmov does. +define(cmov_simulate, +m4_assert_numargs(3) + `j$1 1f C cmov$1 $2, $3 + jmp 2f +1: movl $2, $3 +2:') + +dnl Called: cmov_bytes_tttn(name,tttn,src,dst) +define(cmov_bytes_tttn, +m4_assert_numargs(4) +`.byte dnl +15, dnl +eval(64+$2), dnl +eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl + C `$1 $3, $4'') + + +dnl Usage: loop_or_decljnz label +dnl +dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever +dnl is better. "loop" is better on K6 and probably on 386, on other chips +dnl separate decl/jnz is better. +dnl +dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where +dnl this loop_or_decljnz variation is enough to let the code be shared by +dnl all chips. 
+ +define(loop_or_decljnz, +`ifelse(loop_is_better_p,1, + `loop', + `decl %ecx + jnz')') + +define(loop_is_better_p, +`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6', + `HAVE_TARGET_CPU_k62', + `HAVE_TARGET_CPU_k63', + `HAVE_TARGET_CPU_i386')') + + +dnl Usage: Zdisp(inst,op,op,op) +dnl +dnl Generate explicit .byte sequences if necessary to force a byte-sized +dnl zero displacement on an instruction. For example, +dnl +dnl Zdisp( movl, 0,(%esi), %eax) +dnl +dnl expands to +dnl +dnl .byte 139,70,0 C movl 0(%esi), %eax +dnl +dnl If the displacement given isn't 0, then normal assembler code is +dnl generated. For example, +dnl +dnl Zdisp( movl, 4,(%esi), %eax) +dnl +dnl expands to +dnl +dnl movl 4(%esi), %eax +dnl +dnl This means a single Zdisp() form can be used with an expression for the +dnl displacement, and .byte will be used only if necessary. The +dnl displacement argument is eval()ed. +dnl +dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is +dnl implemented with a table of instructions and encodings. A new entry is +dnl needed for any different operation or registers. + +define(Zdisp, +`define(`Zdisp_found',0)dnl +Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl +Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl +Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl +Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl +Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl +Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl +Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl +Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl +Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl +Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl +Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl +Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl +Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl +Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl +Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl +Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl +Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl +Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl +ifelse(Zdisp_found,0, +`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4 +')')') + +define(Zdisp_match, +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',0) + && m4_stringequal_p(`$3',`$8') + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$7'),0, +` .byte $5 C `$1 0$3, $4'', +` $6 $7$8, $9')', + +`ifelse(eval(m4_stringequal_p(`$1',`$6') + && m4_stringequal_p(`$2',`$7') + && m4_stringequal_p(`$3',0) + && m4_stringequal_p(`$4',`$9')),1, +`define(`Zdisp_found',1)dnl +ifelse(eval(`$8'),0, +` .byte $5 C `$1 $2, 0$4'', +` $6 $7, $8$9')')')') + + +dnl Usage: shldl(count,src,dst) +dnl 
shrdl(count,src,dst) +dnl shldw(count,src,dst) +dnl shrdw(count,src,dst) +dnl +dnl Generate a double-shift instruction, possibly omitting a %cl count +dnl parameter if that's what the assembler requires, as indicated by +dnl WANT_SHLDL_CL in config.m4. For example, +dnl +dnl shldl( %cl, %eax, %ebx) +dnl +dnl turns into either +dnl +dnl shldl %cl, %eax, %ebx +dnl or +dnl shldl %eax, %ebx +dnl +dnl Immediate counts are always passed through unchanged. For example, +dnl +dnl shrdl( $2, %esi, %edi) +dnl becomes +dnl shrdl $2, %esi, %edi +dnl +dnl +dnl If you forget to use the macro form "shldl( ...)" and instead write +dnl just a plain "shldl ...", an error results. This ensures the necessary +dnl variant treatment of %cl isn't accidentally bypassed. + +define(define_shd_instruction, +`define($1, +m4_instruction_wrapper() +m4_assert_numargs(3) +`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl +m4_doublequote($`'2),m4_doublequote($`'3)))') + +dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc +define_shd_instruction(shldl) +define_shd_instruction(shrdl) +define_shd_instruction(shldw) +define_shd_instruction(shrdw) + +dnl Called: shd_instruction(op,count,src,dst) +define(shd_instruction, +m4_assert_numargs(4) +m4_assert_defined(`WANT_SHLDL_CL') +`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1, +``$1' `$3', `$4'', +``$1' `$2', `$3', `$4'')') + + +dnl Usage: ASSERT(cond, instructions) +dnl +dnl If WANT_ASSERT is 1, output the given instructions and expect the given +dnl flags condition to then be satisfied. For example, +dnl +dnl ASSERT(ne, `cmpl %eax, %ebx') +dnl +dnl The instructions can be omitted to just assert a flags condition with +dnl no extra calculation. For example, +dnl +dnl ASSERT(nc) +dnl +dnl When `instructions' is not empty, a pushf/popf is added to preserve the +dnl flags, but the instructions themselves must preserve any registers that +dnl matter. FRAME is adjusted for the push and pop, so the instructions +dnl given can use defframe() stack variables. + +define(ASSERT, +m4_assert_numargs_range(1,2) +`ifelse(WANT_ASSERT,1, + `C ASSERT +ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')') + $2 + j`$1' 1f + ud2 C assertion failed +1: +ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')') +')') + + +dnl Usage: movl_text_address(label,register) +dnl +dnl Get the address of a text segment label, using either a plain movl or a +dnl position-independent calculation, as necessary. For example, +dnl +dnl movl_code_address(L(foo),%eax) +dnl +dnl This macro is only meant for use in ASSERT()s or when testing, since +dnl the PIC sequence it generates will want to be done with a ret balancing +dnl the call on CPUs with return address branch predition. +dnl +dnl The addl generated here has a backward reference to 1b, and so won't +dnl suffer from the two forwards references bug in old gas (described in +dnl mpn/x86/README.family). + +define(movl_text_address, +`ifdef(`PIC', + `call 1f +1: popl $2 C %eip + addl `$'$1-1b, $2', + `movl `$'$1, $2')') + + +divert`'dnl diff --git a/rts/gmp/mpn/z8000/add_n.s b/rts/gmp/mpn/z8000/add_n.s new file mode 100644 index 0000000000..3a136107fe --- /dev/null +++ b/rts/gmp/mpn/z8000/add_n.s @@ -0,0 +1,53 @@ +! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! 
it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___gmpn_add_n +___gmpn_add_n: + pop r0,@r6 + pop r1,@r5 + add r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + adc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000/gmp-mparam.h b/rts/gmp/mpn/z8000/gmp-mparam.h new file mode 100644 index 0000000000..4216df673c --- /dev/null +++ b/rts/gmp/mpn/z8000/gmp-mparam.h @@ -0,0 +1,27 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +#define BITS_PER_MP_LIMB 16 +#define BYTES_PER_MP_LIMB 2 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 16 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/rts/gmp/mpn/z8000/mul_1.s b/rts/gmp/mpn/z8000/mul_1.s new file mode 100644 index 0000000000..20fadd340a --- /dev/null +++ b/rts/gmp/mpn/z8000/mul_1.s @@ -0,0 +1,68 @@ +! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! 
You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! size r5 +! s2_limb r4 + + unseg + .text + even + global ___gmpn_mul_1 +___gmpn_mul_1: + sub r2,r2 ! zero carry limb + and r4,r4 + jr mi,Lneg + +Lpos: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + and r1,r1 ! shift msb of loaded limb into cy + jr mi,Lp ! branch if loaded limb's msb is set + add r8,r4 ! hi_limb += sign_comp2 +Lp: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lpos + ret t + +Lneg: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + add r8,r1 ! hi_limb += sign_comp1 + and r1,r1 + jr mi,Ln + add r8,r4 ! hi_limb += sign_comp2 +Ln: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lneg + ret t diff --git a/rts/gmp/mpn/z8000/sub_n.s b/rts/gmp/mpn/z8000/sub_n.s new file mode 100644 index 0000000000..bd9a7ad409 --- /dev/null +++ b/rts/gmp/mpn/z8000/sub_n.s @@ -0,0 +1,54 @@ +! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___gmpn_sub_n +___gmpn_sub_n: + pop r0,@r6 + pop r1,@r5 + sub r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + sbc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000x/add_n.s b/rts/gmp/mpn/z8000x/add_n.s new file mode 100644 index 0000000000..7f130785c5 --- /dev/null +++ b/rts/gmp/mpn/z8000x/add_n.s @@ -0,0 +1,56 @@ +! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal, +! non-zero length. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! 
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + segm + .text + even + global ___gmpn_add_n +___gmpn_add_n: + popl rr0,@r6 + popl rr8,@r5 + addl rr0,rr8 + ldl @r7,rr0 + dec r4 + jr eq,Lend +Loop: popl rr0,@r6 + popl rr8,@r5 + adc r1,r9 + adc r0,r8 + inc r7,#4 + ldl @r7,rr0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + ld r3,r4 + adc r2,r2 + ret t diff --git a/rts/gmp/mpn/z8000x/sub_n.s b/rts/gmp/mpn/z8000x/sub_n.s new file mode 100644 index 0000000000..f416d1d6eb --- /dev/null +++ b/rts/gmp/mpn/z8000x/sub_n.s @@ -0,0 +1,56 @@ +! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the +! same length > 0 and store difference in a third limb vector. + +! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Lesser General Public License as published by +! the Free Software Foundation; either version 2.1 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +! License for more details. + +! You should have received a copy of the GNU Lesser General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +! MA 02111-1307, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + segm + .text + even + global ___gmpn_sub_n +___gmpn_sub_n: + popl rr0,@r6 + popl rr8,@r5 + subl rr0,rr8 + ldl @r7,rr0 + dec r4 + jr eq,Lend +Loop: popl rr0,@r6 + popl rr8,@r5 + sbc r1,r9 + sbc r0,r8 + inc r7,#4 + ldl @r7,rr0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + ld r3,r4 + adc r2,r2 + ret t |
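
As a rough C reference for what the Z8000 routines above compute (a sketch only, assuming the 16-bit limb size declared in z8000/gmp-mparam.h): the names limb_t, ref_add_n and ref_mul_1 below are hypothetical helpers for illustration, not GMP API. The comment on ref_mul_1 records the sign-compensation identity that the Z8000 code appears to rely on to build an unsigned 16x16->32 product out of the signed `mult' instruction.

#include <stdint.h>
#include <stddef.h>

typedef uint16_t limb_t;              /* BITS_PER_MP_LIMB == 16 on z8000 */

/* res[i] = s1[i] + s2[i] with carry propagated between limbs; returns the
   final carry, mirroring the loop in z8000/add_n.s (sub_n.s is the same
   with borrow instead of carry).  */
static limb_t
ref_add_n (limb_t *res, const limb_t *s1, const limb_t *s2, size_t size)
{
  limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t t = (uint32_t) s1[i] + s2[i] + cy;
      res[i] = (limb_t) t;
      cy = (limb_t) (t >> 16);
    }
  return cy;
}

/* res[i] = low limb of s1[i] * s2_limb + carry; returns the carry limb left
   over, mirroring z8000/mul_1.s.  The assembly reaches the same 32-bit
   product from the signed `mult' instruction via
     u1*u2 == (int16_t)u1 * (int16_t)u2
              + ((uint32_t)u1 << 16, if the top bit of u2 is set)
              + ((uint32_t)u2 << 16, if the top bit of u1 is set)  (mod 2^32),
   which is what the "hi_limb += sign_comp1/2" adjustments correspond to.  */
static limb_t
ref_mul_1 (limb_t *res, const limb_t *s1, size_t size, limb_t s2_limb)
{
  limb_t cy = 0;
  for (size_t i = 0; i < size; i++)
    {
      uint32_t p = (uint32_t) s1[i] * s2_limb + cy;
      res[i] = (limb_t) p;
      cy = (limb_t) (p >> 16);
    }
  return cy;
}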